From c9f9cee50d3ff0d7f61151c79e7d84314d7055d1 Mon Sep 17 00:00:00 2001 From: acd1034 <48613285+acd1034@users.noreply.github.com> Date: Fri, 13 Feb 2026 19:59:19 +0900 Subject: [PATCH 01/19] =?UTF-8?q?=E2=9C=A8=20Create=20enumerate=5Fview=20f?= =?UTF-8?q?rom=20stride=5Fview?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/gpu_array.hpp | 215 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 215 insertions(+) diff --git a/include/gpu_array.hpp b/include/gpu_array.hpp index aa56b8a..11e0d3b 100644 --- a/include/gpu_array.hpp +++ b/include/gpu_array.hpp @@ -2831,6 +2831,221 @@ namespace gpu_array return self(range); } }; + + template + requires std::is_lvalue_reference_v && std::ranges::sized_range + class enumerate_sentinel; + + template + requires std::is_lvalue_reference_v && std::ranges::sized_range + class enumerate_iterator_base + { + template + __host__ __device__ friend bool operator==(const enumerate_iterator_base& it, + const enumerate_sentinel& se) noexcept; + + public: + enumerate_iterator_base() = default; + __host__ __device__ std::ranges::range_reference_t operator*() const noexcept + { + return (*pointer_)[index_]; + } + + protected: + __host__ __device__ explicit enumerate_iterator_base(Range&& r, + std::ranges::range_size_t index) noexcept + : pointer_(&r), index_(index) + { + } + + std::remove_reference_t* pointer_ = nullptr; + std::ranges::range_size_t index_ = 0; + }; + + template + requires std::is_lvalue_reference_v && std::ranges::sized_range + class enumerate_sentinel + { + template + __host__ __device__ friend bool operator==(const enumerate_iterator_base& it, + const enumerate_sentinel& se) noexcept; + + public: + enumerate_sentinel() = default; + __host__ __device__ explicit enumerate_sentinel(Range&& r) noexcept : end_(r.size()) {} + + protected: + std::ranges::range_size_t end_ = 0; + }; + + template + __host__ __device__ inline bool operator==(const enumerate_iterator_base& it, + const enumerate_sentinel& se) noexcept + { + return it.index_ >= se.end_; + } + + template + requires std::is_lvalue_reference_v + class enumerate_iterator : public enumerate_iterator_base + { + using base = enumerate_iterator_base; + + __host__ __device__ static auto get_initial_index() noexcept + { +#if defined(GPU_DEVICE_COMPILE) + using namespace cooperative_groups; // NOLINT + if constexpr (StrideType == Stride::BlockThread) + { + return this_thread_block().thread_rank(); + } + else if constexpr (StrideType == Stride::GridThread) + { + return this_grid().thread_rank(); + } +#if defined(ENABLE_HIP) + else if constexpr (StrideType == Stride::GridBlock) + { + return (static_cast(blockIdx.z) * gridDim.y * gridDim.x) + // NOLINT + (static_cast(blockIdx.y) * gridDim.x) + // NOLINT + static_cast(blockIdx.x); // NOLINT + } +#else + else if constexpr (StrideType == Stride::GridBlock) + { + return this_grid().block_rank(); + } +#endif +#if defined(_CG_HAS_CLUSTER_GROUP) + else if constexpr (StrideType == Stride::ClusterThread) + { + return this_cluster().thread_rank(); + } + else if constexpr (StrideType == Stride::ClusterBlock) + { + return this_cluster().block_rank(); + } + else if constexpr (StrideType == Stride::GridCluster) + { + return this_grid().cluster_rank(); + } +#endif + else + { + static_assert([]() { return false; }(), "invalid StrideType"); + } +#else + return 0; +#endif + } + + __host__ __device__ static auto get_enumerate() noexcept + { +#if defined(GPU_DEVICE_COMPILE) + using namespace cooperative_groups; // NOLINT + if constexpr (StrideType == Stride::BlockThread) + { + return this_thread_block().size(); + } + else if constexpr (StrideType == Stride::GridThread) + { + return this_grid().size(); + } +#if defined(ENABLE_HIP) + else if constexpr (StrideType == Stride::GridBlock) + { + return static_cast(gridDim.x) * (gridDim.y * gridDim.z); // NOLINT + } +#else + else if constexpr (StrideType == Stride::GridBlock) + { + return this_grid().num_blocks(); + } +#endif +#if defined(_CG_HAS_CLUSTER_GROUP) + else if constexpr (StrideType == Stride::ClusterThread) + { + return this_cluster().size(); + } + else if constexpr (StrideType == Stride::ClusterBlock) + { + return this_cluster().num_blocks(); + } + else if constexpr (StrideType == Stride::GridCluster) + { + return this_grid().num_clusters(); + } +#endif + else + { + static_assert([]() { return false; }(), "invalid StrideType"); + } +#else + return 1; +#endif + } + + public: + using iterator_category = std::forward_iterator_tag; + using value_type = std::ranges::range_value_t; + using difference_type = std::make_signed_t>; + + __host__ __device__ explicit enumerate_iterator(Range&& r) noexcept + : base(std::forward(r), get_initial_index()) + { + } + __host__ __device__ enumerate_iterator& operator++() noexcept + { + base::index_ += get_enumerate(); + return *this; + } + __host__ __device__ enumerate_iterator operator++(int) noexcept + { + auto res = *this; + ++(*this); + return res; + } + __host__ __device__ bool operator==(const enumerate_iterator& it) const noexcept + { + return base::index_ == it.index_; + } + }; + + template + requires std::is_lvalue_reference_v + class enumerate_view : public std::ranges::view_interface> + { + public: + enumerate_view() = default; + __host__ __device__ explicit enumerate_view(Range&& r) noexcept : pointer_(&r) {} + [[nodiscard]] __host__ __device__ auto begin() const noexcept + { + return enumerate_iterator(*pointer_); + } + [[nodiscard]] __host__ __device__ auto end() const noexcept { return enumerate_sentinel(*pointer_); } + + private: + std::remove_reference_t* pointer_ = nullptr; + }; + + template + struct enumerate_adapter + { + template + requires std::ranges::sized_range + [[nodiscard]] constexpr auto operator()(Range& r) const noexcept + { + return enumerate_view(r); + } + + template + requires std::ranges::sized_range + [[nodiscard]] friend constexpr std::ranges::view auto operator|(Range& range, + const enumerate_adapter& self) noexcept + { + return self(range); + } + }; } // namespace detail #if !defined(ENABLE_HIP) From 144b3485dc737e3dae7ce6e9b65eb8d1de8b7c22 Mon Sep 17 00:00:00 2001 From: acd1034 <48613285+acd1034@users.noreply.github.com> Date: Fri, 13 Feb 2026 20:27:43 +0900 Subject: [PATCH 02/19] =?UTF-8?q?=E2=9C=A8=20Implement=20enumerate=5Fview?= =?UTF-8?q?=20logic?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/gpu_array.hpp | 31 ++++++++++++++++++++++++------- test/test.cpp | 26 ++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 7 deletions(-) diff --git a/include/gpu_array.hpp b/include/gpu_array.hpp index 11e0d3b..62bb906 100644 --- a/include/gpu_array.hpp +++ b/include/gpu_array.hpp @@ -2832,6 +2832,8 @@ namespace gpu_array } }; + // enumerate_view + template requires std::is_lvalue_reference_v && std::ranges::sized_range class enumerate_sentinel; @@ -2846,9 +2848,10 @@ namespace gpu_array public: enumerate_iterator_base() = default; - __host__ __device__ std::ranges::range_reference_t operator*() const noexcept + __host__ __device__ std::pair, std::ranges::range_reference_t> + operator*() const noexcept { - return (*pointer_)[index_]; + return {index_, (*pointer_)[index_]}; } protected: @@ -2939,7 +2942,7 @@ namespace gpu_array #endif } - __host__ __device__ static auto get_enumerate() noexcept + __host__ __device__ static auto get_stride() noexcept { #if defined(GPU_DEVICE_COMPILE) using namespace cooperative_groups; // NOLINT @@ -2996,7 +2999,7 @@ namespace gpu_array } __host__ __device__ enumerate_iterator& operator++() noexcept { - base::index_ += get_enumerate(); + base::index_ += get_stride(); return *this; } __host__ __device__ enumerate_iterator operator++(int) noexcept @@ -3033,15 +3036,15 @@ namespace gpu_array { template requires std::ranges::sized_range - [[nodiscard]] constexpr auto operator()(Range& r) const noexcept + [[nodiscard]] __host__ __device__ auto operator()(Range& r) const noexcept { return enumerate_view(r); } template requires std::ranges::sized_range - [[nodiscard]] friend constexpr std::ranges::view auto operator|(Range& range, - const enumerate_adapter& self) noexcept + [[nodiscard]] __host__ __device__ friend std::ranges::view auto operator|( + Range& range, const enumerate_adapter& self) noexcept { return self(range); } @@ -3064,6 +3067,20 @@ namespace gpu_array using cluster_block_stride_view = detail::stride_view; template using grid_cluster_stride_view = detail::stride_view; + + template + using block_thread_enumerate_view = detail::enumerate_view; + template + using grid_thread_enumerate_view = detail::enumerate_view; + template + using grid_block_enumerate_view = detail::enumerate_view; + + template + using cluster_thread_enumerate_view = detail::enumerate_view; + template + using cluster_block_enumerate_view = detail::enumerate_view; + template + using grid_cluster_enumerate_view = detail::enumerate_view; #endif namespace views diff --git a/test/test.cpp b/test/test.cpp index 13cbc77..91e0736 100644 --- a/test/test.cpp +++ b/test/test.cpp @@ -2121,5 +2121,31 @@ TEST(StrideView, AliasTemplate) for (const auto& inner_array : nested_array) for (const auto& v : inner_array) EXPECT_EQ(v, 3); } + +template +requires std::ranges::input_range> +__global__ void kernel_enumerate(T array) +{ + for (auto&& [i, xs] : grid_block_enumerate_view(array)) + for (auto&& [j, x] : block_thread_enumerate_view(xs)) x = i * 100 + j; +} + +TEST(EnumerateView, HowToUse) +{ + auto vec_vec = std::vector(32, std::vector(64, 0)); + auto nested_array = managed_array(vec_vec); + + kernel_enumerate<<<32, 64>>>(nested_array); + api::gpuDeviceSynchronize(); + for (int i = 0; const auto& xs : nested_array) + { + for (int j = 0; const auto& x : xs) + { + EXPECT_EQ(x, i * 100 + j); + ++j; + } + ++i; + } +} #endif // NOLINTEND From b15593ba8033765df7de485d4d0ecc7f8aa16133 Mon Sep 17 00:00:00 2001 From: acd1034 <48613285+acd1034@users.noreply.github.com> Date: Fri, 13 Feb 2026 20:35:29 +0900 Subject: [PATCH 03/19] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Refactor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/gpu_array.hpp | 195 ++++++++---------------------------------- 1 file changed, 37 insertions(+), 158 deletions(-) diff --git a/include/gpu_array.hpp b/include/gpu_array.hpp index 62bb906..a62948a 100644 --- a/include/gpu_array.hpp +++ b/include/gpu_array.hpp @@ -2676,6 +2676,7 @@ namespace gpu_array { using base = stride_iterator_base; + public: __host__ __device__ static auto get_initial_index() noexcept { #if defined(GPU_DEVICE_COMPILE) @@ -2770,7 +2771,6 @@ namespace gpu_array #endif } - public: using iterator_category = std::forward_iterator_tag; using value_type = std::ranges::range_value_t; using difference_type = std::make_signed_t>; @@ -2832,174 +2832,29 @@ namespace gpu_array } }; - // enumerate_view - - template - requires std::is_lvalue_reference_v && std::ranges::sized_range - class enumerate_sentinel; - - template - requires std::is_lvalue_reference_v && std::ranges::sized_range - class enumerate_iterator_base - { - template - __host__ __device__ friend bool operator==(const enumerate_iterator_base& it, - const enumerate_sentinel& se) noexcept; - - public: - enumerate_iterator_base() = default; - __host__ __device__ std::pair, std::ranges::range_reference_t> - operator*() const noexcept - { - return {index_, (*pointer_)[index_]}; - } - - protected: - __host__ __device__ explicit enumerate_iterator_base(Range&& r, - std::ranges::range_size_t index) noexcept - : pointer_(&r), index_(index) - { - } - - std::remove_reference_t* pointer_ = nullptr; - std::ranges::range_size_t index_ = 0; - }; - - template - requires std::is_lvalue_reference_v && std::ranges::sized_range - class enumerate_sentinel - { - template - __host__ __device__ friend bool operator==(const enumerate_iterator_base& it, - const enumerate_sentinel& se) noexcept; - - public: - enumerate_sentinel() = default; - __host__ __device__ explicit enumerate_sentinel(Range&& r) noexcept : end_(r.size()) {} - - protected: - std::ranges::range_size_t end_ = 0; - }; - - template - __host__ __device__ inline bool operator==(const enumerate_iterator_base& it, - const enumerate_sentinel& se) noexcept - { - return it.index_ >= se.end_; - } - template requires std::is_lvalue_reference_v - class enumerate_iterator : public enumerate_iterator_base + class enumerate_iterator { - using base = enumerate_iterator_base; - - __host__ __device__ static auto get_initial_index() noexcept - { -#if defined(GPU_DEVICE_COMPILE) - using namespace cooperative_groups; // NOLINT - if constexpr (StrideType == Stride::BlockThread) - { - return this_thread_block().thread_rank(); - } - else if constexpr (StrideType == Stride::GridThread) - { - return this_grid().thread_rank(); - } -#if defined(ENABLE_HIP) - else if constexpr (StrideType == Stride::GridBlock) - { - return (static_cast(blockIdx.z) * gridDim.y * gridDim.x) + // NOLINT - (static_cast(blockIdx.y) * gridDim.x) + // NOLINT - static_cast(blockIdx.x); // NOLINT - } -#else - else if constexpr (StrideType == Stride::GridBlock) - { - return this_grid().block_rank(); - } -#endif -#if defined(_CG_HAS_CLUSTER_GROUP) - else if constexpr (StrideType == Stride::ClusterThread) - { - return this_cluster().thread_rank(); - } - else if constexpr (StrideType == Stride::ClusterBlock) - { - return this_cluster().block_rank(); - } - else if constexpr (StrideType == Stride::GridCluster) - { - return this_grid().cluster_rank(); - } -#endif - else - { - static_assert([]() { return false; }(), "invalid StrideType"); - } -#else - return 0; -#endif - } - - __host__ __device__ static auto get_stride() noexcept - { -#if defined(GPU_DEVICE_COMPILE) - using namespace cooperative_groups; // NOLINT - if constexpr (StrideType == Stride::BlockThread) - { - return this_thread_block().size(); - } - else if constexpr (StrideType == Stride::GridThread) - { - return this_grid().size(); - } -#if defined(ENABLE_HIP) - else if constexpr (StrideType == Stride::GridBlock) - { - return static_cast(gridDim.x) * (gridDim.y * gridDim.z); // NOLINT - } -#else - else if constexpr (StrideType == Stride::GridBlock) - { - return this_grid().num_blocks(); - } -#endif -#if defined(_CG_HAS_CLUSTER_GROUP) - else if constexpr (StrideType == Stride::ClusterThread) - { - return this_cluster().size(); - } - else if constexpr (StrideType == Stride::ClusterBlock) - { - return this_cluster().num_blocks(); - } - else if constexpr (StrideType == Stride::GridCluster) - { - return this_grid().num_clusters(); - } -#endif - else - { - static_assert([]() { return false; }(), "invalid StrideType"); - } -#else - return 1; -#endif - } - public: using iterator_category = std::forward_iterator_tag; using value_type = std::ranges::range_value_t; using difference_type = std::make_signed_t>; + enumerate_iterator() = default; __host__ __device__ explicit enumerate_iterator(Range&& r) noexcept - : base(std::forward(r), get_initial_index()) + : pointer_(&r), index_(stride_iterator::get_initial_index()) + { + } + __host__ __device__ std::ranges::range_size_t index() const noexcept { return index_; } + __host__ __device__ std::pair, std::ranges::range_reference_t> + operator*() const noexcept { + return {index_, (*pointer_)[index_]}; } __host__ __device__ enumerate_iterator& operator++() noexcept { - base::index_ += get_stride(); + index_ += stride_iterator::get_stride(); return *this; } __host__ __device__ enumerate_iterator operator++(int) noexcept @@ -3010,8 +2865,29 @@ namespace gpu_array } __host__ __device__ bool operator==(const enumerate_iterator& it) const noexcept { - return base::index_ == it.index_; + return index_ == it.index_; } + + private: + std::remove_reference_t* pointer_ = nullptr; + std::ranges::range_size_t index_ = 0; + }; + + template + requires std::is_lvalue_reference_v && std::ranges::sized_range + class enumerate_sentinel + { + public: + enumerate_sentinel() = default; + __host__ __device__ explicit enumerate_sentinel(Range&& r) noexcept : end_(r.size()) {} + __host__ __device__ friend bool operator==(const enumerate_iterator& it, + const enumerate_sentinel& se) noexcept + { + return it.index() >= se.end_; + } + + private: + std::ranges::range_size_t end_ = 0; }; template @@ -3025,7 +2901,10 @@ namespace gpu_array { return enumerate_iterator(*pointer_); } - [[nodiscard]] __host__ __device__ auto end() const noexcept { return enumerate_sentinel(*pointer_); } + [[nodiscard]] __host__ __device__ auto end() const noexcept + { + return enumerate_sentinel(*pointer_); + } private: std::remove_reference_t* pointer_ = nullptr; From 87db19a9ebec5dcae1c98d0ecb51b1e075ebd7c2 Mon Sep 17 00:00:00 2001 From: acd1034 <48613285+acd1034@users.noreply.github.com> Date: Fri, 13 Feb 2026 21:03:41 +0900 Subject: [PATCH 04/19] =?UTF-8?q?=E2=9C=A8=20Add=20block=5Fthread=5Fenumer?= =?UTF-8?q?ate,=20etc.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/gpu_array.hpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/include/gpu_array.hpp b/include/gpu_array.hpp index a62948a..4181f2a 100644 --- a/include/gpu_array.hpp +++ b/include/gpu_array.hpp @@ -2983,6 +2983,26 @@ namespace gpu_array inline constexpr detail::stride_adapter cluster_block_stride; inline constexpr detail::stride_adapter grid_cluster_stride; #endif +#endif + +#ifdef GPU_CHECK_ERROR + __device__ static constexpr detail::enumerate_adapter block_thread_enumerate; + __device__ static constexpr detail::enumerate_adapter grid_thread_enumerate; + __device__ static constexpr detail::enumerate_adapter grid_block_enumerate; +#if defined(_CG_HAS_CLUSTER_GROUP) + __device__ static constexpr detail::enumerate_adapter cluster_thread_enumerate; + __device__ static constexpr detail::enumerate_adapter cluster_block_enumerate; + __device__ static constexpr detail::enumerate_adapter grid_cluster_enumerate; +#endif +#else + inline constexpr detail::enumerate_adapter block_thread_enumerate; + inline constexpr detail::enumerate_adapter grid_thread_enumerate; + inline constexpr detail::enumerate_adapter grid_block_enumerate; +#if defined(_CG_HAS_CLUSTER_GROUP) + inline constexpr detail::enumerate_adapter cluster_thread_enumerate; + inline constexpr detail::enumerate_adapter cluster_block_enumerate; + inline constexpr detail::enumerate_adapter grid_cluster_enumerate; +#endif #endif } // namespace views } // namespace gpu_array From 92931af828daa2947dde00228c3bec4cb3cbd8b9 Mon Sep 17 00:00:00 2001 From: acd1034 <48613285+acd1034@users.noreply.github.com> Date: Mon, 2 Mar 2026 11:55:57 +0900 Subject: [PATCH 05/19] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Refactor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/gpu_array.hpp | 82 ++++++++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 37 deletions(-) diff --git a/include/gpu_array.hpp b/include/gpu_array.hpp index 4181f2a..f432179 100644 --- a/include/gpu_array.hpp +++ b/include/gpu_array.hpp @@ -2831,7 +2831,52 @@ namespace gpu_array return self(range); } }; + } // namespace detail + +#if !defined(ENABLE_HIP) + // The following three alias templates are also disabled in HIP because HIP does not support alias template argument + // deduction. + template + using block_thread_stride_view = detail::stride_view; + template + using grid_thread_stride_view = detail::stride_view; + template + using grid_block_stride_view = detail::stride_view; + + template + using cluster_thread_stride_view = detail::stride_view; + template + using cluster_block_stride_view = detail::stride_view; + template + using grid_cluster_stride_view = detail::stride_view; +#endif + + namespace views + { + using detail::Stride; +#ifdef GPU_CHECK_ERROR + __device__ static constexpr detail::stride_adapter block_thread_stride; + __device__ static constexpr detail::stride_adapter grid_thread_stride; + __device__ static constexpr detail::stride_adapter grid_block_stride; +#if defined(_CG_HAS_CLUSTER_GROUP) + __device__ static constexpr detail::stride_adapter cluster_thread_stride; + __device__ static constexpr detail::stride_adapter cluster_block_stride; + __device__ static constexpr detail::stride_adapter grid_cluster_stride; +#endif +#else + inline constexpr detail::stride_adapter block_thread_stride; + inline constexpr detail::stride_adapter grid_thread_stride; + inline constexpr detail::stride_adapter grid_block_stride; +#if defined(_CG_HAS_CLUSTER_GROUP) + inline constexpr detail::stride_adapter cluster_thread_stride; + inline constexpr detail::stride_adapter cluster_block_stride; + inline constexpr detail::stride_adapter grid_cluster_stride; +#endif +#endif + } // namespace views + namespace detail + { template requires std::is_lvalue_reference_v class enumerate_iterator @@ -2931,22 +2976,6 @@ namespace gpu_array } // namespace detail #if !defined(ENABLE_HIP) - // The following three alias templates are also disabled in HIP because HIP does not support alias template argument - // deduction. - template - using block_thread_stride_view = detail::stride_view; - template - using grid_thread_stride_view = detail::stride_view; - template - using grid_block_stride_view = detail::stride_view; - - template - using cluster_thread_stride_view = detail::stride_view; - template - using cluster_block_stride_view = detail::stride_view; - template - using grid_cluster_stride_view = detail::stride_view; - template using block_thread_enumerate_view = detail::enumerate_view; template @@ -2964,27 +2993,6 @@ namespace gpu_array namespace views { - using detail::Stride; -#ifdef GPU_CHECK_ERROR - __device__ static constexpr detail::stride_adapter block_thread_stride; - __device__ static constexpr detail::stride_adapter grid_thread_stride; - __device__ static constexpr detail::stride_adapter grid_block_stride; -#if defined(_CG_HAS_CLUSTER_GROUP) - __device__ static constexpr detail::stride_adapter cluster_thread_stride; - __device__ static constexpr detail::stride_adapter cluster_block_stride; - __device__ static constexpr detail::stride_adapter grid_cluster_stride; -#endif -#else - inline constexpr detail::stride_adapter block_thread_stride; - inline constexpr detail::stride_adapter grid_thread_stride; - inline constexpr detail::stride_adapter grid_block_stride; -#if defined(_CG_HAS_CLUSTER_GROUP) - inline constexpr detail::stride_adapter cluster_thread_stride; - inline constexpr detail::stride_adapter cluster_block_stride; - inline constexpr detail::stride_adapter grid_cluster_stride; -#endif -#endif - #ifdef GPU_CHECK_ERROR __device__ static constexpr detail::enumerate_adapter block_thread_enumerate; __device__ static constexpr detail::enumerate_adapter grid_thread_enumerate; From f97667d3354044f7cc6ebe9f5bfe1e2a79e2fceb Mon Sep 17 00:00:00 2001 From: acd1034 <48613285+acd1034@users.noreply.github.com> Date: Mon, 2 Mar 2026 12:07:09 +0900 Subject: [PATCH 06/19] =?UTF-8?q?=E2=9C=A8=20Create=20zip=5Fview=20from=20?= =?UTF-8?q?enumerate=5Fview?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/gpu_array.hpp | 136 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) diff --git a/include/gpu_array.hpp b/include/gpu_array.hpp index f432179..c626820 100644 --- a/include/gpu_array.hpp +++ b/include/gpu_array.hpp @@ -3011,6 +3011,142 @@ namespace gpu_array inline constexpr detail::enumerate_adapter cluster_block_enumerate; inline constexpr detail::enumerate_adapter grid_cluster_enumerate; #endif +#endif + } // namespace views + + namespace detail + { + template + requires std::is_lvalue_reference_v + class zip_iterator + { + public: + using iterator_category = std::forward_iterator_tag; + using value_type = std::ranges::range_value_t; + using difference_type = std::make_signed_t>; + + zip_iterator() = default; + __host__ __device__ explicit zip_iterator(Range&& r) noexcept + : pointer_(&r), index_(stride_iterator::get_initial_index()) + { + } + __host__ __device__ std::ranges::range_size_t index() const noexcept { return index_; } + __host__ __device__ std::pair, std::ranges::range_reference_t> + operator*() const noexcept + { + return {index_, (*pointer_)[index_]}; + } + __host__ __device__ zip_iterator& operator++() noexcept + { + index_ += stride_iterator::get_stride(); + return *this; + } + __host__ __device__ zip_iterator operator++(int) noexcept + { + auto res = *this; + ++(*this); + return res; + } + __host__ __device__ bool operator==(const zip_iterator& it) const noexcept { return index_ == it.index_; } + + private: + std::remove_reference_t* pointer_ = nullptr; + std::ranges::range_size_t index_ = 0; + }; + + template + requires std::is_lvalue_reference_v && std::ranges::sized_range + class zip_sentinel + { + public: + zip_sentinel() = default; + __host__ __device__ explicit zip_sentinel(Range&& r) noexcept : end_(r.size()) {} + __host__ __device__ friend bool operator==(const zip_iterator& it, + const zip_sentinel& se) noexcept + { + return it.index() >= se.end_; + } + + private: + std::ranges::range_size_t end_ = 0; + }; + + template + requires std::is_lvalue_reference_v + class zip_view : public std::ranges::view_interface> + { + public: + zip_view() = default; + __host__ __device__ explicit zip_view(Range&& r) noexcept : pointer_(&r) {} + [[nodiscard]] __host__ __device__ auto begin() const noexcept + { + return zip_iterator(*pointer_); + } + [[nodiscard]] __host__ __device__ auto end() const noexcept + { + return zip_sentinel(*pointer_); + } + + private: + std::remove_reference_t* pointer_ = nullptr; + }; + + template + struct zip_adapter + { + template + requires std::ranges::sized_range + [[nodiscard]] __host__ __device__ auto operator()(Range& r) const noexcept + { + return zip_view(r); + } + + template + requires std::ranges::sized_range + [[nodiscard]] __host__ __device__ friend std::ranges::view auto operator|(Range& range, + const zip_adapter& self) noexcept + { + return self(range); + } + }; + } // namespace detail + +#if !defined(ENABLE_HIP) + template + using block_thread_zip_view = detail::zip_view; + template + using grid_thread_zip_view = detail::zip_view; + template + using grid_block_zip_view = detail::zip_view; + + template + using cluster_thread_zip_view = detail::zip_view; + template + using cluster_block_zip_view = detail::zip_view; + template + using grid_cluster_zip_view = detail::zip_view; +#endif + + namespace views + { +#ifdef GPU_CHECK_ERROR + __device__ static constexpr detail::zip_adapter block_thread_zip; + __device__ static constexpr detail::zip_adapter grid_thread_zip; + __device__ static constexpr detail::zip_adapter grid_block_zip; +#if defined(_CG_HAS_CLUSTER_GROUP) + __device__ static constexpr detail::zip_adapter cluster_thread_zip; + __device__ static constexpr detail::zip_adapter cluster_block_zip; + __device__ static constexpr detail::zip_adapter grid_cluster_zip; +#endif +#else + inline constexpr detail::zip_adapter block_thread_zip; + inline constexpr detail::zip_adapter grid_thread_zip; + inline constexpr detail::zip_adapter grid_block_zip; +#if defined(_CG_HAS_CLUSTER_GROUP) + inline constexpr detail::zip_adapter cluster_thread_zip; + inline constexpr detail::zip_adapter cluster_block_zip; + inline constexpr detail::zip_adapter grid_cluster_zip; +#endif #endif } // namespace views } // namespace gpu_array From ea84e03bf3e5b98a4615bbc7bbbe47c56b311df0 Mon Sep 17 00:00:00 2001 From: acd1034 <48613285+acd1034@users.noreply.github.com> Date: Mon, 2 Mar 2026 14:07:05 +0900 Subject: [PATCH 07/19] =?UTF-8?q?=E2=9C=A8=20Implement=20zip=5Fview=20logi?= =?UTF-8?q?c?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/gpu_array.hpp | 103 ++++++++++++++++++++++-------------------- test/test.cpp | 50 +++++++++++++++++++- 2 files changed, 102 insertions(+), 51 deletions(-) diff --git a/include/gpu_array.hpp b/include/gpu_array.hpp index c626820..caa6ab1 100644 --- a/include/gpu_array.hpp +++ b/include/gpu_array.hpp @@ -3016,29 +3016,38 @@ namespace gpu_array namespace detail { - template - requires std::is_lvalue_reference_v + template + using first_t = std::tuple_element_t<0, std::tuple>; + + template + requires (std::is_lvalue_reference_v && ...) class zip_iterator { public: using iterator_category = std::forward_iterator_tag; - using value_type = std::ranges::range_value_t; - using difference_type = std::make_signed_t>; + using value_type = std::tuple...>; + using difference_type = std::common_type_t>...>; zip_iterator() = default; - __host__ __device__ explicit zip_iterator(Range&& r) noexcept - : pointer_(&r), index_(stride_iterator::get_initial_index()) + __host__ __device__ explicit zip_iterator(Ranges&&... rs) noexcept + : pointers_(&rs...), index_(stride_iterator>::get_initial_index()) { } - __host__ __device__ std::ranges::range_size_t index() const noexcept { return index_; } - __host__ __device__ std::pair, std::ranges::range_reference_t> - operator*() const noexcept + __host__ __device__ std::common_type_t...> index() const noexcept { - return {index_, (*pointer_)[index_]}; + return index_; + } + __host__ __device__ auto operator*() const noexcept + { + return std::apply( + [this](auto&... pointers) { + return std::tuple...>((*pointers)[index_]...); + }, + pointers_); } __host__ __device__ zip_iterator& operator++() noexcept { - index_ += stride_iterator::get_stride(); + index_ += stride_iterator>::get_stride(); return *this; } __host__ __device__ zip_iterator operator++(int) noexcept @@ -3050,81 +3059,75 @@ namespace gpu_array __host__ __device__ bool operator==(const zip_iterator& it) const noexcept { return index_ == it.index_; } private: - std::remove_reference_t* pointer_ = nullptr; - std::ranges::range_size_t index_ = 0; + std::tuple*...> pointers_{}; + std::common_type_t...> index_ = 0; }; - template - requires std::is_lvalue_reference_v && std::ranges::sized_range + template + requires (std::is_lvalue_reference_v && ...) && (std::ranges::sized_range && ...) class zip_sentinel { public: zip_sentinel() = default; - __host__ __device__ explicit zip_sentinel(Range&& r) noexcept : end_(r.size()) {} - __host__ __device__ friend bool operator==(const zip_iterator& it, + __host__ __device__ explicit zip_sentinel(Ranges&&... rs) noexcept : end_(std::min({rs.size()...})) {} + __host__ __device__ friend bool operator==(const zip_iterator& it, const zip_sentinel& se) noexcept { return it.index() >= se.end_; } private: - std::ranges::range_size_t end_ = 0; + std::common_type_t...> end_ = 0; }; - template - requires std::is_lvalue_reference_v - class zip_view : public std::ranges::view_interface> + template + requires (std::is_lvalue_reference_v && ...) + class zip_view : public std::ranges::view_interface> { public: zip_view() = default; - __host__ __device__ explicit zip_view(Range&& r) noexcept : pointer_(&r) {} + __host__ __device__ explicit zip_view(Ranges&&... rs) noexcept : pointers_(&rs...) {} [[nodiscard]] __host__ __device__ auto begin() const noexcept { - return zip_iterator(*pointer_); + return std::apply( + [this](auto&... pointers) { return zip_iterator(*pointers...); }, pointers_); } [[nodiscard]] __host__ __device__ auto end() const noexcept { - return zip_sentinel(*pointer_); + return std::apply( + [this](auto&... pointers) { return zip_sentinel(*pointers...); }, pointers_); } private: - std::remove_reference_t* pointer_ = nullptr; + std::tuple*...> pointers_{}; }; template struct zip_adapter { - template - requires std::ranges::sized_range - [[nodiscard]] __host__ __device__ auto operator()(Range& r) const noexcept - { - return zip_view(r); - } - - template - requires std::ranges::sized_range - [[nodiscard]] __host__ __device__ friend std::ranges::view auto operator|(Range& range, - const zip_adapter& self) noexcept + template + requires (std::ranges::sized_range && ...) + [[nodiscard]] __host__ __device__ auto operator()(Ranges&... rs) const noexcept { - return self(range); + return zip_view(rs...); } }; } // namespace detail #if !defined(ENABLE_HIP) - template - using block_thread_zip_view = detail::zip_view; - template - using grid_thread_zip_view = detail::zip_view; - template - using grid_block_zip_view = detail::zip_view; - - template - using cluster_thread_zip_view = detail::zip_view; - template - using cluster_block_zip_view = detail::zip_view; - template - using grid_cluster_zip_view = detail::zip_view; + template + using block_thread_zip_view = detail::zip_view; + template + using grid_thread_zip_view = detail::zip_view; + template + using grid_block_zip_view = detail::zip_view; + + template + using cluster_thread_zip_view = detail::zip_view; + template + using cluster_block_zip_view = detail::zip_view; + template + using grid_cluster_zip_view = detail::zip_view; #endif namespace views diff --git a/test/test.cpp b/test/test.cpp index 91e0736..22e4f7e 100644 --- a/test/test.cpp +++ b/test/test.cpp @@ -2130,7 +2130,7 @@ __global__ void kernel_enumerate(T array) for (auto&& [j, x] : block_thread_enumerate_view(xs)) x = i * 100 + j; } -TEST(EnumerateView, HowToUse) +TEST(EnumerateView, Simple) { auto vec_vec = std::vector(32, std::vector(64, 0)); auto nested_array = managed_array(vec_vec); @@ -2147,5 +2147,53 @@ TEST(EnumerateView, HowToUse) ++i; } } + +template +requires std::ranges::input_range> +__global__ void zip_test_init(T array, int coeff) +{ + for (auto&& [i, xs] : grid_block_enumerate_view(array)) + for (auto&& [j, x] : block_thread_enumerate_view(xs)) x = (i * xs.size() + j) * coeff; +} + +template +requires std::ranges::input_range> && + std::ranges::input_range> +__global__ void kernel_zip(T array1, const U array2) +{ + for (auto&& [xs, ys] : detail::zip_adapter{}(array1, array2)) + for (auto&& [x, y] : detail::zip_adapter{}(xs, ys)) x = x + y; +} + +TEST(ZipView, Simple) +{ + auto vec_vec = std::vector(10, std::vector(20, 0)); + auto array1 = managed_array(vec_vec); + auto array2 = managed_array(vec_vec); + zip_test_init<<<10, 20>>>(array1, 1); + zip_test_init<<<10, 20>>>(array2, 1000); + api::gpuDeviceSynchronize(); + for (int i = 0; const auto& xs : array1) + { + for (int j = 0; const auto& x : xs) + { + EXPECT_EQ(x, i * 20 + j); + ++j; + } + ++i; + } + + kernel_zip<<<10, 20>>>(array1, array2); + api::gpuDeviceSynchronize(); + for (int i = 0; const auto& xs : array1) + { + for (int j = 0; const auto& x : xs) + { + EXPECT_EQ(x, (i * 20 + j) * 1001); + ++j; + } + ++i; + } +} #endif // NOLINTEND From 742725866226d2f8aa2a7411287dd09532b0c55e Mon Sep 17 00:00:00 2001 From: acd1034 <48613285+acd1034@users.noreply.github.com> Date: Mon, 2 Mar 2026 14:37:39 +0900 Subject: [PATCH 08/19] =?UTF-8?q?=E2=9C=A8=20Add=20block=5Fthread=5Fzip=5F?= =?UTF-8?q?view,=20etc.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/gpu_array.hpp | 32 ++++++++++++++++++-------------- test/test.cpp | 29 ++++++++++++++++++++++++++--- 2 files changed, 44 insertions(+), 17 deletions(-) diff --git a/include/gpu_array.hpp b/include/gpu_array.hpp index caa6ab1..b15d252 100644 --- a/include/gpu_array.hpp +++ b/include/gpu_array.hpp @@ -3114,20 +3114,24 @@ namespace gpu_array }; } // namespace detail -#if !defined(ENABLE_HIP) - template - using block_thread_zip_view = detail::zip_view; - template - using grid_thread_zip_view = detail::zip_view; - template - using grid_block_zip_view = detail::zip_view; - - template - using cluster_thread_zip_view = detail::zip_view; - template - using cluster_block_zip_view = detail::zip_view; - template - using grid_cluster_zip_view = detail::zip_view; +#ifdef GPU_CHECK_ERROR + __device__ static constexpr detail::zip_adapter block_thread_zip_view; + __device__ static constexpr detail::zip_adapter grid_thread_zip_view; + __device__ static constexpr detail::zip_adapter grid_block_zip_view; +#if defined(_CG_HAS_CLUSTER_GROUP) + __device__ static constexpr detail::zip_adapter cluster_thread_zip_view; + __device__ static constexpr detail::zip_adapter cluster_block_zip_view; + __device__ static constexpr detail::zip_adapter grid_cluster_zip_view; +#endif +#else + inline constexpr detail::zip_adapter block_thread_zip_view; + inline constexpr detail::zip_adapter grid_thread_zip_view; + inline constexpr detail::zip_adapter grid_block_zip_view; +#if defined(_CG_HAS_CLUSTER_GROUP) + inline constexpr detail::zip_adapter cluster_thread_zip_view; + inline constexpr detail::zip_adapter cluster_block_zip_view; + inline constexpr detail::zip_adapter grid_cluster_zip_view; +#endif #endif namespace views diff --git a/test/test.cpp b/test/test.cpp index 22e4f7e..3b071cd 100644 --- a/test/test.cpp +++ b/test/test.cpp @@ -2161,8 +2161,17 @@ requires std::ranges::input_range> && std::ranges::input_range> __global__ void kernel_zip(T array1, const U array2) { - for (auto&& [xs, ys] : detail::zip_adapter{}(array1, array2)) - for (auto&& [x, y] : detail::zip_adapter{}(xs, ys)) x = x + y; + for (auto&& [xs, ys] : views::grid_block_zip(array1, array2)) + for (auto&& [x, y] : views::block_thread_zip(xs, ys)) x = x + y; +} + +template +requires std::ranges::input_range> && + std::ranges::input_range> +__global__ void kernel_zip2(T array1, const U array2) +{ + for (auto&& [xs, ys] : grid_block_zip_view(array1, array2)) + for (auto&& [x, y] : block_thread_zip_view(xs, ys)) x = x + y; } TEST(ZipView, Simple) @@ -2171,7 +2180,6 @@ TEST(ZipView, Simple) auto array1 = managed_array(vec_vec); auto array2 = managed_array(vec_vec); zip_test_init<<<10, 20>>>(array1, 1); - zip_test_init<<<10, 20>>>(array2, 1000); api::gpuDeviceSynchronize(); for (int i = 0; const auto& xs : array1) { @@ -2183,6 +2191,7 @@ TEST(ZipView, Simple) ++i; } + zip_test_init<<<10, 20>>>(array2, 1000); kernel_zip<<<10, 20>>>(array1, array2); api::gpuDeviceSynchronize(); for (int i = 0; const auto& xs : array1) @@ -2194,6 +2203,20 @@ TEST(ZipView, Simple) } ++i; } + + zip_test_init<<<10, 20>>>(array1, 1); + zip_test_init<<<10, 20>>>(array2, 2000); + kernel_zip2<<<10, 20>>>(array1, array2); + api::gpuDeviceSynchronize(); + for (int i = 0; const auto& xs : array1) + { + for (int j = 0; const auto& x : xs) + { + EXPECT_EQ(x, (i * 20 + j) * 2001); + ++j; + } + ++i; + } } #endif // NOLINTEND From e94b9870d51b11cd748b05c6eb24aa82aa2cf0d7 Mon Sep 17 00:00:00 2001 From: acd1034 <48613285+acd1034@users.noreply.github.com> Date: Mon, 2 Mar 2026 17:32:53 +0900 Subject: [PATCH 09/19] =?UTF-8?q?=F0=9F=90=9B=20Make=20stride=5Fview=20sat?= =?UTF-8?q?isfy=20forward=5Frange?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/gpu_array.hpp | 1 + test/test.cpp | 2 ++ 2 files changed, 3 insertions(+) diff --git a/include/gpu_array.hpp b/include/gpu_array.hpp index b15d252..6182790 100644 --- a/include/gpu_array.hpp +++ b/include/gpu_array.hpp @@ -2775,6 +2775,7 @@ namespace gpu_array using value_type = std::ranges::range_value_t; using difference_type = std::make_signed_t>; + stride_iterator() = default; __host__ __device__ explicit stride_iterator(Range&& r) noexcept : base(std::forward(r), get_initial_index()) { diff --git a/test/test.cpp b/test/test.cpp index 3b071cd..c2494ba 100644 --- a/test/test.cpp +++ b/test/test.cpp @@ -2071,6 +2071,8 @@ TEST(JaggedArray, MemoryManagement) } #if !defined(ENABLE_HIP) +static_assert(std::ranges::forward_range&>>); + template requires std::ranges::input_range> __global__ void kernel_stride(T array) From 9cb3387b785d2270e83b3117247eeea9358f2e26 Mon Sep 17 00:00:00 2001 From: acd1034 <48613285+acd1034@users.noreply.github.com> Date: Mon, 2 Mar 2026 17:52:59 +0900 Subject: [PATCH 10/19] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Refactor=20enumerate?= =?UTF-8?q?=5Fiterator=20and=20enumerate=5Fview?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/gpu_array.hpp | 190 +++++++++++++++++++++++++++++++----------- 1 file changed, 142 insertions(+), 48 deletions(-) diff --git a/include/gpu_array.hpp b/include/gpu_array.hpp index 6182790..5ce8f0b 100644 --- a/include/gpu_array.hpp +++ b/include/gpu_array.hpp @@ -2878,20 +2878,17 @@ namespace gpu_array namespace detail { - template + template requires std::is_lvalue_reference_v class enumerate_iterator { public: - using iterator_category = std::forward_iterator_tag; + using iterator_category = std::random_access_iterator_tag; using value_type = std::ranges::range_value_t; using difference_type = std::make_signed_t>; enumerate_iterator() = default; - __host__ __device__ explicit enumerate_iterator(Range&& r) noexcept - : pointer_(&r), index_(stride_iterator::get_initial_index()) - { - } + __host__ __device__ explicit enumerate_iterator(Range&& r) noexcept : pointer_(&r), index_(0) {} __host__ __device__ std::ranges::range_size_t index() const noexcept { return index_; } __host__ __device__ std::pair, std::ranges::range_reference_t> operator*() const noexcept @@ -2900,7 +2897,7 @@ namespace gpu_array } __host__ __device__ enumerate_iterator& operator++() noexcept { - index_ += stride_iterator::get_stride(); + ++index_; return *this; } __host__ __device__ enumerate_iterator operator++(int) noexcept @@ -2909,61 +2906,145 @@ namespace gpu_array ++(*this); return res; } - __host__ __device__ bool operator==(const enumerate_iterator& it) const noexcept + __host__ __device__ enumerate_iterator& operator--() noexcept + { + --index_; + return *this; + } + __host__ __device__ enumerate_iterator operator--(int) noexcept + { + auto res = *this; + --(*this); + return res; + } + __host__ __device__ enumerate_iterator& operator+=(difference_type n) + { + index_ += n; + return *this; + } + __host__ __device__ enumerate_iterator& operator-=(difference_type n) + { + index_ -= n; + return *this; + } + __host__ __device__ std::pair, std::ranges::range_reference_t> + operator[](difference_type n) const + { + return *(*this + n); + } + + __host__ __device__ bool operator==(const enumerate_iterator& it) const& noexcept { return index_ == it.index_; } + __host__ __device__ bool operator<(const enumerate_iterator& it) const& noexcept + { + return index_ < it.index_; + } + __host__ __device__ bool operator>(const enumerate_iterator& it) const& noexcept + { + return index_ > it.index_; + } + __host__ __device__ bool operator<=(const enumerate_iterator& it) const& noexcept + { + return index_ <= it.index_; + } + __host__ __device__ bool operator>=(const enumerate_iterator& it) const& noexcept + { + return index_ >= it.index_; + } + + __host__ __device__ friend enumerate_iterator operator+(enumerate_iterator x, difference_type n) + { + x += n; + return x; + } + __host__ __device__ friend enumerate_iterator operator+(difference_type n, enumerate_iterator x) + { + x += n; + return x; + } + __host__ __device__ friend enumerate_iterator operator-(enumerate_iterator x, difference_type n) + { + x -= n; + return x; + } + __host__ __device__ friend difference_type operator-(const enumerate_iterator& x, + const enumerate_iterator& y) + { + return x.index() - y.index(); + } + + __host__ __device__ friend std::pair, + std::ranges::range_rvalue_reference_t> + iter_move(const enumerate_iterator& x) + { + return {x.index(), std::move(x->second)}; + } private: std::remove_reference_t* pointer_ = nullptr; std::ranges::range_size_t index_ = 0; }; - template + template requires std::is_lvalue_reference_v && std::ranges::sized_range class enumerate_sentinel { + using difference_type = std::make_signed_t>; + public: enumerate_sentinel() = default; __host__ __device__ explicit enumerate_sentinel(Range&& r) noexcept : end_(r.size()) {} - __host__ __device__ friend bool operator==(const enumerate_iterator& it, + __host__ __device__ friend bool operator==(const enumerate_iterator& it, const enumerate_sentinel& se) noexcept { return it.index() >= se.end_; } + __host__ __device__ friend difference_type operator-(const enumerate_iterator& it, + const enumerate_sentinel& se) noexcept + { + return it.index() - se.end_; + } + __host__ __device__ friend difference_type operator-(const enumerate_sentinel& se, + const enumerate_iterator& it) noexcept + { + return se.end_ - it.index(); + } + private: std::ranges::range_size_t end_ = 0; }; - template + template requires std::is_lvalue_reference_v - class enumerate_view : public std::ranges::view_interface> + class enumerate_view : public std::ranges::view_interface> { public: enumerate_view() = default; __host__ __device__ explicit enumerate_view(Range&& r) noexcept : pointer_(&r) {} [[nodiscard]] __host__ __device__ auto begin() const noexcept { - return enumerate_iterator(*pointer_); - } - [[nodiscard]] __host__ __device__ auto end() const noexcept - { - return enumerate_sentinel(*pointer_); + return enumerate_iterator(*pointer_); } + [[nodiscard]] __host__ __device__ auto end() const noexcept { return enumerate_sentinel(*pointer_); } + [[nodiscard]] __host__ __device__ auto size() const noexcept { return pointer_->size(); } private: std::remove_reference_t* pointer_ = nullptr; }; - template + template + enumerate_view(Range&) -> enumerate_view; + struct enumerate_adapter { template requires std::ranges::sized_range [[nodiscard]] __host__ __device__ auto operator()(Range& r) const noexcept { - return enumerate_view(r); + return enumerate_view(r); } template @@ -2977,44 +3058,57 @@ namespace gpu_array } // namespace detail #if !defined(ENABLE_HIP) - template - using block_thread_enumerate_view = detail::enumerate_view; - template - using grid_thread_enumerate_view = detail::enumerate_view; - template - using grid_block_enumerate_view = detail::enumerate_view; - - template - using cluster_thread_enumerate_view = detail::enumerate_view; - template - using cluster_block_enumerate_view = detail::enumerate_view; - template - using grid_cluster_enumerate_view = detail::enumerate_view; + using detail::enumerate_view; #endif namespace views { #ifdef GPU_CHECK_ERROR - __device__ static constexpr detail::enumerate_adapter block_thread_enumerate; - __device__ static constexpr detail::enumerate_adapter grid_thread_enumerate; - __device__ static constexpr detail::enumerate_adapter grid_block_enumerate; -#if defined(_CG_HAS_CLUSTER_GROUP) - __device__ static constexpr detail::enumerate_adapter cluster_thread_enumerate; - __device__ static constexpr detail::enumerate_adapter cluster_block_enumerate; - __device__ static constexpr detail::enumerate_adapter grid_cluster_enumerate; -#endif + __device__ static constexpr detail::enumerate_adapter enumerate; #else - inline constexpr detail::enumerate_adapter block_thread_enumerate; - inline constexpr detail::enumerate_adapter grid_thread_enumerate; - inline constexpr detail::enumerate_adapter grid_block_enumerate; -#if defined(_CG_HAS_CLUSTER_GROUP) - inline constexpr detail::enumerate_adapter cluster_thread_enumerate; - inline constexpr detail::enumerate_adapter cluster_block_enumerate; - inline constexpr detail::enumerate_adapter grid_cluster_enumerate; -#endif + inline constexpr detail::enumerate_adapter enumerate; #endif } // namespace views + // #if !defined(ENABLE_HIP) + // template + // using block_thread_enumerate_view = detail::enumerate_view; + // template + // using grid_thread_enumerate_view = detail::enumerate_view; + // template + // using grid_block_enumerate_view = detail::enumerate_view; + + // template + // using cluster_thread_enumerate_view = detail::enumerate_view; + // template + // using cluster_block_enumerate_view = detail::enumerate_view; + // template + // using grid_cluster_enumerate_view = detail::enumerate_view; + // #endif + + // namespace views + // { + // #ifdef GPU_CHECK_ERROR + // __device__ static constexpr detail::enumerate_adapter block_thread_enumerate; + // __device__ static constexpr detail::enumerate_adapter grid_thread_enumerate; + // __device__ static constexpr detail::enumerate_adapter grid_block_enumerate; + // #if defined(_CG_HAS_CLUSTER_GROUP) + // __device__ static constexpr detail::enumerate_adapter cluster_thread_enumerate; + // __device__ static constexpr detail::enumerate_adapter cluster_block_enumerate; + // __device__ static constexpr detail::enumerate_adapter grid_cluster_enumerate; + // #endif + // #else + // inline constexpr detail::enumerate_adapter block_thread_enumerate; + // inline constexpr detail::enumerate_adapter grid_thread_enumerate; + // inline constexpr detail::enumerate_adapter grid_block_enumerate; + // #if defined(_CG_HAS_CLUSTER_GROUP) + // inline constexpr detail::enumerate_adapter cluster_thread_enumerate; + // inline constexpr detail::enumerate_adapter cluster_block_enumerate; + // inline constexpr detail::enumerate_adapter grid_cluster_enumerate; + // #endif + // #endif + // } // namespace views + namespace detail { template From 77ee080020f6b01171f71504394854d484c241fb Mon Sep 17 00:00:00 2001 From: acd1034 <48613285+acd1034@users.noreply.github.com> Date: Mon, 2 Mar 2026 19:05:38 +0900 Subject: [PATCH 11/19] =?UTF-8?q?=E2=9C=A8=20Make=20stride=5Fview=20satisf?= =?UTF-8?q?y=20ranges::view?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/gpu_array.hpp | 52 +++++++++------ test/test.cpp | 147 +++++++++++++++++++++--------------------- 2 files changed, 105 insertions(+), 94 deletions(-) diff --git a/include/gpu_array.hpp b/include/gpu_array.hpp index 5ce8f0b..a0fad72 100644 --- a/include/gpu_array.hpp +++ b/include/gpu_array.hpp @@ -2618,11 +2618,11 @@ namespace gpu_array }; template - requires std::is_lvalue_reference_v && std::ranges::sized_range + requires std::ranges::view && std::ranges::sized_range class stride_sentinel; template - requires std::is_lvalue_reference_v && std::ranges::sized_range + requires std::ranges::view && std::ranges::sized_range class stride_iterator_base { template @@ -2637,18 +2637,17 @@ namespace gpu_array } protected: - __host__ __device__ explicit stride_iterator_base(Range&& r, - std::ranges::range_size_t index) noexcept + __host__ __device__ explicit stride_iterator_base(Range& r, std::ranges::range_size_t index) noexcept : pointer_(&r), index_(index) { } - std::remove_reference_t* pointer_ = nullptr; + Range* pointer_ = nullptr; std::ranges::range_size_t index_ = 0; }; template - requires std::is_lvalue_reference_v && std::ranges::sized_range + requires std::ranges::view && std::ranges::sized_range class stride_sentinel { template @@ -2657,7 +2656,7 @@ namespace gpu_array public: stride_sentinel() = default; - __host__ __device__ explicit stride_sentinel(Range&& r) noexcept : end_(r.size()) {} + __host__ __device__ explicit stride_sentinel(const Range& r) noexcept : end_(r.size()) {} protected: std::ranges::range_size_t end_ = 0; @@ -2671,7 +2670,7 @@ namespace gpu_array } template - requires std::is_lvalue_reference_v + requires std::ranges::view class stride_iterator : public stride_iterator_base { using base = stride_iterator_base; @@ -2776,10 +2775,7 @@ namespace gpu_array using difference_type = std::make_signed_t>; stride_iterator() = default; - __host__ __device__ explicit stride_iterator(Range&& r) noexcept - : base(std::forward(r), get_initial_index()) - { - } + __host__ __device__ explicit stride_iterator(Range& r) noexcept : base(r, get_initial_index()) {} __host__ __device__ stride_iterator& operator++() noexcept { base::index_ += get_stride(); @@ -2798,20 +2794,30 @@ namespace gpu_array }; template - requires std::is_lvalue_reference_v + requires std::ranges::view class stride_view : public std::ranges::view_interface> { public: stride_view() = default; - __host__ __device__ explicit stride_view(Range&& r) noexcept : pointer_(&r) {} + __host__ __device__ explicit stride_view(Range r) noexcept : range_(r) {} + [[nodiscard]] __host__ __device__ auto begin() noexcept + { + return stride_iterator(range_); + } [[nodiscard]] __host__ __device__ auto begin() const noexcept + requires std::is_const_v { - return stride_iterator(*pointer_); + return stride_iterator(range_); + } + [[nodiscard]] __host__ __device__ auto end() noexcept { return stride_sentinel(range_); } + [[nodiscard]] __host__ __device__ auto end() const noexcept + requires std::is_const_v + { + return stride_sentinel(range_); } - [[nodiscard]] __host__ __device__ auto end() const noexcept { return stride_sentinel(*pointer_); } private: - std::remove_reference_t* pointer_ = nullptr; + Range range_{}; }; template @@ -2819,17 +2825,17 @@ namespace gpu_array { template requires std::ranges::sized_range - [[nodiscard]] constexpr auto operator()(Range& r) const noexcept + [[nodiscard]] constexpr auto operator()(Range&& r) const noexcept { - return stride_view(r); + return stride_view>(std::forward(r)); } template requires std::ranges::sized_range - [[nodiscard]] friend constexpr std::ranges::view auto operator|(Range& range, + [[nodiscard]] friend constexpr std::ranges::view auto operator|(Range&& r, const stride_adapter& self) noexcept { - return self(range); + return self(std::forward(r)); } }; } // namespace detail @@ -3278,6 +3284,10 @@ inline constexpr bool std::ranges::enable_borrowed_range inline constexpr bool std::ranges::enable_borrowed_range> = true; +template +inline constexpr bool std::ranges::enable_view> = true; +template +inline constexpr bool std::ranges::enable_view> = true; #undef SIGSEGV_DEPRECATED #undef INCR_GPU_MEMORY_USAGE diff --git a/test/test.cpp b/test/test.cpp index c2494ba..a109597 100644 --- a/test/test.cpp +++ b/test/test.cpp @@ -2071,7 +2071,8 @@ TEST(JaggedArray, MemoryManagement) } #if !defined(ENABLE_HIP) -static_assert(std::ranges::forward_range&>>); +static_assert(std::ranges::forward_range>>); +static_assert(std::ranges::view>>); template requires std::ranges::input_range> @@ -2128,8 +2129,8 @@ template requires std::ranges::input_range> __global__ void kernel_enumerate(T array) { - for (auto&& [i, xs] : grid_block_enumerate_view(array)) - for (auto&& [j, x] : block_thread_enumerate_view(xs)) x = i * 100 + j; + for (auto&& [i, xs] : enumerate_view(array) | views::grid_block_stride) + for (auto&& [j, x] : enumerate_view(xs) | views::block_thread_stride) x = i * 100 + j; } TEST(EnumerateView, Simple) @@ -2150,75 +2151,75 @@ TEST(EnumerateView, Simple) } } -template -requires std::ranges::input_range> -__global__ void zip_test_init(T array, int coeff) -{ - for (auto&& [i, xs] : grid_block_enumerate_view(array)) - for (auto&& [j, x] : block_thread_enumerate_view(xs)) x = (i * xs.size() + j) * coeff; -} - -template -requires std::ranges::input_range> && - std::ranges::input_range> -__global__ void kernel_zip(T array1, const U array2) -{ - for (auto&& [xs, ys] : views::grid_block_zip(array1, array2)) - for (auto&& [x, y] : views::block_thread_zip(xs, ys)) x = x + y; -} - -template -requires std::ranges::input_range> && - std::ranges::input_range> -__global__ void kernel_zip2(T array1, const U array2) -{ - for (auto&& [xs, ys] : grid_block_zip_view(array1, array2)) - for (auto&& [x, y] : block_thread_zip_view(xs, ys)) x = x + y; -} - -TEST(ZipView, Simple) -{ - auto vec_vec = std::vector(10, std::vector(20, 0)); - auto array1 = managed_array(vec_vec); - auto array2 = managed_array(vec_vec); - zip_test_init<<<10, 20>>>(array1, 1); - api::gpuDeviceSynchronize(); - for (int i = 0; const auto& xs : array1) - { - for (int j = 0; const auto& x : xs) - { - EXPECT_EQ(x, i * 20 + j); - ++j; - } - ++i; - } - - zip_test_init<<<10, 20>>>(array2, 1000); - kernel_zip<<<10, 20>>>(array1, array2); - api::gpuDeviceSynchronize(); - for (int i = 0; const auto& xs : array1) - { - for (int j = 0; const auto& x : xs) - { - EXPECT_EQ(x, (i * 20 + j) * 1001); - ++j; - } - ++i; - } - - zip_test_init<<<10, 20>>>(array1, 1); - zip_test_init<<<10, 20>>>(array2, 2000); - kernel_zip2<<<10, 20>>>(array1, array2); - api::gpuDeviceSynchronize(); - for (int i = 0; const auto& xs : array1) - { - for (int j = 0; const auto& x : xs) - { - EXPECT_EQ(x, (i * 20 + j) * 2001); - ++j; - } - ++i; - } -} +// template +// requires std::ranges::input_range> +// __global__ void zip_test_init(T array, int coeff) +// { +// for (auto&& [i, xs] : grid_block_enumerate_view(array)) +// for (auto&& [j, x] : block_thread_enumerate_view(xs)) x = (i * xs.size() + j) * coeff; +// } + +// template +// requires std::ranges::input_range> && +// std::ranges::input_range> +// __global__ void kernel_zip(T array1, const U array2) +// { +// for (auto&& [xs, ys] : views::grid_block_zip(array1, array2)) +// for (auto&& [x, y] : views::block_thread_zip(xs, ys)) x = x + y; +// } + +// template +// requires std::ranges::input_range> && +// std::ranges::input_range> +// __global__ void kernel_zip2(T array1, const U array2) +// { +// for (auto&& [xs, ys] : grid_block_zip_view(array1, array2)) +// for (auto&& [x, y] : block_thread_zip_view(xs, ys)) x = x + y; +// } + +// TEST(ZipView, Simple) +// { +// auto vec_vec = std::vector(10, std::vector(20, 0)); +// auto array1 = managed_array(vec_vec); +// auto array2 = managed_array(vec_vec); +// zip_test_init<<<10, 20>>>(array1, 1); +// api::gpuDeviceSynchronize(); +// for (int i = 0; const auto& xs : array1) +// { +// for (int j = 0; const auto& x : xs) +// { +// EXPECT_EQ(x, i * 20 + j); +// ++j; +// } +// ++i; +// } + +// zip_test_init<<<10, 20>>>(array2, 1000); +// kernel_zip<<<10, 20>>>(array1, array2); +// api::gpuDeviceSynchronize(); +// for (int i = 0; const auto& xs : array1) +// { +// for (int j = 0; const auto& x : xs) +// { +// EXPECT_EQ(x, (i * 20 + j) * 1001); +// ++j; +// } +// ++i; +// } + +// zip_test_init<<<10, 20>>>(array1, 1); +// zip_test_init<<<10, 20>>>(array2, 2000); +// kernel_zip2<<<10, 20>>>(array1, array2); +// api::gpuDeviceSynchronize(); +// for (int i = 0; const auto& xs : array1) +// { +// for (int j = 0; const auto& x : xs) +// { +// EXPECT_EQ(x, (i * 20 + j) * 2001); +// ++j; +// } +// ++i; +// } +// } #endif // NOLINTEND From f279a2f82d8c26354041f33f458f1518f2f2951b Mon Sep 17 00:00:00 2001 From: acd1034 <48613285+acd1034@users.noreply.github.com> Date: Mon, 2 Mar 2026 23:04:28 +0900 Subject: [PATCH 12/19] =?UTF-8?q?=E2=9C=A8=20Make=20enumerate=5Fview=20sat?= =?UTF-8?q?isfy=20ranges::view?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/gpu_array.hpp | 266 ++++++++++++++++++++++++++++-------------- test/test.cpp | 4 + 2 files changed, 182 insertions(+), 88 deletions(-) diff --git a/include/gpu_array.hpp b/include/gpu_array.hpp index a0fad72..f5dd90a 100644 --- a/include/gpu_array.hpp +++ b/include/gpu_array.hpp @@ -48,6 +48,25 @@ #define SIGSEGV_DEPRECATED [[deprecated("Cannot access GPU memory directly")]] #endif +#if !defined(__cpp_lib_tuple_like) || __cpp_lib_tuple_like < 202207L +template class TQual, template class UQual> +requires requires { + typename std::pair, UQual>, std::common_reference_t, UQual>>; +} +struct std::basic_common_reference, std::pair, TQual, UQual> +{ + using type = + std::pair, UQual>, std::common_reference_t, UQual>>; +}; + +template +requires requires { typename std::pair, std::common_type_t>; } +struct std::common_type, std::pair> +{ + using type = std::pair, std::common_type_t>; +}; +#endif + namespace gpu_array { #if defined(GPU_USE_32BIT_SIZE_TYPE_DEFAULT) @@ -2605,6 +2624,112 @@ namespace gpu_array namespace detail { + // WORKAROUND: Because std::common_reference_with in C++20 does not work correctly + template + concept RandomAccessRange = true; + + template + requires std::is_class_v && std::same_as> + class ViewInterface + { + __host__ __device__ Derived& derived() noexcept { return static_cast(*this); } + + __host__ __device__ Derived const& derived() const noexcept { return static_cast(*this); } + + template + __host__ __device__ static auto to_unsigned(V v) + { + return static_cast>(v); + } + + public: + template + [[nodiscard]] __host__ __device__ bool empty() + requires std::ranges::sized_range || RandomAccessRange + { + if constexpr (std::ranges::sized_range) + { + return derived().size() == 0; + } + else + { + return derived().begin() == derived().end(); + } + } + + template + [[nodiscard]] __host__ __device__ bool empty() const + requires std::ranges::sized_range || RandomAccessRange + { + if constexpr (std::ranges::sized_range) + { + return derived().size() == 0; + } + else + { + return derived().begin() == derived().end(); + } + } + + template + __host__ __device__ explicit operator bool() + requires requires(D2& t) { std::ranges::empty(t); } + { + return !std::ranges::empty(derived()); + } + + template + __host__ __device__ explicit operator bool() const + requires requires(const D2& t) { std::ranges::empty(t); } + { + return !std::ranges::empty(derived()); + } + + template + [[nodiscard]] __host__ __device__ auto size() + requires RandomAccessRange && + std::sized_sentinel_for, std::ranges::iterator_t> + { + return to_unsigned(derived().end() - derived().begin()); + } + + template + [[nodiscard]] __host__ __device__ auto size() const + requires RandomAccessRange && + std::sized_sentinel_for, std::ranges::iterator_t> + { + return to_unsigned(derived().end() - derived().begin()); + } + + template + [[nodiscard]] __host__ __device__ decltype(auto) front() + requires RandomAccessRange + { + return *derived().begin(); + } + + template + [[nodiscard]] __host__ __device__ decltype(auto) front() const + requires RandomAccessRange + { + return *derived().begin(); + } + + template + [[nodiscard]] __host__ __device__ decltype(auto) operator[](std::ranges::range_difference_t index) + requires RandomAccessRange + { + return derived().begin()[index]; + } + + template + [[nodiscard]] __host__ __device__ decltype(auto) operator[](std::ranges::range_difference_t index) const + requires RandomAccessRange + { + return derived().begin()[index]; + } + }; + enum class Stride : std::uint8_t { BlockThread, @@ -2617,11 +2742,11 @@ namespace gpu_array #endif }; - template + template requires std::ranges::view && std::ranges::sized_range class stride_sentinel; - template + template requires std::ranges::view && std::ranges::sized_range class stride_iterator_base { @@ -2646,7 +2771,7 @@ namespace gpu_array std::ranges::range_size_t index_ = 0; }; - template + template requires std::ranges::view && std::ranges::sized_range class stride_sentinel { @@ -2669,7 +2794,7 @@ namespace gpu_array return it.index_ >= se.end_; } - template + template requires std::ranges::view class stride_iterator : public stride_iterator_base { @@ -2793,9 +2918,9 @@ namespace gpu_array } }; - template + template requires std::ranges::view - class stride_view : public std::ranges::view_interface> + class stride_view : public ViewInterface> { public: stride_view() = default; @@ -2823,17 +2948,17 @@ namespace gpu_array template struct stride_adapter { - template + template requires std::ranges::sized_range - [[nodiscard]] constexpr auto operator()(Range&& r) const noexcept + [[nodiscard]] __host__ __device__ auto operator()(Range&& r) const noexcept { return stride_view>(std::forward(r)); } - template + template requires std::ranges::sized_range - [[nodiscard]] friend constexpr std::ranges::view auto operator|(Range&& r, - const stride_adapter& self) noexcept + [[nodiscard]] __host__ __device__ friend std::ranges::view auto operator|( + Range&& r, const stride_adapter& self) noexcept { return self(std::forward(r)); } @@ -2843,18 +2968,18 @@ namespace gpu_array #if !defined(ENABLE_HIP) // The following three alias templates are also disabled in HIP because HIP does not support alias template argument // deduction. - template + template using block_thread_stride_view = detail::stride_view; - template + template using grid_thread_stride_view = detail::stride_view; - template + template using grid_block_stride_view = detail::stride_view; - template + template using cluster_thread_stride_view = detail::stride_view; - template + template using cluster_block_stride_view = detail::stride_view; - template + template using grid_cluster_stride_view = detail::stride_view; #endif @@ -2884,17 +3009,17 @@ namespace gpu_array namespace detail { - template - requires std::is_lvalue_reference_v + template + requires std::ranges::view class enumerate_iterator { public: using iterator_category = std::random_access_iterator_tag; - using value_type = std::ranges::range_value_t; + using value_type = std::pair, std::ranges::range_value_t>; using difference_type = std::make_signed_t>; enumerate_iterator() = default; - __host__ __device__ explicit enumerate_iterator(Range&& r) noexcept : pointer_(&r), index_(0) {} + __host__ __device__ explicit enumerate_iterator(Range& r) noexcept : pointer_(&r), index_(0) {} __host__ __device__ std::ranges::range_size_t index() const noexcept { return index_; } __host__ __device__ std::pair, std::ranges::range_reference_t> operator*() const noexcept @@ -2989,19 +3114,19 @@ namespace gpu_array } private: - std::remove_reference_t* pointer_ = nullptr; + Range* pointer_ = nullptr; std::ranges::range_size_t index_ = 0; }; - template - requires std::is_lvalue_reference_v && std::ranges::sized_range + template + requires std::ranges::view && std::ranges::sized_range class enumerate_sentinel { using difference_type = std::make_signed_t>; public: enumerate_sentinel() = default; - __host__ __device__ explicit enumerate_sentinel(Range&& r) noexcept : end_(r.size()) {} + __host__ __device__ explicit enumerate_sentinel(Range& r) noexcept : end_(r.size()) {} __host__ __device__ friend bool operator==(const enumerate_iterator& it, const enumerate_sentinel& se) noexcept { @@ -3023,42 +3148,46 @@ namespace gpu_array std::ranges::range_size_t end_ = 0; }; - template - requires std::is_lvalue_reference_v - class enumerate_view : public std::ranges::view_interface> + template + requires std::ranges::view + class enumerate_view : public ViewInterface> { public: enumerate_view() = default; - __host__ __device__ explicit enumerate_view(Range&& r) noexcept : pointer_(&r) {} + __host__ __device__ explicit enumerate_view(Range r) noexcept : range_(r) {} + [[nodiscard]] __host__ __device__ auto begin() noexcept { return enumerate_iterator(range_); } [[nodiscard]] __host__ __device__ auto begin() const noexcept + requires std::is_const_v { - return enumerate_iterator(*pointer_); + return enumerate_iterator(range_); } - [[nodiscard]] __host__ __device__ auto end() const noexcept { return enumerate_sentinel(*pointer_); } - [[nodiscard]] __host__ __device__ auto size() const noexcept { return pointer_->size(); } + [[nodiscard]] __host__ __device__ auto end() noexcept { return enumerate_sentinel(range_); } + [[nodiscard]] __host__ __device__ auto end() const noexcept + requires std::is_const_v + { + return enumerate_sentinel(range_); + } + [[nodiscard]] __host__ __device__ auto size() const noexcept { return range_.size(); } private: - std::remove_reference_t* pointer_ = nullptr; + Range range_{}; }; - template - enumerate_view(Range&) -> enumerate_view; - struct enumerate_adapter { - template + template requires std::ranges::sized_range - [[nodiscard]] __host__ __device__ auto operator()(Range& r) const noexcept + [[nodiscard]] __host__ __device__ auto operator()(Range&& r) const noexcept { - return enumerate_view(r); + return enumerate_view>(std::forward(r)); } - template + template requires std::ranges::sized_range [[nodiscard]] __host__ __device__ friend std::ranges::view auto operator|( - Range& range, const enumerate_adapter& self) noexcept + Range&& r, const enumerate_adapter& self) noexcept { - return self(range); + return self(std::forward(r)); } }; } // namespace detail @@ -3076,51 +3205,12 @@ namespace gpu_array #endif } // namespace views - // #if !defined(ENABLE_HIP) - // template - // using block_thread_enumerate_view = detail::enumerate_view; - // template - // using grid_thread_enumerate_view = detail::enumerate_view; - // template - // using grid_block_enumerate_view = detail::enumerate_view; - - // template - // using cluster_thread_enumerate_view = detail::enumerate_view; - // template - // using cluster_block_enumerate_view = detail::enumerate_view; - // template - // using grid_cluster_enumerate_view = detail::enumerate_view; - // #endif - - // namespace views - // { - // #ifdef GPU_CHECK_ERROR - // __device__ static constexpr detail::enumerate_adapter block_thread_enumerate; - // __device__ static constexpr detail::enumerate_adapter grid_thread_enumerate; - // __device__ static constexpr detail::enumerate_adapter grid_block_enumerate; - // #if defined(_CG_HAS_CLUSTER_GROUP) - // __device__ static constexpr detail::enumerate_adapter cluster_thread_enumerate; - // __device__ static constexpr detail::enumerate_adapter cluster_block_enumerate; - // __device__ static constexpr detail::enumerate_adapter grid_cluster_enumerate; - // #endif - // #else - // inline constexpr detail::enumerate_adapter block_thread_enumerate; - // inline constexpr detail::enumerate_adapter grid_thread_enumerate; - // inline constexpr detail::enumerate_adapter grid_block_enumerate; - // #if defined(_CG_HAS_CLUSTER_GROUP) - // inline constexpr detail::enumerate_adapter cluster_thread_enumerate; - // inline constexpr detail::enumerate_adapter cluster_block_enumerate; - // inline constexpr detail::enumerate_adapter grid_cluster_enumerate; - // #endif - // #endif - // } // namespace views - namespace detail { template using first_t = std::tuple_element_t<0, std::tuple>; - template + template requires (std::is_lvalue_reference_v && ...) class zip_iterator { @@ -3164,7 +3254,7 @@ namespace gpu_array std::common_type_t...> index_ = 0; }; - template + template requires (std::is_lvalue_reference_v && ...) && (std::ranges::sized_range && ...) class zip_sentinel { @@ -3181,9 +3271,9 @@ namespace gpu_array std::common_type_t...> end_ = 0; }; - template + template requires (std::is_lvalue_reference_v && ...) - class zip_view : public std::ranges::view_interface> + class zip_view : public ViewInterface> { public: zip_view() = default; @@ -3206,7 +3296,7 @@ namespace gpu_array template struct zip_adapter { - template + template requires (std::ranges::sized_range && ...) [[nodiscard]] __host__ __device__ auto operator()(Ranges&... rs) const noexcept { @@ -3284,9 +3374,9 @@ inline constexpr bool std::ranges::enable_borrowed_range inline constexpr bool std::ranges::enable_borrowed_range> = true; -template +template inline constexpr bool std::ranges::enable_view> = true; -template +template inline constexpr bool std::ranges::enable_view> = true; #undef SIGSEGV_DEPRECATED diff --git a/test/test.cpp b/test/test.cpp index a109597..8a06cd0 100644 --- a/test/test.cpp +++ b/test/test.cpp @@ -2125,6 +2125,10 @@ TEST(StrideView, AliasTemplate) for (const auto& v : inner_array) EXPECT_EQ(v, 3); } +static_assert(detail::RandomAccessRange>>); +static_assert(std::ranges::sized_range>>); +static_assert(std::ranges::view>>); + template requires std::ranges::input_range> __global__ void kernel_enumerate(T array) From 993a3d1e7198cf2c15e6c8e289a2cc78b296aa6b Mon Sep 17 00:00:00 2001 From: acd1034 <48613285+acd1034@users.noreply.github.com> Date: Tue, 3 Mar 2026 00:02:41 +0900 Subject: [PATCH 13/19] =?UTF-8?q?=E2=9C=A8=20Make=20zip=5Fview=20satisfy?= =?UTF-8?q?=20ranges::view?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/gpu_array.hpp | 174 ++++++++++++++++++++++++++---------------- test/test.cpp | 14 ++-- 2 files changed, 116 insertions(+), 72 deletions(-) diff --git a/include/gpu_array.hpp b/include/gpu_array.hpp index f5dd90a..bbe0831 100644 --- a/include/gpu_array.hpp +++ b/include/gpu_array.hpp @@ -2923,7 +2923,9 @@ namespace gpu_array class stride_view : public ViewInterface> { public: - stride_view() = default; + stride_view() + requires std::default_initializable + = default; __host__ __device__ explicit stride_view(Range r) noexcept : range_(r) {} [[nodiscard]] __host__ __device__ auto begin() noexcept { @@ -3153,7 +3155,9 @@ namespace gpu_array class enumerate_view : public ViewInterface> { public: - enumerate_view() = default; + enumerate_view() + requires std::default_initializable + = default; __host__ __device__ explicit enumerate_view(Range r) noexcept : range_(r) {} [[nodiscard]] __host__ __device__ auto begin() noexcept { return enumerate_iterator(range_); } [[nodiscard]] __host__ __device__ auto begin() const noexcept @@ -3192,9 +3196,7 @@ namespace gpu_array }; } // namespace detail -#if !defined(ENABLE_HIP) using detail::enumerate_view; -#endif namespace views { @@ -3207,11 +3209,8 @@ namespace gpu_array namespace detail { - template - using first_t = std::tuple_element_t<0, std::tuple>; - - template - requires (std::is_lvalue_reference_v && ...) + template + requires (std::ranges::view && ...) class zip_iterator { public: @@ -3220,10 +3219,7 @@ namespace gpu_array using difference_type = std::common_type_t>...>; zip_iterator() = default; - __host__ __device__ explicit zip_iterator(Ranges&&... rs) noexcept - : pointers_(&rs...), index_(stride_iterator>::get_initial_index()) - { - } + __host__ __device__ explicit zip_iterator(Ranges&... rs) noexcept : pointers_(&rs...), index_(0) {} __host__ __device__ std::common_type_t...> index() const noexcept { return index_; @@ -3238,7 +3234,7 @@ namespace gpu_array } __host__ __device__ zip_iterator& operator++() noexcept { - index_ += stride_iterator>::get_stride(); + ++index_; return *this; } __host__ __device__ zip_iterator operator++(int) noexcept @@ -3247,21 +3243,82 @@ namespace gpu_array ++(*this); return res; } + __host__ __device__ zip_iterator& operator--() noexcept + { + --index_; + return *this; + } + __host__ __device__ zip_iterator operator--(int) noexcept + { + auto res = *this; + --(*this); + return res; + } + __host__ __device__ zip_iterator& operator+=(difference_type n) + { + index_ += n; + return *this; + } + __host__ __device__ zip_iterator& operator-=(difference_type n) + { + index_ -= n; + return *this; + } + __host__ __device__ std::tuple...> operator[]( + difference_type n) const + { + return *(*this + n); + } + __host__ __device__ bool operator==(const zip_iterator& it) const noexcept { return index_ == it.index_; } + __host__ __device__ bool operator<(const zip_iterator& it) const& noexcept { return index_ < it.index_; } + __host__ __device__ bool operator>(const zip_iterator& it) const& noexcept { return index_ > it.index_; } + __host__ __device__ bool operator<=(const zip_iterator& it) const& noexcept { return index_ <= it.index_; } + __host__ __device__ bool operator>=(const zip_iterator& it) const& noexcept { return index_ >= it.index_; } + + __host__ __device__ friend zip_iterator operator+(zip_iterator x, difference_type n) + { + x += n; + return x; + } + __host__ __device__ friend zip_iterator operator+(difference_type n, zip_iterator x) + { + x += n; + return x; + } + __host__ __device__ friend zip_iterator operator-(zip_iterator x, difference_type n) + { + x -= n; + return x; + } + __host__ __device__ friend difference_type operator-(const zip_iterator& x, const zip_iterator& y) + { + return x.index() - y.index(); + } + + __host__ __device__ friend auto iter_move(const zip_iterator& x) + { + return std::apply( + [&x](auto&... pointers) { + return std::tuple...>( + std::move((*pointers)[x.index()])...); + }, + x.pointers_); + } private: - std::tuple*...> pointers_{}; + std::tuple pointers_{}; std::common_type_t...> index_ = 0; }; - template - requires (std::is_lvalue_reference_v && ...) && (std::ranges::sized_range && ...) + template + requires (std::ranges::view && ...) && (std::ranges::sized_range && ...) class zip_sentinel { public: zip_sentinel() = default; - __host__ __device__ explicit zip_sentinel(Ranges&&... rs) noexcept : end_(std::min({rs.size()...})) {} - __host__ __device__ friend bool operator==(const zip_iterator& it, + __host__ __device__ explicit zip_sentinel(Ranges&... rs) noexcept : end_(std::min({rs.size()...})) {} + __host__ __device__ friend bool operator==(const zip_iterator& it, const zip_sentinel& se) noexcept { return it.index() >= se.end_; @@ -3271,80 +3328,61 @@ namespace gpu_array std::common_type_t...> end_ = 0; }; - template - requires (std::is_lvalue_reference_v && ...) - class zip_view : public ViewInterface> + template + requires (std::ranges::view && ...) + class zip_view : public ViewInterface> { public: - zip_view() = default; - __host__ __device__ explicit zip_view(Ranges&&... rs) noexcept : pointers_(&rs...) {} + zip_view() + requires (std::default_initializable && ...) + = default; + __host__ __device__ explicit zip_view(Ranges... rs) noexcept : ranges_(rs...) {} + [[nodiscard]] __host__ __device__ auto begin() noexcept + { + return std::apply([](auto&... ranges) { return zip_iterator(ranges...); }, ranges_); + } [[nodiscard]] __host__ __device__ auto begin() const noexcept + requires (std::is_const_v && ...) { - return std::apply( - [this](auto&... pointers) { return zip_iterator(*pointers...); }, pointers_); + return std::apply([](auto&... ranges) { return zip_iterator(ranges...); }, ranges_); + } + [[nodiscard]] __host__ __device__ auto end() noexcept + { + return std::apply([](auto&... ranges) { return zip_sentinel(ranges...); }, ranges_); } [[nodiscard]] __host__ __device__ auto end() const noexcept + requires (std::is_const_v && ...) { - return std::apply( - [this](auto&... pointers) { return zip_sentinel(*pointers...); }, pointers_); + return std::apply([](auto&... ranges) { return zip_sentinel(ranges...); }, ranges_); + } + [[nodiscard]] __host__ __device__ auto size() const noexcept + { + return std::apply([](auto&... ranges) { return std::min({ranges.size()...}); }, ranges_); } private: - std::tuple*...> pointers_{}; + std::tuple ranges_{}; }; - template struct zip_adapter { template requires (std::ranges::sized_range && ...) - [[nodiscard]] __host__ __device__ auto operator()(Ranges&... rs) const noexcept + [[nodiscard]] __host__ __device__ auto operator()(Ranges&&... rs) const noexcept { - return zip_view(rs...); + return zip_view...>(std::forward(rs)...); } }; } // namespace detail -#ifdef GPU_CHECK_ERROR - __device__ static constexpr detail::zip_adapter block_thread_zip_view; - __device__ static constexpr detail::zip_adapter grid_thread_zip_view; - __device__ static constexpr detail::zip_adapter grid_block_zip_view; -#if defined(_CG_HAS_CLUSTER_GROUP) - __device__ static constexpr detail::zip_adapter cluster_thread_zip_view; - __device__ static constexpr detail::zip_adapter cluster_block_zip_view; - __device__ static constexpr detail::zip_adapter grid_cluster_zip_view; -#endif -#else - inline constexpr detail::zip_adapter block_thread_zip_view; - inline constexpr detail::zip_adapter grid_thread_zip_view; - inline constexpr detail::zip_adapter grid_block_zip_view; -#if defined(_CG_HAS_CLUSTER_GROUP) - inline constexpr detail::zip_adapter cluster_thread_zip_view; - inline constexpr detail::zip_adapter cluster_block_zip_view; - inline constexpr detail::zip_adapter grid_cluster_zip_view; -#endif -#endif + using detail::zip_view; namespace views { #ifdef GPU_CHECK_ERROR - __device__ static constexpr detail::zip_adapter block_thread_zip; - __device__ static constexpr detail::zip_adapter grid_thread_zip; - __device__ static constexpr detail::zip_adapter grid_block_zip; -#if defined(_CG_HAS_CLUSTER_GROUP) - __device__ static constexpr detail::zip_adapter cluster_thread_zip; - __device__ static constexpr detail::zip_adapter cluster_block_zip; - __device__ static constexpr detail::zip_adapter grid_cluster_zip; -#endif + __device__ static constexpr detail::zip_adapter zip; #else - inline constexpr detail::zip_adapter block_thread_zip; - inline constexpr detail::zip_adapter grid_thread_zip; - inline constexpr detail::zip_adapter grid_block_zip; -#if defined(_CG_HAS_CLUSTER_GROUP) - inline constexpr detail::zip_adapter cluster_thread_zip; - inline constexpr detail::zip_adapter cluster_block_zip; - inline constexpr detail::zip_adapter grid_cluster_zip; -#endif + inline constexpr detail::zip_adapter zip; #endif } // namespace views } // namespace gpu_array @@ -3378,6 +3416,8 @@ template > = true; template inline constexpr bool std::ranges::enable_view> = true; +template +inline constexpr bool std::ranges::enable_view> = true; #undef SIGSEGV_DEPRECATED #undef INCR_GPU_MEMORY_USAGE diff --git a/test/test.cpp b/test/test.cpp index 8a06cd0..c360986 100644 --- a/test/test.cpp +++ b/test/test.cpp @@ -2155,21 +2155,25 @@ TEST(EnumerateView, Simple) } } +static_assert(detail::RandomAccessRange>>); +static_assert(std::ranges::sized_range>>); +static_assert(std::ranges::view>>); + // template // requires std::ranges::input_range> // __global__ void zip_test_init(T array, int coeff) // { -// for (auto&& [i, xs] : grid_block_enumerate_view(array)) -// for (auto&& [j, x] : block_thread_enumerate_view(xs)) x = (i * xs.size() + j) * coeff; +// for (auto&& [i, xs] : enumerate_view(array) | views::grid_block_stride) +// for (auto&& [j, x] : enumerate_view(xs) | views::block_thread_stride) x = (i * xs.size() + j) * coeff; // } // template // requires std::ranges::input_range> && // std::ranges::input_range> -// __global__ void kernel_zip(T array1, const U array2) +// __global__ void kernel_zip(T array1, U array2) // { -// for (auto&& [xs, ys] : views::grid_block_zip(array1, array2)) -// for (auto&& [x, y] : views::block_thread_zip(xs, ys)) x = x + y; +// for (auto&& [xs, ys] : views::zip(array1, array2) | views::grid_block_stride) +// for (auto&& [x, y] : views::zip(xs, ys) | views::block_thread_stride) x = x + y; // } // template From 7dd1c2882a96ed4f79bef80c4b6a5e52aaa15912 Mon Sep 17 00:00:00 2001 From: acd1034 <48613285+acd1034@users.noreply.github.com> Date: Tue, 3 Mar 2026 23:19:47 +0900 Subject: [PATCH 14/19] =?UTF-8?q?=F0=9F=90=9B=20Fix=20kernel=20crush=20bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/gpu_array.hpp | 118 +++++++++++++++++++++++++++++++++++++----- test/test.cpp | 117 +++++++++++++++++------------------------ 2 files changed, 152 insertions(+), 83 deletions(-) diff --git a/include/gpu_array.hpp b/include/gpu_array.hpp index bbe0831..87c1f68 100644 --- a/include/gpu_array.hpp +++ b/include/gpu_array.hpp @@ -48,6 +48,83 @@ #define SIGSEGV_DEPRECATED [[deprecated("Cannot access GPU memory directly")]] #endif +namespace gpu_array::detail +{ + // Custom implementation of tuple for device code + + template + struct tuple_leaf + { + using type = T; + T value; + }; + + template + tuple_leaf at_index(const tuple_leaf&); // undefined + + template + struct tuple_impl; + + template + struct tuple_impl, Ts...> : tuple_leaf... + { + }; + + template + struct tuple + { + __host__ __device__ tuple() + requires (std::default_initializable && ...) + = default; + __host__ __device__ tuple(Ts... ts) : base_{std::forward(ts)...} {} + template + __host__ __device__ friend auto& get(detail::tuple&); + template + __host__ __device__ friend const auto& get(const detail::tuple&); + template + __host__ __device__ friend auto&& get(detail::tuple&&); + template + __host__ __device__ friend const auto&& get(const detail::tuple&&); + + private: + using base = tuple_impl, Ts...>; + base base_; + }; + + template + __host__ __device__ auto& get(detail::tuple& t) + { + using leaf = decltype(at_index(t.base_)); + return static_cast(t.base_).value; + } + template + __host__ __device__ const auto& get(const detail::tuple& t) + { + using leaf = decltype(at_index(t.base_)); + return static_cast(t.base_).value; + } + template + __host__ __device__ auto&& get(detail::tuple&& t) + { + using leaf = decltype(at_index(t.base_)); + return static_cast(static_cast(t.base_).value); + } + template + __host__ __device__ const auto&& get(const detail::tuple&& t) + { + using leaf = decltype(at_index(t.base_)); + return static_cast(static_cast(t.base_).value); + } +} // namespace gpu_array::detail + +template +struct std::tuple_size> : std::integral_constant +{ +}; +template +struct std::tuple_element> : std::tuple_element> +{ +}; #if !defined(__cpp_lib_tuple_like) || __cpp_lib_tuple_like < 202207L template class TQual, template class UQual> requires requires { @@ -3209,13 +3286,28 @@ namespace gpu_array namespace detail { + template + __host__ __device__ auto apply_impl(F&& f, Tuple&& t, std::index_sequence) + -> decltype(std::forward(f)(detail::get(std::forward(t))...)) + { + return std::forward(f)(detail::get(std::forward(t))...); + } + + template + requires requires { std::tuple_size_v>; } + __host__ __device__ decltype(auto) apply(F&& f, Tuple&& t) + { + return apply_impl(std::forward(f), std::forward(t), + std::make_index_sequence>>{}); + } + template requires (std::ranges::view && ...) class zip_iterator { public: using iterator_category = std::forward_iterator_tag; - using value_type = std::tuple...>; + using value_type = detail::tuple...>; using difference_type = std::common_type_t>...>; zip_iterator() = default; @@ -3226,9 +3318,9 @@ namespace gpu_array } __host__ __device__ auto operator*() const noexcept { - return std::apply( + return detail::apply( [this](auto&... pointers) { - return std::tuple...>((*pointers)[index_]...); + return detail::tuple...>((*pointers)[index_]...); }, pointers_); } @@ -3264,7 +3356,7 @@ namespace gpu_array index_ -= n; return *this; } - __host__ __device__ std::tuple...> operator[]( + __host__ __device__ detail::tuple...> operator[]( difference_type n) const { return *(*this + n); @@ -3298,16 +3390,16 @@ namespace gpu_array __host__ __device__ friend auto iter_move(const zip_iterator& x) { - return std::apply( + return detail::apply( [&x](auto&... pointers) { - return std::tuple...>( + return detail::tuple...>( std::move((*pointers)[x.index()])...); }, x.pointers_); } private: - std::tuple pointers_{}; + detail::tuple pointers_{}; std::common_type_t...> index_ = 0; }; @@ -3339,29 +3431,29 @@ namespace gpu_array __host__ __device__ explicit zip_view(Ranges... rs) noexcept : ranges_(rs...) {} [[nodiscard]] __host__ __device__ auto begin() noexcept { - return std::apply([](auto&... ranges) { return zip_iterator(ranges...); }, ranges_); + return detail::apply([](auto&... ranges) { return zip_iterator(ranges...); }, ranges_); } [[nodiscard]] __host__ __device__ auto begin() const noexcept requires (std::is_const_v && ...) { - return std::apply([](auto&... ranges) { return zip_iterator(ranges...); }, ranges_); + return detail::apply([](auto&... ranges) { return zip_iterator(ranges...); }, ranges_); } [[nodiscard]] __host__ __device__ auto end() noexcept { - return std::apply([](auto&... ranges) { return zip_sentinel(ranges...); }, ranges_); + return detail::apply([](auto&... ranges) { return zip_sentinel(ranges...); }, ranges_); } [[nodiscard]] __host__ __device__ auto end() const noexcept requires (std::is_const_v && ...) { - return std::apply([](auto&... ranges) { return zip_sentinel(ranges...); }, ranges_); + return detail::apply([](auto&... ranges) { return zip_sentinel(ranges...); }, ranges_); } [[nodiscard]] __host__ __device__ auto size() const noexcept { - return std::apply([](auto&... ranges) { return std::min({ranges.size()...}); }, ranges_); + return detail::apply([](auto&... ranges) { return std::min({ranges.size()...}); }, ranges_); } private: - std::tuple ranges_{}; + detail::tuple ranges_{}; }; struct zip_adapter diff --git a/test/test.cpp b/test/test.cpp index c360986..940ee0f 100644 --- a/test/test.cpp +++ b/test/test.cpp @@ -2159,75 +2159,52 @@ static_assert(detail::RandomAccessRange>>); static_assert(std::ranges::sized_range>>); static_assert(std::ranges::view>>); -// template -// requires std::ranges::input_range> -// __global__ void zip_test_init(T array, int coeff) -// { -// for (auto&& [i, xs] : enumerate_view(array) | views::grid_block_stride) -// for (auto&& [j, x] : enumerate_view(xs) | views::block_thread_stride) x = (i * xs.size() + j) * coeff; -// } - -// template -// requires std::ranges::input_range> && -// std::ranges::input_range> -// __global__ void kernel_zip(T array1, U array2) -// { -// for (auto&& [xs, ys] : views::zip(array1, array2) | views::grid_block_stride) -// for (auto&& [x, y] : views::zip(xs, ys) | views::block_thread_stride) x = x + y; -// } - -// template -// requires std::ranges::input_range> && -// std::ranges::input_range> -// __global__ void kernel_zip2(T array1, const U array2) -// { -// for (auto&& [xs, ys] : grid_block_zip_view(array1, array2)) -// for (auto&& [x, y] : block_thread_zip_view(xs, ys)) x = x + y; -// } - -// TEST(ZipView, Simple) -// { -// auto vec_vec = std::vector(10, std::vector(20, 0)); -// auto array1 = managed_array(vec_vec); -// auto array2 = managed_array(vec_vec); -// zip_test_init<<<10, 20>>>(array1, 1); -// api::gpuDeviceSynchronize(); -// for (int i = 0; const auto& xs : array1) -// { -// for (int j = 0; const auto& x : xs) -// { -// EXPECT_EQ(x, i * 20 + j); -// ++j; -// } -// ++i; -// } - -// zip_test_init<<<10, 20>>>(array2, 1000); -// kernel_zip<<<10, 20>>>(array1, array2); -// api::gpuDeviceSynchronize(); -// for (int i = 0; const auto& xs : array1) -// { -// for (int j = 0; const auto& x : xs) -// { -// EXPECT_EQ(x, (i * 20 + j) * 1001); -// ++j; -// } -// ++i; -// } - -// zip_test_init<<<10, 20>>>(array1, 1); -// zip_test_init<<<10, 20>>>(array2, 2000); -// kernel_zip2<<<10, 20>>>(array1, array2); -// api::gpuDeviceSynchronize(); -// for (int i = 0; const auto& xs : array1) -// { -// for (int j = 0; const auto& x : xs) -// { -// EXPECT_EQ(x, (i * 20 + j) * 2001); -// ++j; -// } -// ++i; -// } -// } +template +requires std::ranges::input_range> +__global__ void zip_test_init(T array, int coeff) +{ + for (auto&& [i, xs] : enumerate_view(array) | views::grid_block_stride) + for (auto&& [j, x] : enumerate_view(xs) | views::block_thread_stride) x = (i * xs.size() + j) * coeff; +} + +template +requires std::ranges::input_range> && + std::ranges::input_range> +__global__ void kernel_zip(T array1, U array2) +{ + for (auto&& [xs, ys] : zip_view(array1, array2) | views::grid_block_stride) + for (auto&& [x, y] : zip_view(xs, ys) | views::block_thread_stride) x = x + y; +} + +TEST(ZipView, Simple) +{ + auto vec_vec = std::vector(10, std::vector(20, 0)); + auto array1 = managed_array(vec_vec); + auto array2 = managed_array(vec_vec); + zip_test_init<<<10, 20>>>(array1, 1); + api::gpuDeviceSynchronize(); + for (int i = 0; const auto& xs : array1) + { + for (int j = 0; const auto& x : xs) + { + EXPECT_EQ(x, i * 20 + j); + ++j; + } + ++i; + } + + zip_test_init<<<10, 20>>>(array2, 1000); + kernel_zip<<<10, 20>>>(array1, array2); + api::gpuDeviceSynchronize(); + for (int i = 0; const auto& xs : array1) + { + for (int j = 0; const auto& x : xs) + { + EXPECT_EQ(x, (i * 20 + j) * 1001); + ++j; + } + ++i; + } +} #endif // NOLINTEND From 9f37ae3c3b3fd8e293bc0ef16ee453891eb14b76 Mon Sep 17 00:00:00 2001 From: acd1034 <48613285+acd1034@users.noreply.github.com> Date: Tue, 3 Mar 2026 23:28:43 +0900 Subject: [PATCH 15/19] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Use=20custom=20tuple?= =?UTF-8?q?=20instead=20of=20std::pair=20in=20enumerate=5Fview?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/gpu_array.hpp | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/include/gpu_array.hpp b/include/gpu_array.hpp index 87c1f68..3e11d19 100644 --- a/include/gpu_array.hpp +++ b/include/gpu_array.hpp @@ -73,7 +73,7 @@ namespace gpu_array::detail template struct tuple { - __host__ __device__ tuple() + tuple() requires (std::default_initializable && ...) = default; __host__ __device__ tuple(Ts... ts) : base_{std::forward(ts)...} {} @@ -125,24 +125,19 @@ template struct std::tuple_element> : std::tuple_element> { }; -#if !defined(__cpp_lib_tuple_like) || __cpp_lib_tuple_like < 202207L -template class TQual, template class UQual> -requires requires { - typename std::pair, UQual>, std::common_reference_t, UQual>>; -} -struct std::basic_common_reference, std::pair, TQual, UQual> +template +requires requires { typename gpu_array::detail::tuple...>; } +struct std::common_type, gpu_array::detail::tuple> { - using type = - std::pair, UQual>, std::common_reference_t, UQual>>; + using type = gpu_array::detail::tuple...>; }; - -template -requires requires { typename std::pair, std::common_type_t>; } -struct std::common_type, std::pair> +template class TQual, template class UQual> +requires requires { typename gpu_array::detail::tuple, UQual>...>; } +struct std::basic_common_reference, gpu_array::detail::tuple, TQual, + UQual> { - using type = std::pair, std::common_type_t>; + using type = gpu_array::detail::tuple, UQual>...>; }; -#endif namespace gpu_array { @@ -3094,13 +3089,13 @@ namespace gpu_array { public: using iterator_category = std::random_access_iterator_tag; - using value_type = std::pair, std::ranges::range_value_t>; + using value_type = detail::tuple, std::ranges::range_value_t>; using difference_type = std::make_signed_t>; enumerate_iterator() = default; __host__ __device__ explicit enumerate_iterator(Range& r) noexcept : pointer_(&r), index_(0) {} __host__ __device__ std::ranges::range_size_t index() const noexcept { return index_; } - __host__ __device__ std::pair, std::ranges::range_reference_t> + __host__ __device__ detail::tuple, std::ranges::range_reference_t> operator*() const noexcept { return {index_, (*pointer_)[index_]}; @@ -3137,7 +3132,7 @@ namespace gpu_array index_ -= n; return *this; } - __host__ __device__ std::pair, std::ranges::range_reference_t> + __host__ __device__ detail::tuple, std::ranges::range_reference_t> operator[](difference_type n) const { return *(*this + n); @@ -3185,8 +3180,8 @@ namespace gpu_array return x.index() - y.index(); } - __host__ __device__ friend std::pair, - std::ranges::range_rvalue_reference_t> + __host__ __device__ friend detail::tuple, + std::ranges::range_rvalue_reference_t> iter_move(const enumerate_iterator& x) { return {x.index(), std::move(x->second)}; From 978b72d2f49333b5beceffaa28fda2fbff8b89b7 Mon Sep 17 00:00:00 2001 From: acd1034 <48613285+acd1034@users.noreply.github.com> Date: Wed, 4 Mar 2026 00:03:10 +0900 Subject: [PATCH 16/19] =?UTF-8?q?=E2=9C=85=20Add=20tests=20combining=20enu?= =?UTF-8?q?merate=20and=20zip?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/gpu_array.hpp | 9 +++----- test/test.cpp | 52 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 54 insertions(+), 7 deletions(-) diff --git a/include/gpu_array.hpp b/include/gpu_array.hpp index 3e11d19..4702637 100644 --- a/include/gpu_array.hpp +++ b/include/gpu_array.hpp @@ -3385,12 +3385,9 @@ namespace gpu_array __host__ __device__ friend auto iter_move(const zip_iterator& x) { - return detail::apply( - [&x](auto&... pointers) { - return detail::tuple...>( - std::move((*pointers)[x.index()])...); - }, - x.pointers_); + using Tuple = detail::tuple...>; + return detail::apply([&x](auto&... pointers) { return Tuple(std::move((*pointers)[x.index()])...); }, + x.pointers_); } private: diff --git a/test/test.cpp b/test/test.cpp index 940ee0f..1d83728 100644 --- a/test/test.cpp +++ b/test/test.cpp @@ -2170,7 +2170,7 @@ __global__ void zip_test_init(T array, int coeff) template requires std::ranges::input_range> && std::ranges::input_range> -__global__ void kernel_zip(T array1, U array2) +__global__ void kernel_zip(T array1, const U array2) { for (auto&& [xs, ys] : zip_view(array1, array2) | views::grid_block_stride) for (auto&& [x, y] : zip_view(xs, ys) | views::block_thread_stride) x = x + y; @@ -2206,5 +2206,55 @@ TEST(ZipView, Simple) ++i; } } + +template +__global__ void kernel_zip_enumerate(Ts ts, const Us us) +{ + for (auto&& [i, zipped] : zip_view(ts, us) | views::enumerate | views::grid_thread_stride) + { + auto&& [t, u] = zipped; + t = t * 100 + u * (i + 1); + } +} + +TEST(ZipView, WithEnumerate) +{ + auto vec1 = std::vector{19, 70, 86, 69}; + auto vec2 = std::vector{16, 6, 14, 17}; + auto array1 = managed_array(vec1); + auto array2 = managed_array(vec2); + kernel_zip_enumerate<<<1, 2>>>(array1, array2); + api::gpuDeviceSynchronize(); + for (int i = 0; const auto& t : array1) + { + EXPECT_EQ(t, vec1[i] * 100 + vec2[i] * (i + 1)); + ++i; + } +} + +template +__global__ void kernel_enumerate_zip(Ts ts, const Us us) +{ + for (auto&& [enumerated, u] : zip_view(enumerate_view(ts), us) | views::grid_thread_stride) + { + auto&& [i, t] = enumerated; + t = t * 100 + u * (i + 1); + } +} + +TEST(EnumerateView, WithZip) +{ + auto vec1 = std::vector{19, 70, 86, 69}; + auto vec2 = std::vector{16, 6, 14, 17}; + auto array1 = managed_array(vec1); + auto array2 = managed_array(vec2); + kernel_enumerate_zip<<<1, 2>>>(array1, array2); + api::gpuDeviceSynchronize(); + for (int i = 0; const auto& t : array1) + { + EXPECT_EQ(t, vec1[i] * 100 + vec2[i] * (i + 1)); + ++i; + } +} #endif // NOLINTEND From 48e57b4857cfa8bab78f3f151c86b4f7f941dbb3 Mon Sep 17 00:00:00 2001 From: acd1034 <48613285+acd1034@users.noreply.github.com> Date: Wed, 4 Mar 2026 12:20:24 +0900 Subject: [PATCH 17/19] =?UTF-8?q?=F0=9F=9A=9A=20Rename=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/test/test.cpp b/test/test.cpp index 1d83728..8447e6b 100644 --- a/test/test.cpp +++ b/test/test.cpp @@ -2131,18 +2131,18 @@ static_assert(std::ranges::view>>); template requires std::ranges::input_range> -__global__ void kernel_enumerate(T array) +__global__ void kernel_enumerate_stride(T array) { for (auto&& [i, xs] : enumerate_view(array) | views::grid_block_stride) for (auto&& [j, x] : enumerate_view(xs) | views::block_thread_stride) x = i * 100 + j; } -TEST(EnumerateView, Simple) +TEST(EnumerateView, WithStride) { auto vec_vec = std::vector(32, std::vector(64, 0)); auto nested_array = managed_array(vec_vec); - kernel_enumerate<<<32, 64>>>(nested_array); + kernel_enumerate_stride<<<32, 64>>>(nested_array); api::gpuDeviceSynchronize(); for (int i = 0; const auto& xs : nested_array) { @@ -2170,13 +2170,13 @@ __global__ void zip_test_init(T array, int coeff) template requires std::ranges::input_range> && std::ranges::input_range> -__global__ void kernel_zip(T array1, const U array2) +__global__ void kernel_zip_stride(T array1, const U array2) { for (auto&& [xs, ys] : zip_view(array1, array2) | views::grid_block_stride) for (auto&& [x, y] : zip_view(xs, ys) | views::block_thread_stride) x = x + y; } -TEST(ZipView, Simple) +TEST(ZipView, WithStride) { auto vec_vec = std::vector(10, std::vector(20, 0)); auto array1 = managed_array(vec_vec); @@ -2194,7 +2194,7 @@ TEST(ZipView, Simple) } zip_test_init<<<10, 20>>>(array2, 1000); - kernel_zip<<<10, 20>>>(array1, array2); + kernel_zip_stride<<<10, 20>>>(array1, array2); api::gpuDeviceSynchronize(); for (int i = 0; const auto& xs : array1) { @@ -2208,7 +2208,7 @@ TEST(ZipView, Simple) } template -__global__ void kernel_zip_enumerate(Ts ts, const Us us) +__global__ void kernel_zip_enumerate_stride(Ts ts, const Us us) { for (auto&& [i, zipped] : zip_view(ts, us) | views::enumerate | views::grid_thread_stride) { @@ -2217,13 +2217,13 @@ __global__ void kernel_zip_enumerate(Ts ts, const Us us) } } -TEST(ZipView, WithEnumerate) +TEST(ZipView, WithEnumerateStride) { auto vec1 = std::vector{19, 70, 86, 69}; auto vec2 = std::vector{16, 6, 14, 17}; auto array1 = managed_array(vec1); auto array2 = managed_array(vec2); - kernel_zip_enumerate<<<1, 2>>>(array1, array2); + kernel_zip_enumerate_stride<<<1, 2>>>(array1, array2); api::gpuDeviceSynchronize(); for (int i = 0; const auto& t : array1) { @@ -2233,7 +2233,7 @@ TEST(ZipView, WithEnumerate) } template -__global__ void kernel_enumerate_zip(Ts ts, const Us us) +__global__ void kernel_enumerate_zip_stride(Ts ts, const Us us) { for (auto&& [enumerated, u] : zip_view(enumerate_view(ts), us) | views::grid_thread_stride) { @@ -2242,13 +2242,13 @@ __global__ void kernel_enumerate_zip(Ts ts, const Us us) } } -TEST(EnumerateView, WithZip) +TEST(EnumerateView, WithZipStride) { auto vec1 = std::vector{19, 70, 86, 69}; auto vec2 = std::vector{16, 6, 14, 17}; auto array1 = managed_array(vec1); auto array2 = managed_array(vec2); - kernel_enumerate_zip<<<1, 2>>>(array1, array2); + kernel_enumerate_zip_stride<<<1, 2>>>(array1, array2); api::gpuDeviceSynchronize(); for (int i = 0; const auto& t : array1) { From f6c576b4e7d3ff2bec770e8ed42c2632b80c0b93 Mon Sep 17 00:00:00 2001 From: acd1034 <48613285+acd1034@users.noreply.github.com> Date: Wed, 4 Mar 2026 12:29:45 +0900 Subject: [PATCH 18/19] =?UTF-8?q?=E2=9C=85=20Add=20test=20for=20enumerate?= =?UTF-8?q?=20and=20zip?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test.cpp | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/test/test.cpp b/test/test.cpp index 8447e6b..bd81477 100644 --- a/test/test.cpp +++ b/test/test.cpp @@ -2129,6 +2129,35 @@ static_assert(detail::RandomAccessRange>>); static_assert(std::ranges::sized_range>>); static_assert(std::ranges::view>>); +template +requires std::ranges::input_range> +__global__ void kernel_enumerate(const Ts ts, Uss uss) +{ + auto i = cooperative_groups::this_thread_block().thread_rank(); + for (auto&& [j, t] : enumerate_view(ts)) + { + uss[i][j] = t * (j + 1); + } +} + +TEST(EnumerateView, Simple) +{ + auto vec1 = std::vector{16, 6, 14, 17}; + auto vec2 = std::vector(10, std::vector(4, 0)); + auto array1 = managed_array(vec1); + auto array2 = managed_array(vec2); + kernel_enumerate<<<1, 10>>>(array1, array2); + api::gpuDeviceSynchronize(); + for (const auto& us : array2) + { + for (int j = 0; const auto& u : us) + { + EXPECT_EQ(u, vec1[j] * (j + 1)); + ++j; + } + } +} + template requires std::ranges::input_range> __global__ void kernel_enumerate_stride(T array) @@ -2159,6 +2188,30 @@ static_assert(detail::RandomAccessRange>>); static_assert(std::ranges::sized_range>>); static_assert(std::ranges::view>>); +template +__global__ void kernel_zip(Ts ts, const Us us) +{ + for (auto&& [t, u] : zip_view(ts, us)) + { + t = t + u; + } +} + +TEST(ZipView, Simple) +{ + auto vec1 = std::vector{19, 70, 86, 69}; + auto vec2 = std::vector{16, 6, 14, 17}; + auto array1 = managed_array(vec1); + auto array2 = managed_array(vec2); + kernel_zip<<<1, 2>>>(array1, array2); + api::gpuDeviceSynchronize(); + for (int i = 0; const auto& t : array1) + { + EXPECT_EQ(t, vec1[i] + vec2[i]); + ++i; + } +} + template requires std::ranges::input_range> __global__ void zip_test_init(T array, int coeff) From 5406178c0021b197192bc793992b3d80e19f612b Mon Sep 17 00:00:00 2001 From: acd1034 <48613285+acd1034@users.noreply.github.com> Date: Thu, 5 Mar 2026 18:28:35 +0900 Subject: [PATCH 19/19] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Replace=20std::tuple?= =?UTF-8?q?=20with=20custom=20tuple?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/gpu_array.hpp | 536 +++++++++++++++++++++++++----------------- test/test.cpp | 197 ++++++++-------- 2 files changed, 420 insertions(+), 313 deletions(-) diff --git a/include/gpu_array.hpp b/include/gpu_array.hpp index 4702637..24aabee 100644 --- a/include/gpu_array.hpp +++ b/include/gpu_array.hpp @@ -48,95 +48,195 @@ #define SIGSEGV_DEPRECATED [[deprecated("Cannot access GPU memory directly")]] #endif -namespace gpu_array::detail +namespace gpu_array { // Custom implementation of tuple for device code - template - struct tuple_leaf + namespace detail { - using type = T; - T value; - }; + template + struct tuple_leaf + { + tuple_leaf() + requires std::default_initializable + = default; + template + __host__ __device__ tuple_leaf(U&& u) : value(std::forward(u)) + { + } + using type = T; + T value; + }; - template - tuple_leaf at_index(const tuple_leaf&); // undefined + template + tuple_leaf at_index(const tuple_leaf&); // undefined - template - struct tuple_impl; + template + struct tuple_impl; - template - struct tuple_impl, Ts...> : tuple_leaf... - { - }; + template + struct tuple_impl, Ts...> : tuple_leaf... + { + tuple_impl() + requires (std::default_initializable && ...) + = default; + template + requires (sizeof...(Us) == sizeof...(Ts)) + __host__ __device__ tuple_impl(Us&&... us) : tuple_leaf(std::forward(us))... + { + } + }; + + template + struct tuple + { + tuple() + requires (std::default_initializable && ...) + = default; + + template + struct is_single_tuple : std::false_type + { + }; + template + struct is_single_tuple> : std::true_type + { + }; + template + requires (sizeof...(Us) == sizeof...(Ts) && !is_single_tuple...>::value) + __host__ __device__ tuple(Us&&... us) : base_(std::forward(us)...) + { + } + + template + requires (sizeof...(Us) == sizeof...(Ts)) + __host__ __device__ tuple(const tuple& t) : tuple{tuple_convert(t, std::index_sequence_for{})} + { + } + template + requires (sizeof...(Us) == sizeof...(Ts)) + __host__ __device__ tuple(tuple&& t) + : tuple{tuple_convert(std::move(t), std::index_sequence_for{})} + { + } + template + requires (sizeof...(Us) == sizeof...(Ts)) + __host__ __device__ auto& operator=(const tuple& t) + { + *this = tuple_convert(t, std::index_sequence_for{}); + return *this; + } + template + requires (sizeof...(Us) == sizeof...(Ts)) + __host__ __device__ auto& operator=(tuple&& t) + { + *this = tuple_convert(std::move(t), std::index_sequence_for{}); + return *this; + } + + template + __host__ __device__ friend auto& get(tuple&); + template + __host__ __device__ friend const auto& get(const tuple&); + template + __host__ __device__ friend auto&& get(tuple&&); + template + __host__ __device__ friend const auto&& get(const tuple&&); + + private: + template + static auto tuple_convert(const tuple& t, std::index_sequence) + { + return tuple{get(t)...}; + } + template + static auto tuple_convert(tuple&& t, std::index_sequence) + { + return tuple{std::move(get(t))...}; + } + + using base = tuple_impl, Ts...>; + base base_; + }; + + template + tuple(Ts...) -> tuple; - template - struct tuple - { - tuple() - requires (std::default_initializable && ...) - = default; - __host__ __device__ tuple(Ts... ts) : base_{std::forward(ts)...} {} template - __host__ __device__ friend auto& get(detail::tuple&); + __host__ __device__ auto& get(tuple& t) + { + using leaf = decltype(at_index(t.base_)); + return static_cast(t.base_).value; + } template - __host__ __device__ friend const auto& get(const detail::tuple&); + __host__ __device__ const auto& get(const tuple& t) + { + using leaf = decltype(at_index(t.base_)); + return static_cast(t.base_).value; + } template - __host__ __device__ friend auto&& get(detail::tuple&&); + __host__ __device__ auto&& get(tuple&& t) + { + using leaf = decltype(at_index(t.base_)); + return static_cast(static_cast(t.base_).value); + } template - __host__ __device__ friend const auto&& get(const detail::tuple&&); + __host__ __device__ const auto&& get(const tuple&& t) + { + using leaf = decltype(at_index(t.base_)); + return static_cast(static_cast(t.base_).value); + } - private: - using base = tuple_impl, Ts...>; - base base_; - }; + template + __host__ __device__ bool tuple_equal_impl(const tuple& t, const tuple& u, + std::index_sequence) + { + return ((get(t) == get(u)) && ...); + } + template + requires (sizeof...(Ts) == sizeof...(Us)) + __host__ __device__ bool operator==(const tuple& t, const tuple& u) + { + return tuple_equal_impl(t, u, std::index_sequence_for{}); + } - template - __host__ __device__ auto& get(detail::tuple& t) - { - using leaf = decltype(at_index(t.base_)); - return static_cast(t.base_).value; - } - template - __host__ __device__ const auto& get(const detail::tuple& t) - { - using leaf = decltype(at_index(t.base_)); - return static_cast(t.base_).value; - } - template - __host__ __device__ auto&& get(detail::tuple&& t) - { - using leaf = decltype(at_index(t.base_)); - return static_cast(static_cast(t.base_).value); - } - template - __host__ __device__ const auto&& get(const detail::tuple&& t) - { - using leaf = decltype(at_index(t.base_)); - return static_cast(static_cast(t.base_).value); - } -} // namespace gpu_array::detail + template + requires requires { std::declval()(get(std::declval())...); } + __host__ __device__ decltype(auto) apply_impl(F&& f, Tuple&& t, std::index_sequence) + { + return std::forward(f)(get(std::forward(t))...); + } + template + requires requires { std::tuple_size_v>; } + __host__ __device__ decltype(auto) apply(F&& f, Tuple&& t) + { + return apply_impl(std::forward(f), std::forward(t), + std::make_index_sequence>>{}); + } + } // namespace detail + + using detail::tuple, detail::get, detail::apply; +} // namespace gpu_array template -struct std::tuple_size> : std::integral_constant +struct std::tuple_size> : std::integral_constant { }; template -struct std::tuple_element> : std::tuple_element> +struct std::tuple_element> : std::tuple_element> { }; template -requires requires { typename gpu_array::detail::tuple...>; } -struct std::common_type, gpu_array::detail::tuple> +requires requires { typename gpu_array::tuple...>; } +struct std::common_type, gpu_array::tuple> { - using type = gpu_array::detail::tuple...>; + using type = gpu_array::tuple...>; }; template class TQual, template class UQual> -requires requires { typename gpu_array::detail::tuple, UQual>...>; } -struct std::basic_common_reference, gpu_array::detail::tuple, TQual, - UQual> +requires requires { typename gpu_array::tuple, UQual>...>; } +struct std::basic_common_reference, gpu_array::tuple, TQual, UQual> { - using type = gpu_array::detail::tuple, UQual>...>; + using type = gpu_array::tuple, UQual>...>; }; namespace gpu_array @@ -211,11 +311,11 @@ namespace gpu_array protected: size_type size_ = 0U; - std::tuple data_; + gpu_array::tuple data_; std::uint32_t* ref_count_ = nullptr; // reference counter, not used on GPU template - using element_type = std::tuple_element_t>; + using element_type = std::tuple_element_t>; __host__ __device__ void init() { @@ -346,11 +446,11 @@ namespace gpu_array __host__ __device__ void tuple_for_each(auto&& f) const { - std::apply([&f](const auto&... args) { (f(args), ...); }, data_); + gpu_array::apply([&f](const auto&... args) { (f(args), ...); }, data_); } __host__ __device__ void tuple_for_each(auto&& f) { - std::apply([&f](auto&... args) { (f(args), ...); }, data_); + gpu_array::apply([&f](auto&... args) { (f(args), ...); }, data_); } }; @@ -518,8 +618,8 @@ namespace gpu_array { return *(data() + base::size_ - 1); } - __host__ __device__ pointer data() noexcept { return std::get<0>(base::data_); } - __host__ __device__ const_pointer data() const noexcept { return std::get<0>(base::data_); } + __host__ __device__ pointer data() noexcept { return gpu_array::get<0>(base::data_); } + __host__ __device__ const_pointer data() const noexcept { return gpu_array::get<0>(base::data_); } array() = default; __host__ __device__ array(const array& r) : base(r) {} @@ -529,8 +629,8 @@ namespace gpu_array { if (base::size_ == 0) return; auto buf = std::make_unique(base::size_); - GPU_CHECK_ERROR( - api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type) * base::size_)); + GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&gpu_array::get<0>(base::data_)), + sizeof(value_type) * base::size_)); assert(data() != nullptr); GPU_CHECK_ERROR(api::gpuMemcpy(data(), buf.get(), sizeof(value_type) * base::size_, gpuMemcpyHostToDevice)); } @@ -538,8 +638,8 @@ namespace gpu_array __host__ array(std::size_t size, default_init_tag) : base(size) { if (base::size_ == 0) return; - GPU_CHECK_ERROR( - api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type) * base::size_)); + GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&gpu_array::get<0>(base::data_)), + sizeof(value_type) * base::size_)); assert(data() != nullptr); if constexpr (!std::is_trivially_default_constructible_v) { @@ -556,8 +656,8 @@ namespace gpu_array auto al = std::allocator(); auto buf = al.allocate(base::size_); std::ranges::uninitialized_fill(buf, buf + base::size_, value); - GPU_CHECK_ERROR( - api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type) * base::size_)); + GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&gpu_array::get<0>(base::data_)), + sizeof(value_type) * base::size_)); assert(data() != nullptr); GPU_CHECK_ERROR(api::gpuMemcpy(data(), buf, sizeof(value_type) * base::size_, gpuMemcpyHostToDevice)); al.deallocate(buf, base::size_); @@ -572,8 +672,8 @@ namespace gpu_array { if (base::size_ == 0) return; - GPU_CHECK_ERROR( - api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type) * base::size_)); + GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&gpu_array::get<0>(base::data_)), + sizeof(value_type) * base::size_)); assert(data() != nullptr); GPU_CHECK_ERROR( api::gpuMemcpy(data(), std::ranges::data(r), sizeof(value_type) * base::size_, gpuMemcpyHostToDevice)); @@ -590,8 +690,8 @@ namespace gpu_array auto buf = al.allocate(base::size_); for (auto i = std::size_t{0}; const auto& v : r) std::ranges::construct_at(buf + i++, v); - GPU_CHECK_ERROR( - api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type) * base::size_)); + GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&gpu_array::get<0>(base::data_)), + sizeof(value_type) * base::size_)); assert(data() != nullptr); GPU_CHECK_ERROR(api::gpuMemcpy(data(), buf, sizeof(value_type) * base::size_, gpuMemcpyHostToDevice)); al.deallocate(buf, base::size_); @@ -600,8 +700,8 @@ namespace gpu_array __host__ array(std::initializer_list r) : base(std::ranges::size(r)) { if (base::size_ == 0) return; - GPU_CHECK_ERROR( - api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type) * base::size_)); + GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&gpu_array::get<0>(base::data_)), + sizeof(value_type) * base::size_)); assert(data() != nullptr); GPU_CHECK_ERROR( api::gpuMemcpy(data(), std::ranges::data(r), sizeof(ValueType) * base::size_, gpuMemcpyHostToDevice)); @@ -801,7 +901,7 @@ namespace gpu_array { throw std::runtime_error("pointer type mismatch: expected device memory pointer"); } - std::get<0>(base::data_) = ptr; + gpu_array::get<0>(base::data_) = ptr; } #endif #if defined(GPU_OVERLOAD_DEVICE) @@ -847,7 +947,7 @@ namespace gpu_array [](auto acc, const auto& r) { return acc + std::ranges::size(r); })) { if (base::size_ == 0) return; - GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), + GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(ValueType) * base::size_)); assert(data() != nullptr); @@ -903,8 +1003,8 @@ namespace gpu_array __host__ __device__ const_reference front() const noexcept { return *begin(); } __host__ __device__ reference back() noexcept { return *(data() + base::size_ - 1); } __host__ __device__ const_reference back() const noexcept { return *(data() + base::size_ - 1); } - __host__ __device__ pointer data() noexcept { return std::get<0>(base::data_); } - __host__ __device__ const_pointer data() const noexcept { return std::get<0>(base::data_); } + __host__ __device__ pointer data() noexcept { return gpu_array::get<0>(base::data_); } + __host__ __device__ const_pointer data() const noexcept { return gpu_array::get<0>(base::data_); } managed_array() = default; __host__ __device__ managed_array(const managed_array& r) : base(r) {} @@ -913,7 +1013,7 @@ namespace gpu_array __host__ explicit managed_array(std::size_t size) : base(size) { if (base::size_ == 0) return; - GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), + GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(value_type) * base::size_)); assert(data() != nullptr); std::ranges::uninitialized_value_construct(*this); @@ -922,7 +1022,7 @@ namespace gpu_array __host__ explicit managed_array(std::size_t size, default_init_tag) : base(size) { if (base::size_ == 0) return; - GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), + GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(value_type) * base::size_)); assert(data() != nullptr); std::ranges::uninitialized_default_construct(*this); @@ -931,7 +1031,7 @@ namespace gpu_array __host__ managed_array(std::size_t size, const value_type& value) : base(size) { if (base::size_ == 0) return; - GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), + GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(value_type) * base::size_)); assert(data() != nullptr); std::ranges::uninitialized_fill(*this, value); @@ -942,7 +1042,7 @@ namespace gpu_array __host__ explicit managed_array(const T& r) : base(std::ranges::size(r)) { if (base::size_ == 0) return; - GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), + GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(value_type) * base::size_)); assert(data() != nullptr); @@ -955,7 +1055,7 @@ namespace gpu_array __host__ managed_array(std::initializer_list r) : base(std::ranges::size(r)) { if (base::size_ == 0) return; - GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), + GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(value_type) * base::size_)); assert(data() != nullptr); for (auto i = std::size_t{0}; const auto& v : r) std::ranges::construct_at(data() + i++, v); @@ -1161,7 +1261,7 @@ namespace gpu_array { throw std::runtime_error("pointer type mismatch: expected managed memory pointer"); } - std::get<0>(base::data_) = ptr; + gpu_array::get<0>(base::data_) = ptr; } #endif #if defined(GPU_OVERLOAD_DEVICE) @@ -1216,7 +1316,8 @@ namespace gpu_array __host__ explicit value(default_init_tag) : base(1) { - GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type))); + GPU_CHECK_ERROR( + api::gpuMalloc(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(value_type))); assert(get() != nullptr); if constexpr (!std::is_trivially_default_constructible_v) { @@ -1227,7 +1328,8 @@ namespace gpu_array __host__ explicit value(const value_type& r) : base(1) { - GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type))); + GPU_CHECK_ERROR( + api::gpuMalloc(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(value_type))); assert(get() != nullptr); GPU_CHECK_ERROR(api::gpuMemcpy(get(), &r, sizeof(value_type), gpuMemcpyHostToDevice)); } @@ -1237,7 +1339,8 @@ namespace gpu_array __host__ explicit value(Args&&... args) : base(1) { auto temp = value_type(std::forward(args)...); - GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type))); + GPU_CHECK_ERROR( + api::gpuMalloc(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(value_type))); assert(get() != nullptr); GPU_CHECK_ERROR(api::gpuMemcpy(get(), &temp, sizeof(value_type), gpuMemcpyHostToDevice)); } @@ -1253,7 +1356,7 @@ namespace gpu_array { throw std::runtime_error("pointer type mismatch: expected device memory pointer"); } - std::get<0>(base::data_) = ptr; + gpu_array::get<0>(base::data_) = ptr; } #else : base(ptr, ptr == nullptr ? 0 : 1) @@ -1272,8 +1375,11 @@ namespace gpu_array return *this; } - __host__ __device__ pointer get() const noexcept { return std::get<0>(base::data_); } - __host__ __device__ explicit operator bool() const noexcept { return std::get<0>(base::data_) != nullptr; } + __host__ __device__ pointer get() const noexcept { return gpu_array::get<0>(base::data_); } + __host__ __device__ explicit operator bool() const noexcept + { + return gpu_array::get<0>(base::data_) != nullptr; + } #if defined(GPU_OVERLOAD_DEVICE) __device__ reference operator*() const noexcept @@ -1343,7 +1449,7 @@ namespace gpu_array __host__ explicit managed_value(default_init_tag) : base(1) { GPU_CHECK_ERROR( - api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), sizeof(ValueType))); + api::gpuMallocManaged(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(ValueType))); assert(get() != nullptr); std::ranges::uninitialized_default_construct_n(get(), 1); } @@ -1351,7 +1457,7 @@ namespace gpu_array __host__ explicit managed_value(const ValueType& r) : base(1) { GPU_CHECK_ERROR( - api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), sizeof(ValueType))); + api::gpuMallocManaged(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(ValueType))); assert(get() != nullptr); std::ranges::construct_at(get(), r); } @@ -1359,7 +1465,7 @@ namespace gpu_array __host__ explicit managed_value(ValueType&& r) : base(1) { GPU_CHECK_ERROR( - api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), sizeof(ValueType))); + api::gpuMallocManaged(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(ValueType))); assert(get() != nullptr); std::ranges::construct_at(get(), std::move(r)); } @@ -1369,7 +1475,7 @@ namespace gpu_array __host__ explicit managed_value(Args&&... args) : base(1) { GPU_CHECK_ERROR( - api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), sizeof(ValueType))); + api::gpuMallocManaged(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(ValueType))); assert(get() != nullptr); std::ranges::construct_at(get(), std::forward(args)...); } @@ -1385,7 +1491,7 @@ namespace gpu_array { throw std::runtime_error("pointer type mismatch: expected managed memory pointer"); } - std::get<0>(base::data_) = ptr; + gpu_array::get<0>(base::data_) = ptr; } #else : base(ptr, ptr == nullptr ? 0 : 1) @@ -1415,8 +1521,11 @@ namespace gpu_array return get(); } - __host__ __device__ pointer get() const noexcept { return std::get<0>(base::data_); } - __host__ __device__ explicit operator bool() const noexcept { return std::get<0>(base::data_) != nullptr; } + __host__ __device__ pointer get() const noexcept { return gpu_array::get<0>(base::data_); } + __host__ __device__ explicit operator bool() const noexcept + { + return gpu_array::get<0>(base::data_) != nullptr; + } __host__ void prefetch(int device_id, api::gpuStream_t stream = 0, bool recursive = true) const { @@ -1467,10 +1576,10 @@ namespace gpu_array template constexpr bool assignable_to_tuple_helper_n() { - return requires(const Tuple& t1, std::tuple& t2) { - std::get(t1); - std::get(t2); - requires std::assignable_from(t2)), decltype(std::get(t1))>; + return requires(const Tuple& t1, gpu_array::tuple& t2) { + gpu_array::get(t1); + gpu_array::get(t2); + requires std::assignable_from(t2)), decltype(gpu_array::get(t1))>; }; } template @@ -1487,7 +1596,7 @@ namespace gpu_array template