From b5625e98b3bbff1116137cbca19d687fd42d6ee4 Mon Sep 17 00:00:00 2001 From: acd1034 <48613285+acd1034@users.noreply.github.com> Date: Thu, 5 Mar 2026 17:44:49 +0900 Subject: [PATCH 1/3] =?UTF-8?q?=E2=9C=A8=20Implement=20custom=20tuple?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/gpu_array.hpp | 419 ++++++++++++++++++++++++++++++------------ test/test.cpp | 194 +++++++++---------- 2 files changed, 399 insertions(+), 214 deletions(-) diff --git a/include/gpu_array.hpp b/include/gpu_array.hpp index aa56b8a..fc78c80 100644 --- a/include/gpu_array.hpp +++ b/include/gpu_array.hpp @@ -48,6 +48,187 @@ #define SIGSEGV_DEPRECATED [[deprecated("Cannot access GPU memory directly")]] #endif +namespace gpu_array::detail +{ + // Custom implementation of tuple for device code + + template + struct tuple_leaf + { + using type = T; + T value; + }; + + template + tuple_leaf at_index(const tuple_leaf&); // undefined + + template + struct tuple_impl; + + template + struct tuple_impl, Ts...> : tuple_leaf... + { + tuple_impl() + requires (std::default_initializable && ...) + = default; + template + requires (sizeof...(Us) == sizeof...(Ts)) + __host__ __device__ tuple_impl(Us&&... us) : tuple_leaf(std::forward(us))... + { + } + }; + + template + struct tuple + { + tuple() + requires (std::default_initializable && ...) + = default; + + template + struct is_single_tuple : std::false_type + { + }; + template + struct is_single_tuple> : std::true_type + { + }; + template + requires (sizeof...(Us) == sizeof...(Ts) && !is_single_tuple...>::value) + __host__ __device__ tuple(Us&&... us) : base_{std::forward(us)...} + { + } + + template + requires (sizeof...(Us) == sizeof...(Ts)) + __host__ __device__ tuple(const detail::tuple& t) + : tuple{tuple_convert(t, std::index_sequence_for{})} + { + } + template + requires (sizeof...(Us) == sizeof...(Ts)) + __host__ __device__ tuple(detail::tuple&& t) + : tuple{tuple_convert(std::move(t), std::index_sequence_for{})} + { + } + template + requires (sizeof...(Us) == sizeof...(Ts)) + __host__ __device__ auto& operator=(const detail::tuple& t) + { + *this = tuple_convert(t, std::index_sequence_for{}); + return *this; + } + template + requires (sizeof...(Us) == sizeof...(Ts)) + __host__ __device__ auto& operator=(detail::tuple&& t) + { + *this = tuple_convert(std::move(t), std::index_sequence_for{}); + return *this; + } + + template + __host__ __device__ friend auto& get(detail::tuple&); + template + __host__ __device__ friend const auto& get(const detail::tuple&); + template + __host__ __device__ friend auto&& get(detail::tuple&&); + template + __host__ __device__ friend const auto&& get(const detail::tuple&&); + + private: + template + static auto tuple_convert(const detail::tuple& t, std::index_sequence) + { + return detail::tuple{get(t)...}; + } + template + static auto tuple_convert(detail::tuple&& t, std::index_sequence) + { + return detail::tuple{std::move(get(t))...}; + } + + using base = tuple_impl, Ts...>; + base base_; + }; + + template + tuple(Ts...) -> tuple; + + template + __host__ __device__ auto& get(detail::tuple& t) + { + using leaf = decltype(at_index(t.base_)); + return static_cast(t.base_).value; + } + template + __host__ __device__ const auto& get(const detail::tuple& t) + { + using leaf = decltype(at_index(t.base_)); + return static_cast(t.base_).value; + } + template + __host__ __device__ auto&& get(detail::tuple&& t) + { + using leaf = decltype(at_index(t.base_)); + return static_cast(static_cast(t.base_).value); + } + template + __host__ __device__ const auto&& get(const detail::tuple&& t) + { + using leaf = decltype(at_index(t.base_)); + return static_cast(static_cast(t.base_).value); + } + + template + __host__ __device__ bool tuple_equal_impl(const detail::tuple& t, const detail::tuple& u, + std::index_sequence) + { + return ((detail::get(t) == detail::get(u)) && ...); + } + template + requires (sizeof...(Ts) == sizeof...(Us)) + __host__ __device__ bool operator==(const detail::tuple& t, const detail::tuple& u) + { + return tuple_equal_impl(t, u, std::index_sequence_for{}); + } + + template + requires requires { std::declval()(detail::get(std::declval())...); } + __host__ __device__ decltype(auto) apply_impl(F&& f, Tuple&& t, std::index_sequence) + { + return std::forward(f)(detail::get(std::forward(t))...); + } + template + requires requires { std::tuple_size_v>; } + __host__ __device__ decltype(auto) apply(F&& f, Tuple&& t) + { + return apply_impl(std::forward(f), std::forward(t), + std::make_index_sequence>>{}); + } +} // namespace gpu_array::detail + +template +struct std::tuple_size> : std::integral_constant +{ +}; +template +struct std::tuple_element> : std::tuple_element> +{ +}; +template +requires requires { typename gpu_array::detail::tuple...>; } +struct std::common_type, gpu_array::detail::tuple> +{ + using type = gpu_array::detail::tuple...>; +}; +template class TQual, template class UQual> +requires requires { typename gpu_array::detail::tuple, UQual>...>; } +struct std::basic_common_reference, gpu_array::detail::tuple, TQual, + UQual> +{ + using type = gpu_array::detail::tuple, UQual>...>; +}; + namespace gpu_array { #if defined(GPU_USE_32BIT_SIZE_TYPE_DEFAULT) @@ -120,11 +301,11 @@ namespace gpu_array protected: size_type size_ = 0U; - std::tuple data_; + detail::tuple data_; std::uint32_t* ref_count_ = nullptr; // reference counter, not used on GPU template - using element_type = std::tuple_element_t>; + using element_type = std::tuple_element_t>; __host__ __device__ void init() { @@ -255,11 +436,11 @@ namespace gpu_array __host__ __device__ void tuple_for_each(auto&& f) const { - std::apply([&f](const auto&... args) { (f(args), ...); }, data_); + detail::apply([&f](const auto&... args) { (f(args), ...); }, data_); } __host__ __device__ void tuple_for_each(auto&& f) { - std::apply([&f](auto&... args) { (f(args), ...); }, data_); + detail::apply([&f](auto&... args) { (f(args), ...); }, data_); } }; @@ -427,8 +608,8 @@ namespace gpu_array { return *(data() + base::size_ - 1); } - __host__ __device__ pointer data() noexcept { return std::get<0>(base::data_); } - __host__ __device__ const_pointer data() const noexcept { return std::get<0>(base::data_); } + __host__ __device__ pointer data() noexcept { return detail::get<0>(base::data_); } + __host__ __device__ const_pointer data() const noexcept { return detail::get<0>(base::data_); } array() = default; __host__ __device__ array(const array& r) : base(r) {} @@ -438,8 +619,8 @@ namespace gpu_array { if (base::size_ == 0) return; auto buf = std::make_unique(base::size_); - GPU_CHECK_ERROR( - api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type) * base::size_)); + GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&detail::get<0>(base::data_)), + sizeof(value_type) * base::size_)); assert(data() != nullptr); GPU_CHECK_ERROR(api::gpuMemcpy(data(), buf.get(), sizeof(value_type) * base::size_, gpuMemcpyHostToDevice)); } @@ -447,8 +628,8 @@ namespace gpu_array __host__ array(std::size_t size, default_init_tag) : base(size) { if (base::size_ == 0) return; - GPU_CHECK_ERROR( - api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type) * base::size_)); + GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&detail::get<0>(base::data_)), + sizeof(value_type) * base::size_)); assert(data() != nullptr); if constexpr (!std::is_trivially_default_constructible_v) { @@ -465,8 +646,8 @@ namespace gpu_array auto al = std::allocator(); auto buf = al.allocate(base::size_); std::ranges::uninitialized_fill(buf, buf + base::size_, value); - GPU_CHECK_ERROR( - api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type) * base::size_)); + GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&detail::get<0>(base::data_)), + sizeof(value_type) * base::size_)); assert(data() != nullptr); GPU_CHECK_ERROR(api::gpuMemcpy(data(), buf, sizeof(value_type) * base::size_, gpuMemcpyHostToDevice)); al.deallocate(buf, base::size_); @@ -481,8 +662,8 @@ namespace gpu_array { if (base::size_ == 0) return; - GPU_CHECK_ERROR( - api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type) * base::size_)); + GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&detail::get<0>(base::data_)), + sizeof(value_type) * base::size_)); assert(data() != nullptr); GPU_CHECK_ERROR( api::gpuMemcpy(data(), std::ranges::data(r), sizeof(value_type) * base::size_, gpuMemcpyHostToDevice)); @@ -499,8 +680,8 @@ namespace gpu_array auto buf = al.allocate(base::size_); for (auto i = std::size_t{0}; const auto& v : r) std::ranges::construct_at(buf + i++, v); - GPU_CHECK_ERROR( - api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type) * base::size_)); + GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&detail::get<0>(base::data_)), + sizeof(value_type) * base::size_)); assert(data() != nullptr); GPU_CHECK_ERROR(api::gpuMemcpy(data(), buf, sizeof(value_type) * base::size_, gpuMemcpyHostToDevice)); al.deallocate(buf, base::size_); @@ -509,8 +690,8 @@ namespace gpu_array __host__ array(std::initializer_list r) : base(std::ranges::size(r)) { if (base::size_ == 0) return; - GPU_CHECK_ERROR( - api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type) * base::size_)); + GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&detail::get<0>(base::data_)), + sizeof(value_type) * base::size_)); assert(data() != nullptr); GPU_CHECK_ERROR( api::gpuMemcpy(data(), std::ranges::data(r), sizeof(ValueType) * base::size_, gpuMemcpyHostToDevice)); @@ -710,7 +891,7 @@ namespace gpu_array { throw std::runtime_error("pointer type mismatch: expected device memory pointer"); } - std::get<0>(base::data_) = ptr; + detail::get<0>(base::data_) = ptr; } #endif #if defined(GPU_OVERLOAD_DEVICE) @@ -756,7 +937,7 @@ namespace gpu_array [](auto acc, const auto& r) { return acc + std::ranges::size(r); })) { if (base::size_ == 0) return; - GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), + GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&detail::get<0>(base::data_)), sizeof(ValueType) * base::size_)); assert(data() != nullptr); @@ -812,8 +993,8 @@ namespace gpu_array __host__ __device__ const_reference front() const noexcept { return *begin(); } __host__ __device__ reference back() noexcept { return *(data() + base::size_ - 1); } __host__ __device__ const_reference back() const noexcept { return *(data() + base::size_ - 1); } - __host__ __device__ pointer data() noexcept { return std::get<0>(base::data_); } - __host__ __device__ const_pointer data() const noexcept { return std::get<0>(base::data_); } + __host__ __device__ pointer data() noexcept { return detail::get<0>(base::data_); } + __host__ __device__ const_pointer data() const noexcept { return detail::get<0>(base::data_); } managed_array() = default; __host__ __device__ managed_array(const managed_array& r) : base(r) {} @@ -822,7 +1003,7 @@ namespace gpu_array __host__ explicit managed_array(std::size_t size) : base(size) { if (base::size_ == 0) return; - GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), + GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&detail::get<0>(base::data_)), sizeof(value_type) * base::size_)); assert(data() != nullptr); std::ranges::uninitialized_value_construct(*this); @@ -831,7 +1012,7 @@ namespace gpu_array __host__ explicit managed_array(std::size_t size, default_init_tag) : base(size) { if (base::size_ == 0) return; - GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), + GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&detail::get<0>(base::data_)), sizeof(value_type) * base::size_)); assert(data() != nullptr); std::ranges::uninitialized_default_construct(*this); @@ -840,7 +1021,7 @@ namespace gpu_array __host__ managed_array(std::size_t size, const value_type& value) : base(size) { if (base::size_ == 0) return; - GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), + GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&detail::get<0>(base::data_)), sizeof(value_type) * base::size_)); assert(data() != nullptr); std::ranges::uninitialized_fill(*this, value); @@ -851,7 +1032,7 @@ namespace gpu_array __host__ explicit managed_array(const T& r) : base(std::ranges::size(r)) { if (base::size_ == 0) return; - GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), + GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&detail::get<0>(base::data_)), sizeof(value_type) * base::size_)); assert(data() != nullptr); @@ -864,7 +1045,7 @@ namespace gpu_array __host__ managed_array(std::initializer_list r) : base(std::ranges::size(r)) { if (base::size_ == 0) return; - GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), + GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&detail::get<0>(base::data_)), sizeof(value_type) * base::size_)); assert(data() != nullptr); for (auto i = std::size_t{0}; const auto& v : r) std::ranges::construct_at(data() + i++, v); @@ -1070,7 +1251,7 @@ namespace gpu_array { throw std::runtime_error("pointer type mismatch: expected managed memory pointer"); } - std::get<0>(base::data_) = ptr; + detail::get<0>(base::data_) = ptr; } #endif #if defined(GPU_OVERLOAD_DEVICE) @@ -1125,7 +1306,7 @@ namespace gpu_array __host__ explicit value(default_init_tag) : base(1) { - GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type))); + GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&detail::get<0>(base::data_)), sizeof(value_type))); assert(get() != nullptr); if constexpr (!std::is_trivially_default_constructible_v) { @@ -1136,7 +1317,7 @@ namespace gpu_array __host__ explicit value(const value_type& r) : base(1) { - GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type))); + GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&detail::get<0>(base::data_)), sizeof(value_type))); assert(get() != nullptr); GPU_CHECK_ERROR(api::gpuMemcpy(get(), &r, sizeof(value_type), gpuMemcpyHostToDevice)); } @@ -1146,7 +1327,7 @@ namespace gpu_array __host__ explicit value(Args&&... args) : base(1) { auto temp = value_type(std::forward(args)...); - GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type))); + GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&detail::get<0>(base::data_)), sizeof(value_type))); assert(get() != nullptr); GPU_CHECK_ERROR(api::gpuMemcpy(get(), &temp, sizeof(value_type), gpuMemcpyHostToDevice)); } @@ -1162,7 +1343,7 @@ namespace gpu_array { throw std::runtime_error("pointer type mismatch: expected device memory pointer"); } - std::get<0>(base::data_) = ptr; + detail::get<0>(base::data_) = ptr; } #else : base(ptr, ptr == nullptr ? 0 : 1) @@ -1181,8 +1362,8 @@ namespace gpu_array return *this; } - __host__ __device__ pointer get() const noexcept { return std::get<0>(base::data_); } - __host__ __device__ explicit operator bool() const noexcept { return std::get<0>(base::data_) != nullptr; } + __host__ __device__ pointer get() const noexcept { return detail::get<0>(base::data_); } + __host__ __device__ explicit operator bool() const noexcept { return detail::get<0>(base::data_) != nullptr; } #if defined(GPU_OVERLOAD_DEVICE) __device__ reference operator*() const noexcept @@ -1252,7 +1433,7 @@ namespace gpu_array __host__ explicit managed_value(default_init_tag) : base(1) { GPU_CHECK_ERROR( - api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), sizeof(ValueType))); + api::gpuMallocManaged(reinterpret_cast(&detail::get<0>(base::data_)), sizeof(ValueType))); assert(get() != nullptr); std::ranges::uninitialized_default_construct_n(get(), 1); } @@ -1260,7 +1441,7 @@ namespace gpu_array __host__ explicit managed_value(const ValueType& r) : base(1) { GPU_CHECK_ERROR( - api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), sizeof(ValueType))); + api::gpuMallocManaged(reinterpret_cast(&detail::get<0>(base::data_)), sizeof(ValueType))); assert(get() != nullptr); std::ranges::construct_at(get(), r); } @@ -1268,7 +1449,7 @@ namespace gpu_array __host__ explicit managed_value(ValueType&& r) : base(1) { GPU_CHECK_ERROR( - api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), sizeof(ValueType))); + api::gpuMallocManaged(reinterpret_cast(&detail::get<0>(base::data_)), sizeof(ValueType))); assert(get() != nullptr); std::ranges::construct_at(get(), std::move(r)); } @@ -1278,7 +1459,7 @@ namespace gpu_array __host__ explicit managed_value(Args&&... args) : base(1) { GPU_CHECK_ERROR( - api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), sizeof(ValueType))); + api::gpuMallocManaged(reinterpret_cast(&detail::get<0>(base::data_)), sizeof(ValueType))); assert(get() != nullptr); std::ranges::construct_at(get(), std::forward(args)...); } @@ -1294,7 +1475,7 @@ namespace gpu_array { throw std::runtime_error("pointer type mismatch: expected managed memory pointer"); } - std::get<0>(base::data_) = ptr; + detail::get<0>(base::data_) = ptr; } #else : base(ptr, ptr == nullptr ? 0 : 1) @@ -1324,8 +1505,8 @@ namespace gpu_array return get(); } - __host__ __device__ pointer get() const noexcept { return std::get<0>(base::data_); } - __host__ __device__ explicit operator bool() const noexcept { return std::get<0>(base::data_) != nullptr; } + __host__ __device__ pointer get() const noexcept { return detail::get<0>(base::data_); } + __host__ __device__ explicit operator bool() const noexcept { return detail::get<0>(base::data_) != nullptr; } __host__ void prefetch(int device_id, api::gpuStream_t stream = 0, bool recursive = true) const { @@ -1376,10 +1557,10 @@ namespace gpu_array template constexpr bool assignable_to_tuple_helper_n() { - return requires(const Tuple& t1, std::tuple& t2) { - std::get(t1); - std::get(t2); - requires std::assignable_from(t2)), decltype(std::get(t1))>; + return requires(const Tuple& t1, detail::tuple& t2) { + detail::get(t1); + detail::get(t2); + requires std::assignable_from(t2)), decltype(detail::get(t1))>; }; } template @@ -1396,7 +1577,7 @@ namespace gpu_array template