diff --git a/include/gpu_array.hpp b/include/gpu_array.hpp index aa56b8a..e9fd8a8 100644 --- a/include/gpu_array.hpp +++ b/include/gpu_array.hpp @@ -48,6 +48,197 @@ #define SIGSEGV_DEPRECATED [[deprecated("Cannot access GPU memory directly")]] #endif +namespace gpu_array +{ + // Custom implementation of tuple for device code + + namespace detail + { + template + struct tuple_leaf + { + tuple_leaf() + requires std::default_initializable + = default; + template + __host__ __device__ tuple_leaf(U&& u) : value(std::forward(u)) + { + } + using type = T; + T value; + }; + + template + tuple_leaf at_index(const tuple_leaf&); // undefined + + template + struct tuple_impl; + + template + struct tuple_impl, Ts...> : tuple_leaf... + { + tuple_impl() + requires (std::default_initializable && ...) + = default; + template + requires (sizeof...(Us) == sizeof...(Ts)) + __host__ __device__ tuple_impl(Us&&... us) : tuple_leaf(std::forward(us))... + { + } + }; + + template + struct tuple + { + tuple() + requires (std::default_initializable && ...) + = default; + + template + struct is_single_tuple : std::false_type + { + }; + template + struct is_single_tuple> : std::true_type + { + }; + template + requires (sizeof...(Us) == sizeof...(Ts) && !is_single_tuple...>::value) + __host__ __device__ tuple(Us&&... us) : base_(std::forward(us)...) + { + } + + template + requires (sizeof...(Us) == sizeof...(Ts)) + __host__ __device__ tuple(const tuple& t) : tuple{tuple_convert(t, std::index_sequence_for{})} + { + } + template + requires (sizeof...(Us) == sizeof...(Ts)) + __host__ __device__ tuple(tuple&& t) + : tuple{tuple_convert(std::move(t), std::index_sequence_for{})} + { + } + template + requires (sizeof...(Us) == sizeof...(Ts)) + __host__ __device__ auto& operator=(const tuple& t) + { + *this = tuple_convert(t, std::index_sequence_for{}); + return *this; + } + template + requires (sizeof...(Us) == sizeof...(Ts)) + __host__ __device__ auto& operator=(tuple&& t) + { + *this = tuple_convert(std::move(t), std::index_sequence_for{}); + return *this; + } + + template + __host__ __device__ friend auto& get(tuple&); + template + __host__ __device__ friend const auto& get(const tuple&); + template + __host__ __device__ friend auto&& get(tuple&&); + template + __host__ __device__ friend const auto&& get(const tuple&&); + + private: + template + static auto tuple_convert(const tuple& t, std::index_sequence) + { + return tuple{get(t)...}; + } + template + static auto tuple_convert(tuple&& t, std::index_sequence) + { + return tuple{std::move(get(t))...}; + } + + using base = tuple_impl, Ts...>; + base base_; + }; + + template + tuple(Ts...) -> tuple; + + template + __host__ __device__ auto& get(tuple& t) + { + using leaf = decltype(at_index(t.base_)); + return static_cast(t.base_).value; + } + template + __host__ __device__ const auto& get(const tuple& t) + { + using leaf = decltype(at_index(t.base_)); + return static_cast(t.base_).value; + } + template + __host__ __device__ auto&& get(tuple&& t) + { + using leaf = decltype(at_index(t.base_)); + return static_cast(static_cast(t.base_).value); + } + template + __host__ __device__ const auto&& get(const tuple&& t) + { + using leaf = decltype(at_index(t.base_)); + return static_cast(static_cast(t.base_).value); + } + + template + __host__ __device__ bool tuple_equal_impl(const tuple& t, const tuple& u, + std::index_sequence) + { + return ((get(t) == get(u)) && ...); + } + template + requires (sizeof...(Ts) == sizeof...(Us)) + __host__ __device__ bool operator==(const tuple& t, const tuple& u) + { + return tuple_equal_impl(t, u, std::index_sequence_for{}); + } + + template + requires requires { std::declval()(get(std::declval())...); } + __host__ __device__ decltype(auto) apply_impl(F&& f, Tuple&& t, std::index_sequence) + { + return std::forward(f)(get(std::forward(t))...); + } + template + requires requires { std::tuple_size_v>; } + __host__ __device__ decltype(auto) apply(F&& f, Tuple&& t) + { + return apply_impl(std::forward(f), std::forward(t), + std::make_index_sequence>>{}); + } + } // namespace detail + + using detail::tuple, detail::get, detail::apply; +} // namespace gpu_array + +template +struct std::tuple_size> : std::integral_constant +{ +}; +template +struct std::tuple_element> : std::tuple_element> +{ +}; +template +requires requires { typename gpu_array::tuple...>; } +struct std::common_type, gpu_array::tuple> +{ + using type = gpu_array::tuple...>; +}; +template class TQual, template class UQual> +requires requires { typename gpu_array::tuple, UQual>...>; } +struct std::basic_common_reference, gpu_array::tuple, TQual, UQual> +{ + using type = gpu_array::tuple, UQual>...>; +}; + namespace gpu_array { #if defined(GPU_USE_32BIT_SIZE_TYPE_DEFAULT) @@ -120,11 +311,11 @@ namespace gpu_array protected: size_type size_ = 0U; - std::tuple data_; + gpu_array::tuple data_; std::uint32_t* ref_count_ = nullptr; // reference counter, not used on GPU template - using element_type = std::tuple_element_t>; + using element_type = std::tuple_element_t>; __host__ __device__ void init() { @@ -255,11 +446,11 @@ namespace gpu_array __host__ __device__ void tuple_for_each(auto&& f) const { - std::apply([&f](const auto&... args) { (f(args), ...); }, data_); + gpu_array::apply([&f](const auto&... args) { (f(args), ...); }, data_); } __host__ __device__ void tuple_for_each(auto&& f) { - std::apply([&f](auto&... args) { (f(args), ...); }, data_); + gpu_array::apply([&f](auto&... args) { (f(args), ...); }, data_); } }; @@ -427,8 +618,8 @@ namespace gpu_array { return *(data() + base::size_ - 1); } - __host__ __device__ pointer data() noexcept { return std::get<0>(base::data_); } - __host__ __device__ const_pointer data() const noexcept { return std::get<0>(base::data_); } + __host__ __device__ pointer data() noexcept { return gpu_array::get<0>(base::data_); } + __host__ __device__ const_pointer data() const noexcept { return gpu_array::get<0>(base::data_); } array() = default; __host__ __device__ array(const array& r) : base(r) {} @@ -438,8 +629,8 @@ namespace gpu_array { if (base::size_ == 0) return; auto buf = std::make_unique(base::size_); - GPU_CHECK_ERROR( - api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type) * base::size_)); + GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&gpu_array::get<0>(base::data_)), + sizeof(value_type) * base::size_)); assert(data() != nullptr); GPU_CHECK_ERROR(api::gpuMemcpy(data(), buf.get(), sizeof(value_type) * base::size_, gpuMemcpyHostToDevice)); } @@ -447,8 +638,8 @@ namespace gpu_array __host__ array(std::size_t size, default_init_tag) : base(size) { if (base::size_ == 0) return; - GPU_CHECK_ERROR( - api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type) * base::size_)); + GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&gpu_array::get<0>(base::data_)), + sizeof(value_type) * base::size_)); assert(data() != nullptr); if constexpr (!std::is_trivially_default_constructible_v) { @@ -465,8 +656,8 @@ namespace gpu_array auto al = std::allocator(); auto buf = al.allocate(base::size_); std::ranges::uninitialized_fill(buf, buf + base::size_, value); - GPU_CHECK_ERROR( - api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type) * base::size_)); + GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&gpu_array::get<0>(base::data_)), + sizeof(value_type) * base::size_)); assert(data() != nullptr); GPU_CHECK_ERROR(api::gpuMemcpy(data(), buf, sizeof(value_type) * base::size_, gpuMemcpyHostToDevice)); al.deallocate(buf, base::size_); @@ -481,8 +672,8 @@ namespace gpu_array { if (base::size_ == 0) return; - GPU_CHECK_ERROR( - api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type) * base::size_)); + GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&gpu_array::get<0>(base::data_)), + sizeof(value_type) * base::size_)); assert(data() != nullptr); GPU_CHECK_ERROR( api::gpuMemcpy(data(), std::ranges::data(r), sizeof(value_type) * base::size_, gpuMemcpyHostToDevice)); @@ -499,8 +690,8 @@ namespace gpu_array auto buf = al.allocate(base::size_); for (auto i = std::size_t{0}; const auto& v : r) std::ranges::construct_at(buf + i++, v); - GPU_CHECK_ERROR( - api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type) * base::size_)); + GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&gpu_array::get<0>(base::data_)), + sizeof(value_type) * base::size_)); assert(data() != nullptr); GPU_CHECK_ERROR(api::gpuMemcpy(data(), buf, sizeof(value_type) * base::size_, gpuMemcpyHostToDevice)); al.deallocate(buf, base::size_); @@ -509,8 +700,8 @@ namespace gpu_array __host__ array(std::initializer_list r) : base(std::ranges::size(r)) { if (base::size_ == 0) return; - GPU_CHECK_ERROR( - api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type) * base::size_)); + GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&gpu_array::get<0>(base::data_)), + sizeof(value_type) * base::size_)); assert(data() != nullptr); GPU_CHECK_ERROR( api::gpuMemcpy(data(), std::ranges::data(r), sizeof(ValueType) * base::size_, gpuMemcpyHostToDevice)); @@ -710,7 +901,7 @@ namespace gpu_array { throw std::runtime_error("pointer type mismatch: expected device memory pointer"); } - std::get<0>(base::data_) = ptr; + gpu_array::get<0>(base::data_) = ptr; } #endif #if defined(GPU_OVERLOAD_DEVICE) @@ -756,7 +947,7 @@ namespace gpu_array [](auto acc, const auto& r) { return acc + std::ranges::size(r); })) { if (base::size_ == 0) return; - GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), + GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(ValueType) * base::size_)); assert(data() != nullptr); @@ -812,8 +1003,8 @@ namespace gpu_array __host__ __device__ const_reference front() const noexcept { return *begin(); } __host__ __device__ reference back() noexcept { return *(data() + base::size_ - 1); } __host__ __device__ const_reference back() const noexcept { return *(data() + base::size_ - 1); } - __host__ __device__ pointer data() noexcept { return std::get<0>(base::data_); } - __host__ __device__ const_pointer data() const noexcept { return std::get<0>(base::data_); } + __host__ __device__ pointer data() noexcept { return gpu_array::get<0>(base::data_); } + __host__ __device__ const_pointer data() const noexcept { return gpu_array::get<0>(base::data_); } managed_array() = default; __host__ __device__ managed_array(const managed_array& r) : base(r) {} @@ -822,7 +1013,7 @@ namespace gpu_array __host__ explicit managed_array(std::size_t size) : base(size) { if (base::size_ == 0) return; - GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), + GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(value_type) * base::size_)); assert(data() != nullptr); std::ranges::uninitialized_value_construct(*this); @@ -831,7 +1022,7 @@ namespace gpu_array __host__ explicit managed_array(std::size_t size, default_init_tag) : base(size) { if (base::size_ == 0) return; - GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), + GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(value_type) * base::size_)); assert(data() != nullptr); std::ranges::uninitialized_default_construct(*this); @@ -840,7 +1031,7 @@ namespace gpu_array __host__ managed_array(std::size_t size, const value_type& value) : base(size) { if (base::size_ == 0) return; - GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), + GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(value_type) * base::size_)); assert(data() != nullptr); std::ranges::uninitialized_fill(*this, value); @@ -851,7 +1042,7 @@ namespace gpu_array __host__ explicit managed_array(const T& r) : base(std::ranges::size(r)) { if (base::size_ == 0) return; - GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), + GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(value_type) * base::size_)); assert(data() != nullptr); @@ -864,7 +1055,7 @@ namespace gpu_array __host__ managed_array(std::initializer_list r) : base(std::ranges::size(r)) { if (base::size_ == 0) return; - GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), + GPU_CHECK_ERROR(api::gpuMallocManaged(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(value_type) * base::size_)); assert(data() != nullptr); for (auto i = std::size_t{0}; const auto& v : r) std::ranges::construct_at(data() + i++, v); @@ -1070,7 +1261,7 @@ namespace gpu_array { throw std::runtime_error("pointer type mismatch: expected managed memory pointer"); } - std::get<0>(base::data_) = ptr; + gpu_array::get<0>(base::data_) = ptr; } #endif #if defined(GPU_OVERLOAD_DEVICE) @@ -1125,7 +1316,8 @@ namespace gpu_array __host__ explicit value(default_init_tag) : base(1) { - GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type))); + GPU_CHECK_ERROR( + api::gpuMalloc(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(value_type))); assert(get() != nullptr); if constexpr (!std::is_trivially_default_constructible_v) { @@ -1136,7 +1328,8 @@ namespace gpu_array __host__ explicit value(const value_type& r) : base(1) { - GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type))); + GPU_CHECK_ERROR( + api::gpuMalloc(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(value_type))); assert(get() != nullptr); GPU_CHECK_ERROR(api::gpuMemcpy(get(), &r, sizeof(value_type), gpuMemcpyHostToDevice)); } @@ -1146,7 +1339,8 @@ namespace gpu_array __host__ explicit value(Args&&... args) : base(1) { auto temp = value_type(std::forward(args)...); - GPU_CHECK_ERROR(api::gpuMalloc(reinterpret_cast(&std::get<0>(base::data_)), sizeof(value_type))); + GPU_CHECK_ERROR( + api::gpuMalloc(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(value_type))); assert(get() != nullptr); GPU_CHECK_ERROR(api::gpuMemcpy(get(), &temp, sizeof(value_type), gpuMemcpyHostToDevice)); } @@ -1162,7 +1356,7 @@ namespace gpu_array { throw std::runtime_error("pointer type mismatch: expected device memory pointer"); } - std::get<0>(base::data_) = ptr; + gpu_array::get<0>(base::data_) = ptr; } #else : base(ptr, ptr == nullptr ? 0 : 1) @@ -1181,8 +1375,11 @@ namespace gpu_array return *this; } - __host__ __device__ pointer get() const noexcept { return std::get<0>(base::data_); } - __host__ __device__ explicit operator bool() const noexcept { return std::get<0>(base::data_) != nullptr; } + __host__ __device__ pointer get() const noexcept { return gpu_array::get<0>(base::data_); } + __host__ __device__ explicit operator bool() const noexcept + { + return gpu_array::get<0>(base::data_) != nullptr; + } #if defined(GPU_OVERLOAD_DEVICE) __device__ reference operator*() const noexcept @@ -1252,7 +1449,7 @@ namespace gpu_array __host__ explicit managed_value(default_init_tag) : base(1) { GPU_CHECK_ERROR( - api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), sizeof(ValueType))); + api::gpuMallocManaged(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(ValueType))); assert(get() != nullptr); std::ranges::uninitialized_default_construct_n(get(), 1); } @@ -1260,7 +1457,7 @@ namespace gpu_array __host__ explicit managed_value(const ValueType& r) : base(1) { GPU_CHECK_ERROR( - api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), sizeof(ValueType))); + api::gpuMallocManaged(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(ValueType))); assert(get() != nullptr); std::ranges::construct_at(get(), r); } @@ -1268,7 +1465,7 @@ namespace gpu_array __host__ explicit managed_value(ValueType&& r) : base(1) { GPU_CHECK_ERROR( - api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), sizeof(ValueType))); + api::gpuMallocManaged(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(ValueType))); assert(get() != nullptr); std::ranges::construct_at(get(), std::move(r)); } @@ -1278,7 +1475,7 @@ namespace gpu_array __host__ explicit managed_value(Args&&... args) : base(1) { GPU_CHECK_ERROR( - api::gpuMallocManaged(reinterpret_cast(&std::get<0>(base::data_)), sizeof(ValueType))); + api::gpuMallocManaged(reinterpret_cast(&gpu_array::get<0>(base::data_)), sizeof(ValueType))); assert(get() != nullptr); std::ranges::construct_at(get(), std::forward(args)...); } @@ -1294,7 +1491,7 @@ namespace gpu_array { throw std::runtime_error("pointer type mismatch: expected managed memory pointer"); } - std::get<0>(base::data_) = ptr; + gpu_array::get<0>(base::data_) = ptr; } #else : base(ptr, ptr == nullptr ? 0 : 1) @@ -1324,8 +1521,11 @@ namespace gpu_array return get(); } - __host__ __device__ pointer get() const noexcept { return std::get<0>(base::data_); } - __host__ __device__ explicit operator bool() const noexcept { return std::get<0>(base::data_) != nullptr; } + __host__ __device__ pointer get() const noexcept { return gpu_array::get<0>(base::data_); } + __host__ __device__ explicit operator bool() const noexcept + { + return gpu_array::get<0>(base::data_) != nullptr; + } __host__ void prefetch(int device_id, api::gpuStream_t stream = 0, bool recursive = true) const { @@ -1376,10 +1576,10 @@ namespace gpu_array template constexpr bool assignable_to_tuple_helper_n() { - return requires(const Tuple& t1, std::tuple& t2) { - std::get(t1); - std::get(t2); - requires std::assignable_from(t2)), decltype(std::get(t1))>; + return requires(const Tuple& t1, gpu_array::tuple& t2) { + gpu_array::get(t1); + gpu_array::get(t2); + requires std::assignable_from(t2)), decltype(gpu_array::get(t1))>; }; } template @@ -1396,7 +1596,7 @@ namespace gpu_array template