From 20eea2c9ab6d9f994d1a57cf167344570365f4fc Mon Sep 17 00:00:00 2001 From: fjbarter Date: Mon, 16 Mar 2026 01:06:06 +0000 Subject: [PATCH 01/18] on-device rand! with three options for stateless counter-based RNG --- prototype/rand/Project.toml | 6 ++ prototype/rand/test_rand.jl | 74 +++++++++++++++++++ src/AcceleratedKernels.jl | 1 + src/rand/philox.jl | 37 ++++++++++ src/rand/rand.jl | 137 ++++++++++++++++++++++++++++++++++++ src/rand/splitmix64.jl | 26 +++++++ src/rand/threefry.jl | 49 +++++++++++++ src/rand/utilities.jl | 36 ++++++++++ test/rand.jl | 132 ++++++++++++++++++++++++++++++++++ test/runtests.jl | 1 + 10 files changed, 499 insertions(+) create mode 100644 prototype/rand/Project.toml create mode 100644 prototype/rand/test_rand.jl create mode 100644 src/rand/philox.jl create mode 100644 src/rand/rand.jl create mode 100644 src/rand/splitmix64.jl create mode 100644 src/rand/threefry.jl create mode 100644 src/rand/utilities.jl create mode 100644 test/rand.jl diff --git a/prototype/rand/Project.toml b/prototype/rand/Project.toml new file mode 100644 index 0000000..7e92c89 --- /dev/null +++ b/prototype/rand/Project.toml @@ -0,0 +1,6 @@ +[deps] +AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +PProf = "e4faabce-9ead-11e9-39d9-4379958e3056" +Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" diff --git a/prototype/rand/test_rand.jl b/prototype/rand/test_rand.jl new file mode 100644 index 0000000..329214d --- /dev/null +++ b/prototype/rand/test_rand.jl @@ -0,0 +1,74 @@ +using BenchmarkTools +using CUDA + +import AcceleratedKernels as AK + + +const N = 100_000_000 +const GPU_BLOCK_SIZE = 256 + +const RNG_SPLITMIX = AK.CounterRNG(0x12345678; alg=AK.SplitMix64()) +const RNG_PHILOX = AK.CounterRNG(0x12345678; alg=AK.Philox()) +const RNG_THREEFRY = AK.CounterRNG(0x12345678; alg=AK.Threefry()) + +x_cuda = CuArray{Float32}(undef, N) +x_splitmix = 
CuArray{Float32}(undef, N) +x_philox = CuArray{Float32}(undef, N) +x_threefry = CuArray{Float32}(undef, N) +x_cpu = Vector{Float32}(undef, N) + + +function run_cuda_rand!(x) + CUDA.rand!(x) + CUDA.synchronize() + return x +end + + +function run_ak_rand_gpu!(rng, x) + AK.rand!(rng, x; block_size=GPU_BLOCK_SIZE) + AK.synchronize(AK.get_backend(x)) + return x +end + + +function run_ak_rand_cpu!(rng, x) + AK.rand!(rng, x) + return x +end + + +# Julia base rand() gives [0, 1) and so does EVERYTHING ELSE EVER! but CuRAND gives (0, 1] ... +is_unit_interval(v) = all(x -> 0.0f0 <= x <= 1.0f0, v) + +# warmup compile +run_cuda_rand!(x_cuda) +run_ak_rand_gpu!(RNG_SPLITMIX, x_splitmix) +run_ak_rand_gpu!(RNG_PHILOX, x_philox) +run_ak_rand_gpu!(RNG_THREEFRY, x_threefry) +run_ak_rand_cpu!(RNG_SPLITMIX, x_cpu) + +@assert is_unit_interval(Array(x_cuda)) +@assert is_unit_interval(Array(x_splitmix)) +@assert is_unit_interval(Array(x_philox)) +@assert is_unit_interval(Array(x_threefry)) +@assert is_unit_interval(x_cpu) + +println("N = ", N) +println("CPU threads: ", Threads.nthreads()) + +println("\nCUDA.rand! benchmark (CuArray{Float32}, in-place)") +display(@benchmark run_cuda_rand!($x_cuda)) + +println("\nAK.rand! SplitMix64 benchmark (GPU, CuArray{Float32})") +display(@benchmark run_ak_rand_gpu!($RNG_SPLITMIX, $x_splitmix)) + +println("\nAK.rand! Philox benchmark (GPU, CuArray{Float32})") +display(@benchmark run_ak_rand_gpu!($RNG_PHILOX, $x_philox)) + +println("\nAK.rand! Threefry benchmark (GPU, CuArray{Float32})") +display(@benchmark run_ak_rand_gpu!($RNG_THREEFRY, $x_threefry)) + +println("\nAK.rand! 
benchmark (CPU, Vector{Float32}, SplitMix64)") +display(@benchmark run_ak_rand_cpu!($RNG_SPLITMIX, $x_cpu)) + diff --git a/src/AcceleratedKernels.jl b/src/AcceleratedKernels.jl index d662c2a..06a401a 100644 --- a/src/AcceleratedKernels.jl +++ b/src/AcceleratedKernels.jl @@ -31,6 +31,7 @@ include("map.jl") include("sort/sort.jl") include("reduce/reduce.jl") include("accumulate/accumulate.jl") +include("rand/rand.jl") include("searchsorted.jl") include("predicates.jl") include("arithmetics.jl") diff --git a/src/rand/philox.jl b/src/rand/philox.jl new file mode 100644 index 0000000..2b32d95 --- /dev/null +++ b/src/rand/philox.jl @@ -0,0 +1,37 @@ +struct Philox <: CounterRNGAlgorithm end + + +# Philox magic numbers +const PHILOX_M0 = UInt32(0xD256D193) +const PHILOX_W0 = UInt32(0x9E3779B9) +const PHILOX_ROUNDS = 10 + + +# Each round destroys x0 with multiplication, addition, and XORs +@inline function _philox2x32_round(x0::UInt32, x1::UInt32, k0::UInt32) + lo = PHILOX_M0 * x0 + hi = _mulhi_u32(PHILOX_M0, x0) + y0 = xor(xor(hi, k0), x1) + y1 = lo + return y0, y1 +end + + +""" + rand_uint32(rng::CounterRNG{<:Unsigned, Philox}, counter::UInt64) -> UInt32 +""" +@inline function rand_uint32(rng::CounterRNG{<:Unsigned, Philox}, counter::UInt64)::UInt32 + x0 = _u32_lo(counter) + x1 = _u32_hi(counter) + + seed = UInt64(rng.seed) + k0 = _u32_lo(seed) + x1 = xor(x1, _u32_hi(seed)) + + @inbounds for _ in 1:PHILOX_ROUNDS + x0, x1 = _philox2x32_round(x0, x1, k0) + k0 += PHILOX_W0 + end + + return x0 +end diff --git a/src/rand/rand.jl b/src/rand/rand.jl new file mode 100644 index 0000000..07952e1 --- /dev/null +++ b/src/rand/rand.jl @@ -0,0 +1,137 @@ +""" + abstract type AbstractCounterRNG end + abstract type CounterRNGAlgorithm end + +RNG interface for counter-based random generation with AcceleratedKernels. 
+""" + +abstract type AbstractCounterRNG end +abstract type CounterRNGAlgorithm end + + +""" + CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=SplitMix64()) + +Stateless counter-based RNG configuration for [`rand!`](@ref). + +`CounterRNG` is immutable and does not hold mutable thread-local or global state. Each generated +value is a pure function of: +- `seed` +- logical linear element index +- algorithm (`alg`) + +The default algorithm is `Philox()`. +""" +struct CounterRNG{K <: Unsigned, A <: CounterRNGAlgorithm} <: AbstractCounterRNG + seed::K + alg::A +end + + +function CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox()) + @argcheck seed >= 0 + CounterRNG(UInt64(seed), alg) +end + + + + +# Shared helpers +include("utilities.jl") + +# Algorithm-specific integer generators +include("splitmix64.jl") +include("philox.jl") +include("threefry.jl") + + + + + +function _rand_fill_threads!( + rng::AbstractCounterRNG, + x::AbstractArray{Float32}; + max_tasks::Int, + min_elems::Int, +) + task_partition(length(x), max_tasks, min_elems) do irange + @inbounds for i in irange + counter = _counter_from_index(i) + x[i] = uint32_to_unit_float32(rand_uint32(rng, counter)) + end + end + return x +end + + +@kernel inbounds=true cpu=false unsafe_indices=true function _rand_fill_kernel!( + rng, + x, +) + i = @index(Global, Linear) + if i <= length(x) + counter = _counter_from_index(i) + x[i] = uint32_to_unit_float32(rand_uint32(rng, counter)) + end +end + + +function _rand_fill_gpu!( + rng::AbstractCounterRNG, + x::AbstractArray{Float32}, + backend::Backend; + block_size::Int, +) + @argcheck block_size > 0 + len = length(x) + len == 0 && return x + + blocks = div(len, block_size, RoundUp) + kernel! 
= _rand_fill_kernel!(backend, block_size) + kernel!(rng, x, ndrange=(blocks * block_size,)) + return x +end + + +""" + rand!( + rng::AbstractCounterRNG, + x::AbstractArray{Float32}, + backend::Backend=get_backend(x); + + # CPU settings + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + + # Implementation choice + prefer_threads::Bool=true, + + # GPU settings + block_size::Int=256, + ) + +Fill `x` in-place with pseudo-random `Float32` values in `[0, 1)` using a stateless counter-based +RNG. For `x[i]`, the counter is exactly `UInt64(i - 1)` in linear indexing order. + +The float conversion is mantissa-based: uniform over the produced mantissa grid, not over all +representable `Float32` values in `[0, 1)`. +""" +function rand!( + rng::AbstractCounterRNG, + x::AbstractArray{Float32}, + backend::Backend=get_backend(x); + + # CPU settings + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + prefer_threads::Bool=true, + + # GPU settings + block_size::Int=256, +) + if use_gpu_algorithm(backend, prefer_threads) + _rand_fill_gpu!(rng, x, backend; block_size) + else + _rand_fill_threads!(rng, x; max_tasks, min_elems) + end +end diff --git a/src/rand/splitmix64.jl b/src/rand/splitmix64.jl new file mode 100644 index 0000000..165f72f --- /dev/null +++ b/src/rand/splitmix64.jl @@ -0,0 +1,26 @@ +struct SplitMix64 <: CounterRNGAlgorithm end + +# SplitMix64 magic numbers +const SPLITMIX64_INCREMENT = UInt64(0x9e3779b97f4a7c15) +const SPLITMIX64_MIX_A = UInt64(0xbf58476d1ce4e5b9) +const SPLITMIX64_MIX_B = UInt64(0x94d049bb133111eb) + + +@inline function _splitmix64_mix(x::UInt64)::UInt64 + x = xor(x, x >> 30) + x *= SPLITMIX64_MIX_A + x = xor(x, x >> 27) + x *= SPLITMIX64_MIX_B + x = xor(x, x >> 31) + return x +end + + +""" + rand_uint32(rng::CounterRNG{<:Unsigned, SplitMix64}, counter::UInt64) -> UInt32 +""" +@inline function rand_uint32(rng::CounterRNG{<:Unsigned, SplitMix64}, counter::UInt64)::UInt32 + seed = UInt64(rng.seed) + mixed = _splitmix64_mix(counter + seed 
+ SPLITMIX64_INCREMENT) + return UInt32(mixed >> 32) +end diff --git a/src/rand/threefry.jl b/src/rand/threefry.jl new file mode 100644 index 0000000..0a65dc3 --- /dev/null +++ b/src/rand/threefry.jl @@ -0,0 +1,49 @@ +struct Threefry <: CounterRNGAlgorithm end + +# Threefry magic numbers +const THREEFRY_PARITY = UInt32(0x1BD11BDA) +const THREEFRY_ROTATIONS = ( + UInt32(13), UInt32(15), UInt32(26), UInt32(6), + UInt32(17), UInt32(29), UInt32(16), UInt32(24), +) +const THREEFRY_ROUNDS = 20 + + +@inline function _threefry_key_word(k0::UInt32, k1::UInt32, k2::UInt32, idx::Int)::UInt32 + idx == 0 && return k0 + idx == 1 && return k1 + return k2 +end + + +""" + rand_uint32(rng::CounterRNG{<:Unsigned, Threefry}, counter::UInt64) -> UInt32 +""" +@inline function rand_uint32(rng::CounterRNG{<:Unsigned, Threefry}, counter::UInt64)::UInt32 + x0 = _u32_lo(counter) + x1 = _u32_hi(counter) + + seed = UInt64(rng.seed) + k0 = _u32_lo(seed) + k1 = _u32_hi(seed) + k2 = xor(THREEFRY_PARITY, xor(k0, k1)) + + x0 += k0 + x1 += k1 + + @inbounds for round in 0:(THREEFRY_ROUNDS - 1) + rot = THREEFRY_ROTATIONS[(round & 0x7) + 1] + x0 += x1 + x1 = xor(_rotl32(x1, rot), x0) + + if (round & 0x3) == 3 + s = (round >>> 2) + 1 + i0 = s % 3 + i1 = (s + 1) % 3 + x0 += _threefry_key_word(k0, k1, k2, i0) + x1 += _threefry_key_word(k0, k1, k2, i1) + UInt32(s) + end + end + + return x0 +end diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl new file mode 100644 index 0000000..35b72f0 --- /dev/null +++ b/src/rand/utilities.jl @@ -0,0 +1,36 @@ +# lo: rightmost 32 bits, hi: leftmost 32 bits +@inline _u32_lo(x::UInt64)::UInt32 = UInt32(x & UInt64(0xffffffff)) +@inline _u32_hi(x::UInt64)::UInt32 = UInt32(x >> 32) + +# leftmost 32 bits of a*b cast to UInt64s +@inline _mulhi_u32(a::UInt32, b::UInt32)::UInt32 = UInt32((UInt64(a) * UInt64(b)) >> 32) + + +@inline function _rotl32(x::UInt32, r::UInt32)::UInt32 + return (x << r) | (x >> (UInt32(32) - r)) +end + + +@inline _counter_from_index(i)::UInt64 = 
UInt64(i - one(i)) + + +@inline function rand_uint32(::AbstractCounterRNG, ::UInt64)::UInt32 + # Unrecognised AbstractCounterRNG + throw(ArgumentError("No rand_uint32 implementation for this RNG type")) +end + + + +""" + uint32_to_unit_float32(u::UInt32) -> Float32 + +Convert a random `UInt32` to `Float32` in `[0, 1)` by mantissa construction. +""" +@inline function uint32_to_unit_float32(u::UInt32)::Float32 + # Keep 23 random bits for the mantissa (drop 9 rightmost bits from the UInt32) + # and combine with the bit pattern of 1.0f0 (sign=0, exponent=127). + bits = UInt32(0x3f800000) | (u >> 9) + + # Interpret as 1.mantissa, then subtract 1 for [0, 1) + return reinterpret(Float32, bits) - 1.0f0 +end diff --git a/test/rand.jl b/test/rand.jl new file mode 100644 index 0000000..7fefdff --- /dev/null +++ b/test/rand.jl @@ -0,0 +1,132 @@ +function _is_unit_interval(v) + for x in v + if isnan(x) || x < 0.0f0 || x >= 1.0f0 + return false + end + end + return true +end + +function _rand_fill_reference!(rng, x::AbstractArray{Float32}) + @inbounds for i in eachindex(x) + counter = UInt64(i - one(i)) + x[i] = AK.uint32_to_unit_float32(AK.rand_uint32(rng, counter)) + end + return x +end + +@testset "rand" begin + @test AK.CounterRNG{AK.SplitMix64}(0x1) isa AK.CounterRNG{AK.SplitMix64, UInt64} + @test AK.CounterRNG{AK.Philox}(UInt32(0x1)) isa AK.CounterRNG{AK.Philox, UInt32} + @test AK.CounterRNG{AK.Threefry, UInt16}(123) isa AK.CounterRNG{AK.Threefry, UInt16} + @test_throws ArgumentError AK.CounterRNG{AK.SplitMix64, UInt8}(300) + + rng_algs = (AK.SplitMix64(), AK.Philox(), AK.Threefry()) + + for alg in rng_algs + rng_alg = AK.CounterRNG(0x123456789abcdef; alg) + @test AK.rand_uint32(rng_alg, UInt64(0)) == AK.rand_uint32(rng_alg, UInt64(0)) + @test AK.rand_uint32(rng_alg, UInt64(1)) != AK.rand_uint32(rng_alg, UInt64(0)) + + vals_alg = [AK.rand_uint32(rng_alg, UInt64(i)) for i in 0:1023] + @test length(unique(vals_alg)) == length(vals_alg) + + x_alg = 
array_from_host(zeros(Float32, 2048)) + AK.rand!(rng_alg, x_alg; prefer_threads, block_size=64) + @test _is_unit_interval(Array(x_alg)) + end + + rng = AK.CounterRNG(0x123456789abcdef) + + @test AK.rand_uint32(rng, UInt64(0)) == AK.rand_uint32(rng, UInt64(0)) + @test AK.rand_uint32(rng, UInt64(1)) != AK.rand_uint32(rng, UInt64(0)) + @test AK.rand_uint32(rng, UInt64(17)) != AK.rand_uint32(rng, UInt64(18)) + + vals = [AK.rand_uint32(rng, UInt64(i)) for i in 0:2047] + @test length(unique(vals)) == length(vals) + + for u in ( + UInt32(0x00000000), + UInt32(0x00000001), + UInt32(0x7fffffff), + UInt32(0x80000000), + UInt32(0xffffffff), + ) + x = AK.uint32_to_unit_float32(u) + @test !isnan(x) + @test 0.0f0 <= x < 1.0f0 + end + + lengths = (0, 1, 31, 32, 33, 1024, 1025) + for len in lengths + x = array_from_host(zeros(Float32, len)) + AK.rand!(rng, x; prefer_threads, block_size=64) + xh = Array(x) + + ref = zeros(Float32, len) + _rand_fill_reference!(rng, ref) + + @test xh == ref + @test _is_unit_interval(xh) + end + + x1 = array_from_host(zeros(Float32, 4096)) + x2 = array_from_host(zeros(Float32, 4096)) + AK.rand!(rng, x1; prefer_threads, block_size=64) + AK.rand!(rng, x2; prefer_threads, block_size=257) + @test Array(x1) == Array(x2) + + rng2 = AK.CounterRNG(rng.seed + UInt64(1)) + x3 = array_from_host(zeros(Float32, 4096)) + AK.rand!(rng2, x3; prefer_threads, block_size=64) + @test Array(x3) != Array(x1) + + xnd = array_from_host(zeros(Float32, 7, 11, 5)) + AK.rand!(rng, xnd; prefer_threads, block_size=128) + xndh = Array(xnd) + refnd = zeros(Float32, 7, 11, 5) + _rand_fill_reference!(rng, refnd) + @test xndh == refnd + + if IS_CPU_BACKEND + base = zeros(Float32, 64) + view_x = @view base[2:2:end] + AK.rand!( + rng, + view_x; + max_tasks=Threads.nthreads(), + min_elems=1, + prefer_threads=true, + ) + + ref_view = zeros(Float32, length(view_x)) + _rand_fill_reference!(rng, ref_view) + @test collect(view_x) == ref_view + end + + nstats = 200_000 + xstats = 
array_from_host(zeros(Float32, nstats)) + AK.rand!(rng, xstats; prefer_threads, block_size=256) + xh = Array(xstats) + + @test _is_unit_interval(xh) + + m = sum(xh) / nstats + v = sum((x - m)^2 for x in xh) / nstats + @test abs(m - 0.5) < 0.01 + @test abs(v - (1 / 12)) < 0.01 + + nbins = 16 + counts = zeros(Int, nbins) + for x in xh + ibin = Int(floor(x * nbins)) + 1 + ibin = min(ibin, nbins) + counts[ibin] += 1 + end + expected = nstats / nbins + max_rel_dev = maximum(abs(c - expected) / expected for c in counts) + @test max_rel_dev < 0.1 + + x64 = array_from_host(zeros(Float64, 16)) + @test_throws MethodError AK.rand!(rng, x64; prefer_threads) +end diff --git a/test/runtests.jl b/test/runtests.jl index 716fd8e..a2707b6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -69,6 +69,7 @@ end include("partition.jl") include("looping.jl") include("map.jl") +include("rand.jl") include("sort.jl") include("reduce.jl") include("accumulate.jl") From 3c3e81a6ca8f48ccd2b8d93180b50af9691c1b74 Mon Sep 17 00:00:00 2001 From: fjbarter Date: Mon, 16 Mar 2026 16:51:00 +0000 Subject: [PATCH 02/18] add convenience for AK.rand!(X), and single UInt64 seed construction for fresh fills with simple calls. 
determinism obviously remains if an rng is passed --- src/rand/philox.jl | 33 ++++++- src/rand/rand.jl | 89 ++++++++++++++---- src/rand/splitmix64.jl | 23 ++++- src/rand/threefry.jl | 33 ++++++- src/rand/utilities.jl | 63 ++++++++++++- test/rand.jl | 204 +++++++++++++++++++++++++++++------------ 6 files changed, 357 insertions(+), 88 deletions(-) diff --git a/src/rand/philox.jl b/src/rand/philox.jl index 2b32d95..8137f64 100644 --- a/src/rand/philox.jl +++ b/src/rand/philox.jl @@ -18,9 +18,12 @@ end """ - rand_uint32(rng::CounterRNG{<:Unsigned, Philox}, counter::UInt64) -> UInt32 + _philox2x32_block(rng::CounterRNG{<:Philox}, counter::UInt64) """ -@inline function rand_uint32(rng::CounterRNG{<:Unsigned, Philox}, counter::UInt64)::UInt32 +@inline function _philox2x32_block( + rng::CounterRNG{<:Philox}, + counter::UInt64, +)::Tuple{UInt32, UInt32} x0 = _u32_lo(counter) x1 = _u32_hi(counter) @@ -33,5 +36,31 @@ end k0 += PHILOX_W0 end + return x0, x1 +end + + +""" + rand_uint(rng::CounterRNG{<:Philox}, counter::UInt64, UInt32) -> UInt32 +""" +@inline function rand_uint( + rng::CounterRNG{<:Philox}, + counter::UInt64, + ::Type{UInt32}, +)::UInt32 + x0, _ = _philox2x32_block(rng, counter) return x0 end + + +""" + rand_uint(rng::CounterRNG{<:Philox}, counter::UInt64, UInt64) -> UInt64 +""" +@inline function rand_uint( + rng::CounterRNG{<:Philox}, + counter::UInt64, + ::Type{UInt64}, +)::UInt64 + x0, x1 = _philox2x32_block(rng, counter) + return _u64_from_u32(x0, x1) +end diff --git a/src/rand/rand.jl b/src/rand/rand.jl index 07952e1..0e29452 100644 --- a/src/rand/rand.jl +++ b/src/rand/rand.jl @@ -10,7 +10,7 @@ abstract type CounterRNGAlgorithm end """ - CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=SplitMix64()) + CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox()) Stateless counter-based RNG configuration for [`rand!`](@ref). @@ -21,13 +21,20 @@ value is a pure function of: - algorithm (`alg`) The default algorithm is `Philox()`. 
+ +`seed` may be any non-negative `Integer`. It is normalised to `UInt64` internally. """ -struct CounterRNG{K <: Unsigned, A <: CounterRNGAlgorithm} <: AbstractCounterRNG - seed::K +struct CounterRNG{A <: CounterRNGAlgorithm} <: AbstractCounterRNG + seed::UInt64 alg::A end +function CounterRNG(seed::Unsigned; alg::CounterRNGAlgorithm=Philox()) + CounterRNG(UInt64(seed), alg) +end + + function CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox()) @argcheck seed >= 0 CounterRNG(UInt64(seed), alg) @@ -35,6 +42,20 @@ end +""" + CounterRNG(; alg::CounterRNGAlgorithm=SplitMix64()) + +Create a stateless counter-based RNG with an automatically generated seed. + +The seed is sampled exactly once at construction using `rand(UInt64)`. Reusing this same +`CounterRNG` instance is deterministic for fixed seed, algorithm, array shape, and eltype. +""" +function CounterRNG(; alg::CounterRNGAlgorithm=SplitMix64()) + CounterRNG(Base.rand(UInt64); alg) +end + + + # Shared helpers include("utilities.jl") @@ -50,14 +71,14 @@ include("threefry.jl") function _rand_fill_threads!( rng::AbstractCounterRNG, - x::AbstractArray{Float32}; + x::AbstractArray{T}; max_tasks::Int, min_elems::Int, -) +) where {T <: ALLOWED_RAND_SCALARS} task_partition(length(x), max_tasks, min_elems) do irange @inbounds for i in irange counter = _counter_from_index(i) - x[i] = uint32_to_unit_float32(rand_uint32(rng, counter)) + x[i] = rand_scalar(rng, counter, T) end end return x @@ -71,17 +92,17 @@ end i = @index(Global, Linear) if i <= length(x) counter = _counter_from_index(i) - x[i] = uint32_to_unit_float32(rand_uint32(rng, counter)) + x[i] = rand_scalar(rng, counter, eltype(x)) end end function _rand_fill_gpu!( rng::AbstractCounterRNG, - x::AbstractArray{Float32}, + x::AbstractArray{T}, backend::Backend; block_size::Int, -) +) where {T <: ALLOWED_RAND_SCALARS} @argcheck block_size > 0 len = length(x) len == 0 && return x @@ -96,7 +117,7 @@ end """ rand!( rng::AbstractCounterRNG, - 
x::AbstractArray{Float32}, + x::AbstractArray{T}, backend::Backend=get_backend(x); # CPU settings @@ -110,15 +131,23 @@ end block_size::Int=256, ) -Fill `x` in-place with pseudo-random `Float32` values in `[0, 1)` using a stateless counter-based -RNG. For `x[i]`, the counter is exactly `UInt64(i - 1)` in linear indexing order. +Fill `x` in-place with pseudo-random values using a stateless counter-based RNG. For `x[i]`, the +counter is exactly `UInt64(i - 1)` in linear indexing order. -The float conversion is mantissa-based: uniform over the produced mantissa grid, not over all -representable `Float32` values in `[0, 1)`. +Supported scalar element types are: +- `UInt32`, `UInt64` +- `Int32`, `Int64` +- `Float32`, `Float64` + +Semantics: +- Unsigned integers: raw random bit patterns of requested width. +- Signed integers: corresponding unsigned patterns reinterpreted as signed. +- Floats: mantissa-based conversion from `UInt32`/`UInt64` into `[0, 1)`, uniform over the + produced mantissa grid (not over all representable floats). """ function rand!( rng::AbstractCounterRNG, - x::AbstractArray{Float32}, + x::AbstractArray{T}, backend::Backend=get_backend(x); # CPU settings @@ -128,10 +157,34 @@ function rand!( # GPU settings block_size::Int=256, -) +) where T + + @argcheck T <: ALLOWED_RAND_SCALARS "Unsupported eltype $T. Supported: $(ALLOWED_RAND_SCALARS)" + if use_gpu_algorithm(backend, prefer_threads) - _rand_fill_gpu!(rng, x, backend; block_size) + return _rand_fill_gpu!(rng, x, backend; block_size) else - _rand_fill_threads!(rng, x; max_tasks, min_elems) + return _rand_fill_threads!(rng, x; max_tasks, min_elems) end end + + +""" + rand!( + x::AbstractArray{T}, + args...; + kwargs..., + ) + +Convenience overload that creates a fresh `CounterRNG()` and fills `x`. + +Each call to `rand!(x, ...)` auto-seeds a new RNG once using `rand(UInt64)`, so repeated calls +produce different outputs unless an explicit `CounterRNG` is provided. 
+""" +function rand!( + x::AbstractArray, + args...; + kwargs..., +) + return rand!(CounterRNG(), x, args...; kwargs...) +end diff --git a/src/rand/splitmix64.jl b/src/rand/splitmix64.jl index 165f72f..0b94f31 100644 --- a/src/rand/splitmix64.jl +++ b/src/rand/splitmix64.jl @@ -17,10 +17,25 @@ end """ - rand_uint32(rng::CounterRNG{<:Unsigned, SplitMix64}, counter::UInt64) -> UInt32 + rand_uint(rng::CounterRNG{<:SplitMix64}, counter::UInt64, UInt64) -> UInt64 """ -@inline function rand_uint32(rng::CounterRNG{<:Unsigned, SplitMix64}, counter::UInt64)::UInt32 +@inline function rand_uint( + rng::CounterRNG{<:SplitMix64}, + counter::UInt64, + ::Type{UInt64}, +)::UInt64 seed = UInt64(rng.seed) - mixed = _splitmix64_mix(counter + seed + SPLITMIX64_INCREMENT) - return UInt32(mixed >> 32) + return _splitmix64_mix(counter + seed + SPLITMIX64_INCREMENT) +end + + +""" + rand_uint(rng::CounterRNG{<:SplitMix64}, counter::UInt64, UInt32) -> UInt32 +""" +@inline function rand_uint( + rng::CounterRNG{<:SplitMix64}, + counter::UInt64, + ::Type{UInt32}, +)::UInt32 + return _u32_hi(rand_uint(rng, counter, UInt64)) end diff --git a/src/rand/threefry.jl b/src/rand/threefry.jl index 0a65dc3..4b1e5af 100644 --- a/src/rand/threefry.jl +++ b/src/rand/threefry.jl @@ -17,9 +17,12 @@ end """ - rand_uint32(rng::CounterRNG{<:Unsigned, Threefry}, counter::UInt64) -> UInt32 + _threefry2x32_block(rng::CounterRNG{<:Threefry}, counter::UInt64) """ -@inline function rand_uint32(rng::CounterRNG{<:Unsigned, Threefry}, counter::UInt64)::UInt32 +@inline function _threefry2x32_block( + rng::CounterRNG{<:Threefry}, + counter::UInt64, +)::Tuple{UInt32, UInt32} x0 = _u32_lo(counter) x1 = _u32_hi(counter) @@ -45,5 +48,31 @@ end end end + return x0, x1 +end + + +""" + rand_uint(rng::CounterRNG{<:Threefry}, counter::UInt64, UInt32) -> UInt32 +""" +@inline function rand_uint( + rng::CounterRNG{<:Threefry}, + counter::UInt64, + ::Type{UInt32}, +)::UInt32 + x0, _ = _threefry2x32_block(rng, counter) return x0 end + 
+ +""" + rand_uint(rng::CounterRNG{<:Threefry}, counter::UInt64, UInt64) -> UInt64 +""" +@inline function rand_uint( + rng::CounterRNG{<:Threefry}, + counter::UInt64, + ::Type{UInt64}, +)::UInt64 + x0, x1 = _threefry2x32_block(rng, counter) + return _u64_from_u32(x0, x1) +end diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl index 35b72f0..1f27c95 100644 --- a/src/rand/utilities.jl +++ b/src/rand/utilities.jl @@ -1,6 +1,7 @@ # lo: rightmost 32 bits, hi: leftmost 32 bits @inline _u32_lo(x::UInt64)::UInt32 = UInt32(x & UInt64(0xffffffff)) @inline _u32_hi(x::UInt64)::UInt32 = UInt32(x >> 32) +@inline _u64_from_u32(lo::UInt32, hi::UInt32)::UInt64 = (UInt64(hi) << 32) | UInt64(lo) # leftmost 32 bits of a*b cast to UInt64s @inline _mulhi_u32(a::UInt32, b::UInt32)::UInt32 = UInt32((UInt64(a) * UInt64(b)) >> 32) @@ -14,9 +15,50 @@ end @inline _counter_from_index(i)::UInt64 = UInt64(i - one(i)) -@inline function rand_uint32(::AbstractCounterRNG, ::UInt64)::UInt32 - # Unrecognised AbstractCounterRNG - throw(ArgumentError("No rand_uint32 implementation for this RNG type")) +""" + ALLOWED_RAND_SCALARS + +Internal scalar eltypes currently supported by [`rand!`](@ref). 
+""" +const ALLOWED_RAND_SCALARS = Union{ + UInt32, UInt64, + Int32, Int64, + Float32, Float64, +} + + +@inline function rand_uint(::AbstractCounterRNG, ::UInt64, ::Type{U})::U where {U <: Union{UInt32, UInt64}} + throw(ArgumentError("No rand_uint implementation for this RNG type")) +end + + +@inline raw_uint_type(::Type{UInt32}) = UInt32 +@inline raw_uint_type(::Type{Int32}) = UInt32 +@inline raw_uint_type(::Type{Float32}) = UInt32 +@inline raw_uint_type(::Type{UInt64}) = UInt64 +@inline raw_uint_type(::Type{Int64}) = UInt64 +@inline raw_uint_type(::Type{Float64}) = UInt64 + + +@inline from_uint(::Type{UInt32}, u::UInt32)::UInt32 = u +@inline from_uint(::Type{UInt64}, u::UInt64)::UInt64 = u +@inline from_uint(::Type{Int32}, u::UInt32)::Int32 = reinterpret(Int32, u) +@inline from_uint(::Type{Int64}, u::UInt64)::Int64 = reinterpret(Int64, u) +@inline from_uint(::Type{Float32}, u::UInt32)::Float32 = uint32_to_unit_float32(u) +@inline from_uint(::Type{Float64}, u::UInt64)::Float64 = uint64_to_unit_float64(u) + + +@inline function rand_scalar(rng::AbstractCounterRNG, counter::UInt64, ::Type{T})::T where {T <: ALLOWED_RAND_SCALARS} + U = raw_uint_type(T) + u = rand_uint(rng, counter, U) + return from_uint(T, u) +end + + +@inline function rand_scalar(::AbstractCounterRNG, ::UInt64, ::Type{T}) where {T} + throw(ArgumentError( + "Unsupported random scalar type $(T). Supported: UInt32, UInt64, Int32, Int64, Float32, Float64." + )) end @@ -34,3 +76,18 @@ Convert a random `UInt32` to `Float32` in `[0, 1)` by mantissa construction. # Interpret as 1.mantissa, then subtract 1 for [0, 1) return reinterpret(Float32, bits) - 1.0f0 end + + +""" + uint64_to_unit_float64(u::UInt64) -> Float64 + +Convert a random `UInt64` to `Float64` in `[0, 1)` by mantissa construction. +""" +@inline function uint64_to_unit_float64(u::UInt64)::Float64 + # Keep 52 random bits for the mantissa (drop 12 rightmost bits from the UInt64) + # and combine with the bit pattern of 1.0 (sign=0, exponent=1023). 
+ bits = UInt64(0x3ff0000000000000) | (u >> 12) + + # Interpret as 1.mantissa, then subtract 1 for [0, 1) + return reinterpret(Float64, bits) - 1.0 +end diff --git a/test/rand.jl b/test/rand.jl index 7fefdff..339a2cb 100644 --- a/test/rand.jl +++ b/test/rand.jl @@ -1,49 +1,100 @@ function _is_unit_interval(v) for x in v - if isnan(x) || x < 0.0f0 || x >= 1.0f0 + if isnan(x) || x < zero(x) || x >= one(x) return false end end return true end -function _rand_fill_reference!(rng, x::AbstractArray{Float32}) +function _rand_fill_reference!(rng, x::AbstractArray{T}) where {T <: AK.ALLOWED_RAND_SCALARS} @inbounds for i in eachindex(x) counter = UInt64(i - one(i)) - x[i] = AK.uint32_to_unit_float32(AK.rand_uint32(rng, counter)) + x[i] = AK.rand_scalar(rng, counter, T) end return x end @testset "rand" begin - @test AK.CounterRNG{AK.SplitMix64}(0x1) isa AK.CounterRNG{AK.SplitMix64, UInt64} - @test AK.CounterRNG{AK.Philox}(UInt32(0x1)) isa AK.CounterRNG{AK.Philox, UInt32} - @test AK.CounterRNG{AK.Threefry, UInt16}(123) isa AK.CounterRNG{AK.Threefry, UInt16} - @test_throws ArgumentError AK.CounterRNG{AK.SplitMix64, UInt8}(300) + @test AK.CounterRNG(0x1; alg=AK.SplitMix64()) isa AK.CounterRNG{AK.SplitMix64} + @test AK.CounterRNG(UInt32(0x1); alg=AK.Philox()) isa AK.CounterRNG{AK.Philox} + @test AK.CounterRNG(UInt16(123); alg=AK.Threefry()) isa AK.CounterRNG{AK.Threefry} + @test AK.CounterRNG(UInt32(300)).seed == UInt64(300) + @test_throws ArgumentError AK.CounterRNG(-1) + + Random.seed!(0x1234) + expected_auto_seed = rand(UInt64) + Random.seed!(0x1234) + rng_auto = AK.CounterRNG() + @test rng_auto.seed == expected_auto_seed + @test rng_auto.alg isa AK.SplitMix64 + + xauto1 = array_from_host(zeros(Float32, 1024)) + xauto2 = array_from_host(zeros(Float32, 1024)) + AK.rand!(rng_auto, xauto1; prefer_threads, block_size=64) + AK.rand!(rng_auto, xauto2; prefer_threads, block_size=257) + @test Array(xauto1) == Array(xauto2) + + Random.seed!(0xabcdef) + seed1 = rand(UInt64) + seed2 = 
rand(UInt64) + ref1 = array_from_host(zeros(Float32, 1024)) + ref2 = array_from_host(zeros(Float32, 1024)) + AK.rand!(AK.CounterRNG(seed1; alg=AK.SplitMix64()), ref1; prefer_threads, block_size=64) + AK.rand!(AK.CounterRNG(seed2; alg=AK.SplitMix64()), ref2; prefer_threads, block_size=64) + + Random.seed!(0xabcdef) + xconv1 = array_from_host(zeros(Float32, 1024)) + xconv2 = array_from_host(zeros(Float32, 1024)) + AK.rand!(xconv1; prefer_threads, block_size=64) + AK.rand!(xconv2; prefer_threads, block_size=64) + @test Array(xconv1) == Array(ref1) + @test Array(xconv2) == Array(ref2) rng_algs = (AK.SplitMix64(), AK.Philox(), AK.Threefry()) + scalar_types = (UInt32, UInt64, Int32, Int64, Float32, Float64) for alg in rng_algs rng_alg = AK.CounterRNG(0x123456789abcdef; alg) - @test AK.rand_uint32(rng_alg, UInt64(0)) == AK.rand_uint32(rng_alg, UInt64(0)) - @test AK.rand_uint32(rng_alg, UInt64(1)) != AK.rand_uint32(rng_alg, UInt64(0)) + for U in (UInt32, UInt64) + @test AK.rand_uint(rng_alg, UInt64(0), U) == AK.rand_uint(rng_alg, UInt64(0), U) + @test AK.rand_uint(rng_alg, UInt64(1), U) != AK.rand_uint(rng_alg, UInt64(0), U) - vals_alg = [AK.rand_uint32(rng_alg, UInt64(i)) for i in 0:1023] - @test length(unique(vals_alg)) == length(vals_alg) + vals_alg = [AK.rand_uint(rng_alg, UInt64(i), U) for i in 0:1023] + @test length(unique(vals_alg)) > 900 + end - x_alg = array_from_host(zeros(Float32, 2048)) - AK.rand!(rng_alg, x_alg; prefer_threads, block_size=64) - @test _is_unit_interval(Array(x_alg)) + for T in scalar_types + x_alg = array_from_host(zeros(T, 2048)) + AK.rand!(rng_alg, x_alg; prefer_threads, block_size=64) + x_alg_h = Array(x_alg) + ref_alg = zeros(T, 2048) + _rand_fill_reference!(rng_alg, ref_alg) + @test x_alg_h == ref_alg + if T <: AbstractFloat + @test _is_unit_interval(x_alg_h) + end + end end rng = AK.CounterRNG(0x123456789abcdef) - @test AK.rand_uint32(rng, UInt64(0)) == AK.rand_uint32(rng, UInt64(0)) - @test AK.rand_uint32(rng, UInt64(1)) != 
AK.rand_uint32(rng, UInt64(0)) - @test AK.rand_uint32(rng, UInt64(17)) != AK.rand_uint32(rng, UInt64(18)) + @test AK.from_uint(UInt32, UInt32(0xdeadbeef)) == UInt32(0xdeadbeef) + @test AK.from_uint(UInt64, UInt64(0x0123456789abcdef)) == UInt64(0x0123456789abcdef) + @test AK.from_uint(Int32, UInt32(0xdeadbeef)) == reinterpret(Int32, UInt32(0xdeadbeef)) + @test AK.from_uint(Int64, UInt64(0x0123456789abcdef)) == reinterpret(Int64, UInt64(0x0123456789abcdef)) + + @test AK.rand_uint(rng, UInt64(0), UInt32) == AK.rand_uint(rng, UInt64(0), UInt32) + @test AK.rand_uint(rng, UInt64(1), UInt32) != AK.rand_uint(rng, UInt64(0), UInt32) + @test AK.rand_uint(rng, UInt64(17), UInt32) != AK.rand_uint(rng, UInt64(18), UInt32) + @test AK.rand_uint(rng, UInt64(0), UInt64) == AK.rand_uint(rng, UInt64(0), UInt64) + @test AK.rand_uint(rng, UInt64(1), UInt64) != AK.rand_uint(rng, UInt64(0), UInt64) + @test AK.rand_uint(rng, UInt64(17), UInt64) != AK.rand_uint(rng, UInt64(18), UInt64) - vals = [AK.rand_uint32(rng, UInt64(i)) for i in 0:2047] - @test length(unique(vals)) == length(vals) + vals_u32 = [AK.rand_uint(rng, UInt64(i), UInt32) for i in 0:2047] + vals_u64 = [AK.rand_uint(rng, UInt64(i), UInt64) for i in 0:2047] + @test length(unique(vals_u32)) > 1800 + @test length(unique(vals_u64)) > 2000 for u in ( UInt32(0x00000000), @@ -57,51 +108,85 @@ end @test 0.0f0 <= x < 1.0f0 end - lengths = (0, 1, 31, 32, 33, 1024, 1025) - for len in lengths - x = array_from_host(zeros(Float32, len)) - AK.rand!(rng, x; prefer_threads, block_size=64) - xh = Array(x) - - ref = zeros(Float32, len) - _rand_fill_reference!(rng, ref) + for u in ( + UInt64(0x0000000000000000), + UInt64(0x0000000000000001), + UInt64(0x7fffffffffffffff), + UInt64(0x8000000000000000), + UInt64(0xffffffffffffffff), + ) + x = AK.uint64_to_unit_float64(u) + @test !isnan(x) + @test 0.0 <= x < 1.0 + end - @test xh == ref - @test _is_unit_interval(xh) + for T in scalar_types + s0 = AK.rand_scalar(rng, UInt64(0), T) + s1 = 
AK.rand_scalar(rng, UInt64(1), T) + @test s0 isa T + @test s1 isa T + @test s0 != s1 + if T <: AbstractFloat + @test zero(T) <= s0 < one(T) + @test zero(T) <= s1 < one(T) + end end - x1 = array_from_host(zeros(Float32, 4096)) - x2 = array_from_host(zeros(Float32, 4096)) - AK.rand!(rng, x1; prefer_threads, block_size=64) - AK.rand!(rng, x2; prefer_threads, block_size=257) - @test Array(x1) == Array(x2) + @test_throws ArgumentError AK.rand_scalar(rng, UInt64(0), UInt16) - rng2 = AK.CounterRNG(rng.seed + UInt64(1)) - x3 = array_from_host(zeros(Float32, 4096)) - AK.rand!(rng2, x3; prefer_threads, block_size=64) - @test Array(x3) != Array(x1) + lengths = (0, 1, 31, 32, 33, 1024, 1025) + for T in scalar_types + for len in lengths + x = array_from_host(zeros(T, len)) + AK.rand!(rng, x; prefer_threads, block_size=64) + xh = Array(x) + + ref = zeros(T, len) + _rand_fill_reference!(rng, ref) + + @test xh == ref + if T <: AbstractFloat + @test _is_unit_interval(xh) + end + end + end - xnd = array_from_host(zeros(Float32, 7, 11, 5)) - AK.rand!(rng, xnd; prefer_threads, block_size=128) - xndh = Array(xnd) - refnd = zeros(Float32, 7, 11, 5) - _rand_fill_reference!(rng, refnd) - @test xndh == refnd + rng2 = AK.CounterRNG(rng.seed + UInt64(1)) + for T in scalar_types + x1 = array_from_host(zeros(T, 4096)) + x2 = array_from_host(zeros(T, 4096)) + AK.rand!(rng, x1; prefer_threads, block_size=64) + AK.rand!(rng, x2; prefer_threads, block_size=257) + @test Array(x1) == Array(x2) + + x3 = array_from_host(zeros(T, 4096)) + AK.rand!(rng2, x3; prefer_threads, block_size=64) + @test Array(x3) != Array(x1) + + xnd = array_from_host(zeros(T, 7, 11, 5)) + AK.rand!(rng, xnd; prefer_threads, block_size=128) + xndh = Array(xnd) + refnd = zeros(T, 7, 11, 5) + _rand_fill_reference!(rng, refnd) + @test xndh == refnd + end if IS_CPU_BACKEND - base = zeros(Float32, 64) - view_x = @view base[2:2:end] - AK.rand!( - rng, - view_x; - max_tasks=Threads.nthreads(), - min_elems=1, - prefer_threads=true, - ) 
- - ref_view = zeros(Float32, length(view_x)) - _rand_fill_reference!(rng, ref_view) - @test collect(view_x) == ref_view + for T in scalar_types + base = zeros(T, 64) + view_x = @view base[2:2:end] + AK.rand!( + rng, + view_x; + max_tasks=Threads.nthreads(), + min_elems=1, + prefer_threads=true, + ) + + ref_view = zeros(T, length(view_x)) + _rand_fill_reference!(rng, ref_view) + @test collect(view_x) == ref_view + end end nstats = 200_000 @@ -127,6 +212,7 @@ end max_rel_dev = maximum(abs(c - expected) / expected for c in counts) @test max_rel_dev < 0.1 - x64 = array_from_host(zeros(Float64, 16)) - @test_throws MethodError AK.rand!(rng, x64; prefer_threads) + x16 = array_from_host(zeros(UInt16, 16)) + @test_throws ArgumentError AK.rand!(x16; prefer_threads) + @test_throws ArgumentError AK.rand!(rng, x16; prefer_threads) end From f590d5eabcb5a616776cb13913c3b72eba2e1428 Mon Sep 17 00:00:00 2001 From: fjbarter Date: Mon, 16 Mar 2026 17:42:43 +0000 Subject: [PATCH 03/18] just use foreachindex for the kernel now that it is simple enough --- src/rand/rand.jl | 86 +++++++++---------------------------------- src/rand/utilities.jl | 27 ++++++++++---- 2 files changed, 36 insertions(+), 77 deletions(-) diff --git a/src/rand/rand.jl b/src/rand/rand.jl index 0e29452..23ef801 100644 --- a/src/rand/rand.jl +++ b/src/rand/rand.jl @@ -23,6 +23,13 @@ value is a pure function of: The default algorithm is `Philox()`. `seed` may be any non-negative `Integer`. It is normalised to `UInt64` internally. + +Constructors: +- `CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox())` + Uses an explicit non-negative seed. +- `CounterRNG(; alg::CounterRNGAlgorithm=SplitMix64())` + Auto-seeds once using `rand(UInt64)`. Reusing the same `CounterRNG` instance is deterministic + for fixed seed, algorithm, array shape, and eltype. 
""" struct CounterRNG{A <: CounterRNGAlgorithm} <: AbstractCounterRNG seed::UInt64 @@ -30,26 +37,12 @@ struct CounterRNG{A <: CounterRNGAlgorithm} <: AbstractCounterRNG end -function CounterRNG(seed::Unsigned; alg::CounterRNGAlgorithm=Philox()) - CounterRNG(UInt64(seed), alg) -end - - function CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox()) - @argcheck seed >= 0 + @argcheck seed >= 0 "Seed must be a positive integer" CounterRNG(UInt64(seed), alg) end - -""" - CounterRNG(; alg::CounterRNGAlgorithm=SplitMix64()) - -Create a stateless counter-based RNG with an automatically generated seed. - -The seed is sampled exactly once at construction using `rand(UInt64)`. Reusing this same -`CounterRNG` instance is deterministic for fixed seed, algorithm, array shape, and eltype. -""" function CounterRNG(; alg::CounterRNGAlgorithm=SplitMix64()) CounterRNG(Base.rand(UInt64); alg) end @@ -65,55 +58,6 @@ include("splitmix64.jl") include("philox.jl") include("threefry.jl") - - - - -function _rand_fill_threads!( - rng::AbstractCounterRNG, - x::AbstractArray{T}; - max_tasks::Int, - min_elems::Int, -) where {T <: ALLOWED_RAND_SCALARS} - task_partition(length(x), max_tasks, min_elems) do irange - @inbounds for i in irange - counter = _counter_from_index(i) - x[i] = rand_scalar(rng, counter, T) - end - end - return x -end - - -@kernel inbounds=true cpu=false unsafe_indices=true function _rand_fill_kernel!( - rng, - x, -) - i = @index(Global, Linear) - if i <= length(x) - counter = _counter_from_index(i) - x[i] = rand_scalar(rng, counter, eltype(x)) - end -end - - -function _rand_fill_gpu!( - rng::AbstractCounterRNG, - x::AbstractArray{T}, - backend::Backend; - block_size::Int, -) where {T <: ALLOWED_RAND_SCALARS} - @argcheck block_size > 0 - len = length(x) - len == 0 && return x - - blocks = div(len, block_size, RoundUp) - kernel! 
= _rand_fill_kernel!(backend, block_size) - kernel!(rng, x, ndrange=(blocks * block_size,)) - return x -end - - """ rand!( rng::AbstractCounterRNG, @@ -160,12 +104,16 @@ function rand!( ) where T @argcheck T <: ALLOWED_RAND_SCALARS "Unsupported eltype $T. Supported: $(ALLOWED_RAND_SCALARS)" - - if use_gpu_algorithm(backend, prefer_threads) - return _rand_fill_gpu!(rng, x, backend; block_size) - else - return _rand_fill_threads!(rng, x; max_tasks, min_elems) + foreachindex( + 1:length(x), backend; + max_tasks, + min_elems, + prefer_threads, + block_size, + ) do i + @inbounds x[i] = rand_scalar(rng, _counter_from_index(i), T) end + return x end diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl index 1f27c95..c22a863 100644 --- a/src/rand/utilities.jl +++ b/src/rand/utilities.jl @@ -27,11 +27,6 @@ const ALLOWED_RAND_SCALARS = Union{ } -@inline function rand_uint(::AbstractCounterRNG, ::UInt64, ::Type{U})::U where {U <: Union{UInt32, UInt64}} - throw(ArgumentError("No rand_uint implementation for this RNG type")) -end - - @inline raw_uint_type(::Type{UInt32}) = UInt32 @inline raw_uint_type(::Type{Int32}) = UInt32 @inline raw_uint_type(::Type{Float32}) = UInt32 @@ -48,9 +43,24 @@ end @inline from_uint(::Type{Float64}, u::UInt64)::Float64 = uint64_to_unit_float64(u) -@inline function rand_scalar(rng::AbstractCounterRNG, counter::UInt64, ::Type{T})::T where {T <: ALLOWED_RAND_SCALARS} - U = raw_uint_type(T) - u = rand_uint(rng, counter, U) +@inline function rand_uint( + rng::AbstractCounterRNG, + ::UInt64, + ::Type{UIntType} +)::UIntType where {UIntType <: Union{UInt32, UInt64}} + throw(ArgumentError("No rand_uint implementation for RNG: $rng")) +end + + +@inline function rand_scalar( + rng::AbstractCounterRNG, + counter::UInt64, + ::Type{T} +)::T where {T <: ALLOWED_RAND_SCALARS} + + UIntType = raw_uint_type(T) + u = rand_uint(rng, counter, UIntType) + return from_uint(T, u) end @@ -63,6 +73,7 @@ end + """ uint32_to_unit_float32(u::UInt32) -> Float32 From 
9e471d6c3144704e036bd98bf16705181426b85a Mon Sep 17 00:00:00 2001 From: fjbarter Date: Mon, 16 Mar 2026 18:34:24 +0000 Subject: [PATCH 04/18] finalise tests and docs --- docs/make.jl | 1 + docs/src/api/rand.md | 48 ++++++ src/rand/philox.jl | 15 +- src/rand/rand.jl | 18 +- src/rand/splitmix64.jl | 8 +- src/rand/threefry.jl | 14 +- src/rand/utilities.jl | 36 ++-- test/rand.jl | 362 ++++++++++++++++++++--------------------- 8 files changed, 256 insertions(+), 246 deletions(-) create mode 100644 docs/src/api/rand.md diff --git a/docs/make.jl b/docs/make.jl index 58b4632..5e794a2 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -23,6 +23,7 @@ makedocs(; "Using Different Backends" => "api/using_backends.md", "General Loops" => "api/foreachindex.md", "Map" => "api/map.md", + "Random Number Generation" => "api/rand.md", "Sorting" => "api/sort.md", "Reduce" => "api/reduce.md", "MapReduce" => "api/mapreduce.md", diff --git a/docs/src/api/rand.md b/docs/src/api/rand.md new file mode 100644 index 0000000..2fa468f --- /dev/null +++ b/docs/src/api/rand.md @@ -0,0 +1,48 @@ +### Random Number Generation + +Counter-based random generation for CPU and GPU backends with deterministic behavior for fixed +`seed`, algorithm, array shape, and eltype. + +Use an explicit `CounterRNG(seed; alg=...)` when reproducibility matters. For convenience, +`AK.rand!(x)` creates a fresh `CounterRNG()` on each call using one auto-seeded `Base.rand(UInt64)` +draw, so repeated calls usually produce different outputs. + +Supported output element types: +- `UInt32`, `UInt64` +- `Int32`, `Int64` +- `Float32`, `Float64` + +The core of the random number generation produces a `UInt` of the requested scalar width. +That `UInt` is then either: +- Unsigned integers: returned as-is +- Signed integers: reinterpreted as a signed integer bit pattern. +- Floats: mantissa construction into a uniform grid in `[0, 1)` ([read more](https://lomont.org/posts/2017/unit-random/)). 
+ +Algorithms currently available: +- `SplitMix64` ([read more](https://rosettacode.org/wiki/Pseudo-random_numbers/Splitmix64)) +- `Philox` ([read more](https://www.thesalmons.org/john/random123/papers/random123sc11.pdf)) +- `Threefry` ([read more](https://www.thesalmons.org/john/random123/papers/random123sc11.pdf)) + +`Philox` is the default algorithm for `CounterRNG()`, as it is more thoroughly +statistically tested and measured on par with `CUDA.rand!` and `SplitMix64` at ~390 GB/s on an RTX +5060 (advertised 448 GB/s), i.e. effectively memory-bound throughput. + +Examples: +```julia +import AcceleratedKernels as AK +using oneAPI + +# Reproducible +rng = AK.CounterRNG(0x12345678; alg=AK.Philox()) +x = oneArray{Float32}(undef, 1024) +AK.rand!(rng, x) + +# Convenience (fresh auto-seeded RNG on each call) +y = oneArray{Float32}(undef, 1024) +AK.rand!(y) +``` + +```@docs +AcceleratedKernels.CounterRNG +AcceleratedKernels.rand! +``` diff --git a/src/rand/philox.jl b/src/rand/philox.jl index 8137f64..bdebc8e 100644 --- a/src/rand/philox.jl +++ b/src/rand/philox.jl @@ -7,7 +7,6 @@ const PHILOX_W0 = UInt32(0x9E3779B9) const PHILOX_ROUNDS = 10 -# Each round destroys x0 with multiplication, addition, and XORs @inline function _philox2x32_round(x0::UInt32, x1::UInt32, k0::UInt32) lo = PHILOX_M0 * x0 hi = _mulhi_u32(PHILOX_M0, x0) @@ -17,9 +16,7 @@ const PHILOX_ROUNDS = 10 end -""" - _philox2x32_block(rng::CounterRNG{<:Philox}, counter::UInt64) -""" +# Evaluate one Philox block at `counter`, returning two 32-bit lanes `(x0, x1)` @inline function _philox2x32_block( rng::CounterRNG{<:Philox}, counter::UInt64, @@ -40,9 +37,7 @@ end end -""" - rand_uint(rng::CounterRNG{<:Philox}, counter::UInt64, UInt32) -> UInt32 -""" +# Return lane 0 from the single Philox block at `counter` @inline function rand_uint( rng::CounterRNG{<:Philox}, counter::UInt64, @@ -53,14 +48,12 @@ end end -""" - rand_uint(rng::CounterRNG{<:Philox}, counter::UInt64, UInt64) -> UInt64 -""" +# Build UInt64 from 
the two lanes `(x0, x1)` of the same Philox block at `counter` @inline function rand_uint( rng::CounterRNG{<:Philox}, counter::UInt64, ::Type{UInt64}, )::UInt64 x0, x1 = _philox2x32_block(rng, counter) - return _u64_from_u32(x0, x1) + return _u64_from_u32s(x0, x1) end diff --git a/src/rand/rand.jl b/src/rand/rand.jl index 23ef801..e62ba96 100644 --- a/src/rand/rand.jl +++ b/src/rand/rand.jl @@ -38,12 +38,12 @@ end function CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox()) - @argcheck seed >= 0 "Seed must be a positive integer" + @argcheck seed >= 0 "Seed must be a non-negative integer" CounterRNG(UInt64(seed), alg) end -function CounterRNG(; alg::CounterRNGAlgorithm=SplitMix64()) +function CounterRNG(; alg::CounterRNGAlgorithm=Philox()) CounterRNG(Base.rand(UInt64); alg) end @@ -115,20 +115,6 @@ function rand!( end return x end - - -""" - rand!( - x::AbstractArray{T}, - args...; - kwargs..., - ) - -Convenience overload that creates a fresh `CounterRNG()` and fills `x`. - -Each call to `rand!(x, ...)` auto-seeds a new RNG once using `rand(UInt64)`, so repeated calls -produce different outputs unless an explicit `CounterRNG` is provided. 
-""" function rand!( x::AbstractArray, args...; diff --git a/src/rand/splitmix64.jl b/src/rand/splitmix64.jl index 0b94f31..d169b73 100644 --- a/src/rand/splitmix64.jl +++ b/src/rand/splitmix64.jl @@ -16,9 +16,7 @@ const SPLITMIX64_MIX_B = UInt64(0x94d049bb133111eb) end -""" - rand_uint(rng::CounterRNG{<:SplitMix64}, counter::UInt64, UInt64) -> UInt64 -""" +# Natural SplitMix64 output path: compute 64 random bits directly from one counter @inline function rand_uint( rng::CounterRNG{<:SplitMix64}, counter::UInt64, @@ -29,9 +27,7 @@ end end -""" - rand_uint(rng::CounterRNG{<:SplitMix64}, counter::UInt64, UInt32) -> UInt32 -""" +# UInt32 path is derived from the high 32 bits of the UInt64 SplitMix output @inline function rand_uint( rng::CounterRNG{<:SplitMix64}, counter::UInt64, diff --git a/src/rand/threefry.jl b/src/rand/threefry.jl index 4b1e5af..7326f40 100644 --- a/src/rand/threefry.jl +++ b/src/rand/threefry.jl @@ -16,9 +16,7 @@ const THREEFRY_ROUNDS = 20 end -""" - _threefry2x32_block(rng::CounterRNG{<:Threefry}, counter::UInt64) -""" +# Evaluate one Threefry block at `counter`, returning two 32-bit lanes `(x0, x1)` @inline function _threefry2x32_block( rng::CounterRNG{<:Threefry}, counter::UInt64, @@ -52,9 +50,7 @@ end end -""" - rand_uint(rng::CounterRNG{<:Threefry}, counter::UInt64, UInt32) -> UInt32 -""" +# Return lane 0 from the single Threefry block at `counter` @inline function rand_uint( rng::CounterRNG{<:Threefry}, counter::UInt64, @@ -65,14 +61,12 @@ end end -""" - rand_uint(rng::CounterRNG{<:Threefry}, counter::UInt64, UInt64) -> UInt64 -""" +# Build UInt64 from the two lanes `(x0, x1)` of the same Threefry block at `counter` @inline function rand_uint( rng::CounterRNG{<:Threefry}, counter::UInt64, ::Type{UInt64}, )::UInt64 x0, x1 = _threefry2x32_block(rng, counter) - return _u64_from_u32(x0, x1) + return _u64_from_u32s(x0, x1) end diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl index c22a863..2623c9d 100644 --- a/src/rand/utilities.jl +++ 
b/src/rand/utilities.jl @@ -1,25 +1,19 @@ # lo: rightmost 32 bits, hi: leftmost 32 bits @inline _u32_lo(x::UInt64)::UInt32 = UInt32(x & UInt64(0xffffffff)) @inline _u32_hi(x::UInt64)::UInt32 = UInt32(x >> 32) -@inline _u64_from_u32(lo::UInt32, hi::UInt32)::UInt64 = (UInt64(hi) << 32) | UInt64(lo) +@inline _u64_from_u32s(lo::UInt32, hi::UInt32)::UInt64 = (UInt64(hi) << 32) | UInt64(lo) # leftmost 32 bits of a*b cast to UInt64s @inline _mulhi_u32(a::UInt32, b::UInt32)::UInt32 = UInt32((UInt64(a) * UInt64(b)) >> 32) - -@inline function _rotl32(x::UInt32, r::UInt32)::UInt32 - return (x << r) | (x >> (UInt32(32) - r)) -end +# 32-bit rotate left by r positions +@inline _rotl32(x::UInt32, r::UInt32)::UInt32 = (x << r) | (x >> (UInt32(32) - r)) @inline _counter_from_index(i)::UInt64 = UInt64(i - one(i)) -""" - ALLOWED_RAND_SCALARS - -Internal scalar eltypes currently supported by [`rand!`](@ref). -""" +# Internal scalar eltypes currently supported by rand!. const ALLOWED_RAND_SCALARS = Union{ UInt32, UInt64, Int32, Int64, @@ -43,6 +37,10 @@ const ALLOWED_RAND_SCALARS = Union{ @inline from_uint(::Type{Float64}, u::UInt64)::Float64 = uint64_to_unit_float64(u) +#= +Every RNG algorithm implements rand_uint(rng, counter, UInt32/UInt64). +This fallback provides a clear failure for unsupported RNG types. +=# @inline function rand_uint( rng::AbstractCounterRNG, ::UInt64, @@ -52,6 +50,12 @@ const ALLOWED_RAND_SCALARS = Union{ end +#= +Shared scalar generation: +1) map requested scalar type to corresponding raw UInt width +2) fill the UInt with random bits +3) convert bits into requested scalar representation +=# @inline function rand_scalar( rng::AbstractCounterRNG, counter::UInt64, @@ -74,11 +78,7 @@ end -""" - uint32_to_unit_float32(u::UInt32) -> Float32 - -Convert a random `UInt32` to `Float32` in `[0, 1)` by mantissa construction. -""" +# Convert random UInt32 bits to Float32 in [0, 1) by mantissa construction. 
@inline function uint32_to_unit_float32(u::UInt32)::Float32 # Keep 23 random bits for the mantissa (drop 9 rightmost bits from the UInt32) # and combine with the bit pattern of 1.0f0 (sign=0, exponent=127). @@ -89,11 +89,7 @@ Convert a random `UInt32` to `Float32` in `[0, 1)` by mantissa construction. end -""" - uint64_to_unit_float64(u::UInt64) -> Float64 - -Convert a random `UInt64` to `Float64` in `[0, 1)` by mantissa construction. -""" +# Convert random UInt64 bits to Float64 in [0, 1) by mantissa construction. @inline function uint64_to_unit_float64(u::UInt64)::Float64 # Keep 52 random bits for the mantissa (drop 12 rightmost bits from the UInt64) # and combine with the bit pattern of 1.0 (sign=0, exponent=1023). diff --git a/test/rand.jl b/test/rand.jl index 339a2cb..e3f0bb6 100644 --- a/test/rand.jl +++ b/test/rand.jl @@ -1,218 +1,214 @@ -function _is_unit_interval(v) - for x in v - if isnan(x) || x < zero(x) || x >= one(x) - return false - end - end - return true -end +const RAND_ALGS = (AK.SplitMix64(), AK.Philox(), AK.Threefry()) +const RAND_SCALAR_TYPES = (UInt32, UInt64, Int32, Int64, Float32, Float64) + + +_is_unit_interval(v) = all(x -> !isnan(x) && zero(x) <= x < one(x), v) + function _rand_fill_reference!(rng, x::AbstractArray{T}) where {T <: AK.ALLOWED_RAND_SCALARS} @inbounds for i in eachindex(x) - counter = UInt64(i - one(i)) - x[i] = AK.rand_scalar(rng, counter, T) + x[i] = AK.rand_scalar(rng, UInt64(i - one(i)), T) end return x end -@testset "rand" begin - @test AK.CounterRNG(0x1; alg=AK.SplitMix64()) isa AK.CounterRNG{AK.SplitMix64} - @test AK.CounterRNG(UInt32(0x1); alg=AK.Philox()) isa AK.CounterRNG{AK.Philox} - @test AK.CounterRNG(UInt16(123); alg=AK.Threefry()) isa AK.CounterRNG{AK.Threefry} - @test AK.CounterRNG(UInt32(300)).seed == UInt64(300) - @test_throws ArgumentError AK.CounterRNG(-1) - - Random.seed!(0x1234) - expected_auto_seed = rand(UInt64) - Random.seed!(0x1234) - rng_auto = AK.CounterRNG() - @test rng_auto.seed == 
expected_auto_seed - @test rng_auto.alg isa AK.SplitMix64 - - xauto1 = array_from_host(zeros(Float32, 1024)) - xauto2 = array_from_host(zeros(Float32, 1024)) - AK.rand!(rng_auto, xauto1; prefer_threads, block_size=64) - AK.rand!(rng_auto, xauto2; prefer_threads, block_size=257) - @test Array(xauto1) == Array(xauto2) - - Random.seed!(0xabcdef) - seed1 = rand(UInt64) - seed2 = rand(UInt64) - ref1 = array_from_host(zeros(Float32, 1024)) - ref2 = array_from_host(zeros(Float32, 1024)) - AK.rand!(AK.CounterRNG(seed1; alg=AK.SplitMix64()), ref1; prefer_threads, block_size=64) - AK.rand!(AK.CounterRNG(seed2; alg=AK.SplitMix64()), ref2; prefer_threads, block_size=64) - - Random.seed!(0xabcdef) - xconv1 = array_from_host(zeros(Float32, 1024)) - xconv2 = array_from_host(zeros(Float32, 1024)) - AK.rand!(xconv1; prefer_threads, block_size=64) - AK.rand!(xconv2; prefer_threads, block_size=64) - @test Array(xconv1) == Array(ref1) - @test Array(xconv2) == Array(ref2) - - rng_algs = (AK.SplitMix64(), AK.Philox(), AK.Threefry()) - scalar_types = (UInt32, UInt64, Int32, Int64, Float32, Float64) - - for alg in rng_algs - rng_alg = AK.CounterRNG(0x123456789abcdef; alg) - for U in (UInt32, UInt64) - @test AK.rand_uint(rng_alg, UInt64(0), U) == AK.rand_uint(rng_alg, UInt64(0), U) - @test AK.rand_uint(rng_alg, UInt64(1), U) != AK.rand_uint(rng_alg, UInt64(0), U) - - vals_alg = [AK.rand_uint(rng_alg, UInt64(i), U) for i in 0:1023] - @test length(unique(vals_alg)) > 900 - end - for T in scalar_types - x_alg = array_from_host(zeros(T, 2048)) - AK.rand!(rng_alg, x_alg; prefer_threads, block_size=64) - x_alg_h = Array(x_alg) - ref_alg = zeros(T, 2048) - _rand_fill_reference!(rng_alg, ref_alg) - @test x_alg_h == ref_alg - if T <: AbstractFloat - @test _is_unit_interval(x_alg_h) - end - end - end +function _assert_rand_matches_reference!(rng, x; kwargs...) + AK.rand!(rng, x; kwargs...) 
+ ref = zeros(eltype(x), size(x)) + _rand_fill_reference!(rng, ref) + @test Array(x) == ref + return x +end - rng = AK.CounterRNG(0x123456789abcdef) - - @test AK.from_uint(UInt32, UInt32(0xdeadbeef)) == UInt32(0xdeadbeef) - @test AK.from_uint(UInt64, UInt64(0x0123456789abcdef)) == UInt64(0x0123456789abcdef) - @test AK.from_uint(Int32, UInt32(0xdeadbeef)) == reinterpret(Int32, UInt32(0xdeadbeef)) - @test AK.from_uint(Int64, UInt64(0x0123456789abcdef)) == reinterpret(Int64, UInt64(0x0123456789abcdef)) - - @test AK.rand_uint(rng, UInt64(0), UInt32) == AK.rand_uint(rng, UInt64(0), UInt32) - @test AK.rand_uint(rng, UInt64(1), UInt32) != AK.rand_uint(rng, UInt64(0), UInt32) - @test AK.rand_uint(rng, UInt64(17), UInt32) != AK.rand_uint(rng, UInt64(18), UInt32) - @test AK.rand_uint(rng, UInt64(0), UInt64) == AK.rand_uint(rng, UInt64(0), UInt64) - @test AK.rand_uint(rng, UInt64(1), UInt64) != AK.rand_uint(rng, UInt64(0), UInt64) - @test AK.rand_uint(rng, UInt64(17), UInt64) != AK.rand_uint(rng, UInt64(18), UInt64) - - vals_u32 = [AK.rand_uint(rng, UInt64(i), UInt32) for i in 0:2047] - vals_u64 = [AK.rand_uint(rng, UInt64(i), UInt64) for i in 0:2047] - @test length(unique(vals_u32)) > 1800 - @test length(unique(vals_u64)) > 2000 - - for u in ( - UInt32(0x00000000), - UInt32(0x00000001), - UInt32(0x7fffffff), - UInt32(0x80000000), - UInt32(0xffffffff), - ) - x = AK.uint32_to_unit_float32(u) - @test !isnan(x) - @test 0.0f0 <= x < 1.0f0 + +@testset "rand" begin + @testset "constructors" begin + @test AK.CounterRNG(0x1; alg=AK.SplitMix64()) isa AK.CounterRNG{AK.SplitMix64} + @test AK.CounterRNG(UInt32(0x1); alg=AK.Philox()) isa AK.CounterRNG{AK.Philox} + @test AK.CounterRNG(UInt16(123); alg=AK.Threefry()) isa AK.CounterRNG{AK.Threefry} + @test AK.CounterRNG(UInt32(300)).seed == UInt64(300) + @test_throws ArgumentError AK.CounterRNG(-1) + + Random.seed!(0x1234) + expected_seed = rand(UInt64) + Random.seed!(0x1234) + rng_auto = AK.CounterRNG() + @test rng_auto.seed == 
expected_seed + @test rng_auto.alg isa AK.Philox + + x1 = array_from_host(zeros(Float32, 1024)) + x2 = array_from_host(zeros(Float32, 1024)) + AK.rand!(rng_auto, x1; prefer_threads, block_size=64) + AK.rand!(rng_auto, x2; prefer_threads, block_size=257) + @test Array(x1) == Array(x2) end - for u in ( - UInt64(0x0000000000000000), - UInt64(0x0000000000000001), - UInt64(0x7fffffffffffffff), - UInt64(0x8000000000000000), - UInt64(0xffffffffffffffff), - ) - x = AK.uint64_to_unit_float64(u) - @test !isnan(x) - @test 0.0 <= x < 1.0 + + @testset "bit helpers" begin + hi = UInt32(0b10101010101010101010101010101010) + lo = UInt32(0b01010101010101010101010101010101) + word = UInt64(hi) << 32 | UInt64(lo) + + @test AK._u32_hi(word) == hi + @test AK._u32_lo(word) == lo + @test AK._u64_from_u32s(lo, hi) == word + @test AK._mulhi_u32(0xffffffff % UInt32, 0xffffffff % UInt32) == 0xfffffffe % UInt32 + @test AK._rotl32(0b10000000000000000000000000000001 % UInt32, UInt32(1)) == 0b11 % UInt32 + @test AK._counter_from_index(1) == UInt64(0) + @test AK._counter_from_index(17) == UInt64(16) + + @test AK.raw_uint_type(UInt32) === UInt32 + @test AK.raw_uint_type(Int32) === UInt32 + @test AK.raw_uint_type(Float32) === UInt32 + @test AK.raw_uint_type(UInt64) === UInt64 + @test AK.raw_uint_type(Int64) === UInt64 + @test AK.raw_uint_type(Float64) === UInt64 + + @test AK.from_uint(UInt32, 0b1010 % UInt32) == 0b1010 % UInt32 + @test AK.from_uint(UInt64, 0b1010 % UInt64) == 0b1010 % UInt64 + @test AK.from_uint(Int32, 0b11111111111111111111111111111111 % UInt32) == Int32(-1) + @test AK.from_uint( + Int64, 0b1111111111111111111111111111111111111111111111111111111111111111 % UInt64 + ) == Int64(-1) + + @test AK.uint32_to_unit_float32(UInt32(0)) == 0.0f0 + @test AK.uint64_to_unit_float64(UInt64(0)) == 0.0 + @test 0.0f0 <= AK.uint32_to_unit_float32(typemax(UInt32)) < 1.0f0 + @test 0.0 <= AK.uint64_to_unit_float64(typemax(UInt64)) < 1.0 end - for T in scalar_types - s0 = AK.rand_scalar(rng, UInt64(0), 
T) - s1 = AK.rand_scalar(rng, UInt64(1), T) - @test s0 isa T - @test s1 isa T - @test s0 != s1 - if T <: AbstractFloat - @test zero(T) <= s0 < one(T) - @test zero(T) <= s1 < one(T) + + @testset "rand_uint" begin + for alg in RAND_ALGS + rng = AK.CounterRNG(0x123456789abcdef; alg) + for U in (UInt32, UInt64) + @test AK.rand_uint(rng, UInt64(0), U) == AK.rand_uint(rng, UInt64(0), U) + @test AK.rand_uint(rng, UInt64(1), U) != AK.rand_uint(rng, UInt64(0), U) + + vals = [AK.rand_uint(rng, UInt64(i), U) for i in 0:511] + @test length(unique(vals)) > 460 + end + end + + rng_splitmix = AK.CounterRNG(0x31415926; alg=AK.SplitMix64()) + for c in (UInt64(0), UInt64(1), UInt64(17), UInt64(1023)) + @test AK.rand_uint(rng_splitmix, c, UInt32) == AK._u32_hi( + AK.rand_uint(rng_splitmix, c, UInt64) + ) end - end - @test_throws ArgumentError AK.rand_scalar(rng, UInt64(0), UInt16) + for alg in (AK.Philox(), AK.Threefry()) + rng = AK.CounterRNG(0xabcdef1234567890; alg) + for c in (UInt64(0), UInt64(1), UInt64(17), UInt64(1023)) + @test AK._u32_lo(AK.rand_uint(rng, c, UInt64)) == AK.rand_uint(rng, c, UInt32) + end + end + end - lengths = (0, 1, 31, 32, 33, 1024, 1025) - for T in scalar_types - for len in lengths - x = array_from_host(zeros(T, len)) - AK.rand!(rng, x; prefer_threads, block_size=64) - xh = Array(x) - ref = zeros(T, len) - _rand_fill_reference!(rng, ref) + @testset "rand_scalar" begin + rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox()) - @test xh == ref + for T in RAND_SCALAR_TYPES + s0 = AK.rand_scalar(rng, UInt64(0), T) + s1 = AK.rand_scalar(rng, UInt64(1), T) + @test s0 isa T + @test s1 isa T + @test s0 != s1 if T <: AbstractFloat - @test _is_unit_interval(xh) + @test zero(T) <= s0 < one(T) + @test zero(T) <= s1 < one(T) end end + + c = UInt64(42) + @test AK.rand_scalar(rng, c, Int32) == reinterpret(Int32, AK.rand_uint(rng, c, UInt32)) + @test AK.rand_scalar(rng, c, Int64) == reinterpret(Int64, AK.rand_uint(rng, c, UInt64)) + @test AK.rand_scalar(rng, c, Float32) 
== AK.uint32_to_unit_float32( + AK.rand_uint(rng, c, UInt32) + ) + @test AK.rand_scalar(rng, c, Float64) == AK.uint64_to_unit_float64( + AK.rand_uint(rng, c, UInt64) + ) + @test_throws ArgumentError AK.rand_scalar(rng, UInt64(0), UInt16) end - rng2 = AK.CounterRNG(rng.seed + UInt64(1)) - for T in scalar_types - x1 = array_from_host(zeros(T, 4096)) - x2 = array_from_host(zeros(T, 4096)) - AK.rand!(rng, x1; prefer_threads, block_size=64) - AK.rand!(rng, x2; prefer_threads, block_size=257) - @test Array(x1) == Array(x2) - x3 = array_from_host(zeros(T, 4096)) - AK.rand!(rng2, x3; prefer_threads, block_size=64) - @test Array(x3) != Array(x1) + @testset "rand! explicit rng" begin + lengths = (0, 1, 31, 32, 33, 257, 1024) + rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox()) - xnd = array_from_host(zeros(T, 7, 11, 5)) - AK.rand!(rng, xnd; prefer_threads, block_size=128) - xndh = Array(xnd) - refnd = zeros(T, 7, 11, 5) - _rand_fill_reference!(rng, refnd) - @test xndh == refnd - end + for T in RAND_SCALAR_TYPES + for len in lengths + x = array_from_host(zeros(T, len)) + _assert_rand_matches_reference!(rng, x; prefer_threads, block_size=64) + if T <: AbstractFloat + @test _is_unit_interval(Array(x)) + end + end + end - if IS_CPU_BACKEND - for T in scalar_types - base = zeros(T, 64) - view_x = @view base[2:2:end] - AK.rand!( - rng, - view_x; - max_tasks=Threads.nthreads(), - min_elems=1, - prefer_threads=true, - ) + for T in RAND_SCALAR_TYPES + x1 = array_from_host(zeros(T, 2048)) + x2 = array_from_host(zeros(T, 2048)) + AK.rand!(rng, x1; prefer_threads, block_size=64) + AK.rand!(rng, x2; prefer_threads, block_size=257) + @test Array(x1) == Array(x2) + end + + rng2 = AK.CounterRNG(rng.seed + UInt64(1); alg=rng.alg) + for T in RAND_SCALAR_TYPES + x1 = array_from_host(zeros(T, 2048)) + x2 = array_from_host(zeros(T, 2048)) + AK.rand!(rng, x1; prefer_threads, block_size=64) + AK.rand!(rng2, x2; prefer_threads, block_size=64) + @test Array(x1) != Array(x2) + end - ref_view = 
zeros(T, length(view_x)) - _rand_fill_reference!(rng, ref_view) - @test collect(view_x) == ref_view + for T in (Float32, UInt64) + xnd = array_from_host(zeros(T, 7, 11, 5)) + _assert_rand_matches_reference!(rng, xnd; prefer_threads, block_size=128) + end + + if IS_CPU_BACKEND + for T in RAND_SCALAR_TYPES + base = zeros(T, 64) + view_x = @view base[2:2:end] + AK.rand!( + rng, view_x; + max_tasks=Threads.nthreads(), + min_elems=1, + prefer_threads=true + ) + ref_view = zeros(T, length(view_x)) + _rand_fill_reference!(rng, ref_view) + @test collect(view_x) == ref_view + end end end - nstats = 200_000 - xstats = array_from_host(zeros(Float32, nstats)) - AK.rand!(rng, xstats; prefer_threads, block_size=256) - xh = Array(xstats) - @test _is_unit_interval(xh) + @testset "rand! convenience" begin + Random.seed!(0xabcdef) + seed1 = rand(UInt64) + seed2 = rand(UInt64) - m = sum(xh) / nstats - v = sum((x - m)^2 for x in xh) / nstats - @test abs(m - 0.5) < 0.01 - @test abs(v - (1 / 12)) < 0.01 + ref1 = array_from_host(zeros(Float32, 1024)) + ref2 = array_from_host(zeros(Float32, 1024)) + AK.rand!(AK.CounterRNG(seed1; alg=AK.Philox()), ref1; prefer_threads, block_size=64) + AK.rand!(AK.CounterRNG(seed2; alg=AK.Philox()), ref2; prefer_threads, block_size=64) - nbins = 16 - counts = zeros(Int, nbins) - for x in xh - ibin = Int(floor(x * nbins)) + 1 - ibin = min(ibin, nbins) - counts[ibin] += 1 - end - expected = nstats / nbins - max_rel_dev = maximum(abs(c - expected) / expected for c in counts) - @test max_rel_dev < 0.1 + Random.seed!(0xabcdef) + x1 = array_from_host(zeros(Float32, 1024)) + x2 = array_from_host(zeros(Float32, 1024)) + AK.rand!(x1; prefer_threads, block_size=64) + AK.rand!(x2; prefer_threads, block_size=64) + @test Array(x1) == Array(ref1) + @test Array(x2) == Array(ref2) - x16 = array_from_host(zeros(UInt16, 16)) - @test_throws ArgumentError AK.rand!(x16; prefer_threads) - @test_throws ArgumentError AK.rand!(rng, x16; prefer_threads) + x_bad = 
array_from_host(zeros(UInt16, 16)) + @test_throws ArgumentError AK.rand!(x_bad; prefer_threads) + @test_throws ArgumentError AK.rand!(AK.CounterRNG(0x1), x_bad; prefer_threads) + end end From 65caf10021c371500fc785e927da4ef00ee645f8 Mon Sep 17 00:00:00 2001 From: fjbarter Date: Tue, 17 Mar 2026 00:16:49 +0000 Subject: [PATCH 05/18] ensure deteminism in convenience tests by avoiding race causing intermittent test failure. fence Float64 tests to CPU-only to avoid breaking CI on Metal and oneAPI --- Project.toml | 4 +++- src/AcceleratedKernels.jl | 1 + src/rand/rand.jl | 16 ++++++------- test/rand.jl | 48 +++++++++++++++++++++++---------------- 4 files changed, 41 insertions(+), 28 deletions(-) diff --git a/Project.toml b/Project.toml index 2fccea8..60ddf3a 100644 --- a/Project.toml +++ b/Project.toml @@ -1,13 +1,14 @@ name = "AcceleratedKernels" uuid = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" -authors = ["Andrei-Leonard Nicusan and contributors"] version = "0.4.3" +authors = ["Andrei-Leonard Nicusan and contributors"] [deps] ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197" GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f" [weakdeps] @@ -21,6 +22,7 @@ ArgCheck = "2" GPUArraysCore = "0.2.0" KernelAbstractions = "0.9.34" Markdown = "1" +Random = "1.11.0" UnsafeAtomics = "0.3.0" julia = "1.10" oneAPI = "1, 2" diff --git a/src/AcceleratedKernels.jl b/src/AcceleratedKernels.jl index 06a401a..7262cbe 100644 --- a/src/AcceleratedKernels.jl +++ b/src/AcceleratedKernels.jl @@ -15,6 +15,7 @@ using ArgCheck: @argcheck using GPUArraysCore: AnyGPUArray, @allowscalar using KernelAbstractions using KernelAbstractions: @context +using Random import UnsafeAtomics diff --git a/src/rand/rand.jl b/src/rand/rand.jl index e62ba96..20a2fe8 100644 --- a/src/rand/rand.jl +++ 
b/src/rand/rand.jl @@ -27,8 +27,8 @@ The default algorithm is `Philox()`. Constructors: - `CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox())` Uses an explicit non-negative seed. -- `CounterRNG(; alg::CounterRNGAlgorithm=SplitMix64())` - Auto-seeds once using `rand(UInt64)`. Reusing the same `CounterRNG` instance is deterministic +- `CounterRNG(; alg::CounterRNGAlgorithm=Philox())` + Auto-seeds once using `Random.rand(Random.default_rng(), UInt64)`. Reusing the same `CounterRNG` instance is deterministic for fixed seed, algorithm, array shape, and eltype. """ struct CounterRNG{A <: CounterRNGAlgorithm} <: AbstractCounterRNG @@ -44,7 +44,7 @@ end function CounterRNG(; alg::CounterRNGAlgorithm=Philox()) - CounterRNG(Base.rand(UInt64); alg) + CounterRNG(Random.rand(Random.default_rng(), UInt64); alg) end @@ -58,6 +58,9 @@ include("splitmix64.jl") include("philox.jl") include("threefry.jl") + + + """ rand!( rng::AbstractCounterRNG, @@ -83,11 +86,6 @@ Supported scalar element types are: - `Int32`, `Int64` - `Float32`, `Float64` -Semantics: -- Unsigned integers: raw random bit patterns of requested width. -- Signed integers: corresponding unsigned patterns reinterpreted as signed. -- Floats: mantissa-based conversion from `UInt32`/`UInt64` into `[0, 1)`, uniform over the - produced mantissa grid (not over all representable floats). """ function rand!( rng::AbstractCounterRNG, @@ -115,6 +113,8 @@ function rand!( end return x end + + function rand!( x::AbstractArray, args...; diff --git a/test/rand.jl b/test/rand.jl index e3f0bb6..1084637 100644 --- a/test/rand.jl +++ b/test/rand.jl @@ -1,5 +1,9 @@ const RAND_ALGS = (AK.SplitMix64(), AK.Philox(), AK.Threefry()) -const RAND_SCALAR_TYPES = (UInt32, UInt64, Int32, Int64, Float32, Float64) +const RAND_SCALAR_TYPES_ALL = (UInt32, UInt64, Int32, Int64, Float32, Float64) +const RAND_SCALAR_TYPES_BACKEND = IS_CPU_BACKEND ? 
+ RAND_SCALAR_TYPES_ALL : + (UInt32, UInt64, Int32, Int64, Float32) +const RUN_FLOAT64_RAND_TESTS = IS_CPU_BACKEND _is_unit_interval(v) = all(x -> !isnan(x) && zero(x) <= x < one(x), v) @@ -31,7 +35,7 @@ end @test_throws ArgumentError AK.CounterRNG(-1) Random.seed!(0x1234) - expected_seed = rand(UInt64) + expected_seed = Random.rand(Random.default_rng(), UInt64) Random.seed!(0x1234) rng_auto = AK.CounterRNG() @test rng_auto.seed == expected_seed @@ -63,7 +67,9 @@ end @test AK.raw_uint_type(Float32) === UInt32 @test AK.raw_uint_type(UInt64) === UInt64 @test AK.raw_uint_type(Int64) === UInt64 - @test AK.raw_uint_type(Float64) === UInt64 + if RUN_FLOAT64_RAND_TESTS + @test AK.raw_uint_type(Float64) === UInt64 + end @test AK.from_uint(UInt32, 0b1010 % UInt32) == 0b1010 % UInt32 @test AK.from_uint(UInt64, 0b1010 % UInt64) == 0b1010 % UInt64 @@ -73,9 +79,11 @@ end ) == Int64(-1) @test AK.uint32_to_unit_float32(UInt32(0)) == 0.0f0 - @test AK.uint64_to_unit_float64(UInt64(0)) == 0.0 @test 0.0f0 <= AK.uint32_to_unit_float32(typemax(UInt32)) < 1.0f0 - @test 0.0 <= AK.uint64_to_unit_float64(typemax(UInt64)) < 1.0 + if RUN_FLOAT64_RAND_TESTS + @test AK.uint64_to_unit_float64(UInt64(0)) == 0.0 + @test 0.0 <= AK.uint64_to_unit_float64(typemax(UInt64)) < 1.0 + end end @@ -110,7 +118,7 @@ end @testset "rand_scalar" begin rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox()) - for T in RAND_SCALAR_TYPES + for T in RAND_SCALAR_TYPES_BACKEND s0 = AK.rand_scalar(rng, UInt64(0), T) s1 = AK.rand_scalar(rng, UInt64(1), T) @test s0 isa T @@ -128,9 +136,11 @@ end @test AK.rand_scalar(rng, c, Float32) == AK.uint32_to_unit_float32( AK.rand_uint(rng, c, UInt32) ) - @test AK.rand_scalar(rng, c, Float64) == AK.uint64_to_unit_float64( - AK.rand_uint(rng, c, UInt64) - ) + if RUN_FLOAT64_RAND_TESTS + @test AK.rand_scalar(rng, c, Float64) == AK.uint64_to_unit_float64( + AK.rand_uint(rng, c, UInt64) + ) + end @test_throws ArgumentError AK.rand_scalar(rng, UInt64(0), UInt16) end @@ -139,7 +149,7 @@ 
end lengths = (0, 1, 31, 32, 33, 257, 1024) rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox()) - for T in RAND_SCALAR_TYPES + for T in RAND_SCALAR_TYPES_BACKEND for len in lengths x = array_from_host(zeros(T, len)) _assert_rand_matches_reference!(rng, x; prefer_threads, block_size=64) @@ -149,7 +159,7 @@ end end end - for T in RAND_SCALAR_TYPES + for T in RAND_SCALAR_TYPES_BACKEND x1 = array_from_host(zeros(T, 2048)) x2 = array_from_host(zeros(T, 2048)) AK.rand!(rng, x1; prefer_threads, block_size=64) @@ -158,7 +168,7 @@ end end rng2 = AK.CounterRNG(rng.seed + UInt64(1); alg=rng.alg) - for T in RAND_SCALAR_TYPES + for T in RAND_SCALAR_TYPES_BACKEND x1 = array_from_host(zeros(T, 2048)) x2 = array_from_host(zeros(T, 2048)) AK.rand!(rng, x1; prefer_threads, block_size=64) @@ -172,7 +182,7 @@ end end if IS_CPU_BACKEND - for T in RAND_SCALAR_TYPES + for T in RAND_SCALAR_TYPES_BACKEND base = zeros(T, 64) view_x = @view base[2:2:end] AK.rand!( @@ -190,18 +200,18 @@ end @testset "rand! convenience" begin - Random.seed!(0xabcdef) - seed1 = rand(UInt64) - seed2 = rand(UInt64) - ref1 = array_from_host(zeros(Float32, 1024)) ref2 = array_from_host(zeros(Float32, 1024)) + x1 = array_from_host(zeros(Float32, 1024)) + x2 = array_from_host(zeros(Float32, 1024)) + + Random.seed!(0xabcdef) + seed1 = Random.rand(Random.default_rng(), UInt64) AK.rand!(AK.CounterRNG(seed1; alg=AK.Philox()), ref1; prefer_threads, block_size=64) + seed2 = Random.rand(Random.default_rng(), UInt64) AK.rand!(AK.CounterRNG(seed2; alg=AK.Philox()), ref2; prefer_threads, block_size=64) Random.seed!(0xabcdef) - x1 = array_from_host(zeros(Float32, 1024)) - x2 = array_from_host(zeros(Float32, 1024)) AK.rand!(x1; prefer_threads, block_size=64) AK.rand!(x2; prefer_threads, block_size=64) @test Array(x1) == Array(ref1) From 3d1e0c05b2e6c5f6ddc2008ba3de5e1c6de37453 Mon Sep 17 00:00:00 2001 From: fjbarter Date: Tue, 17 Mar 2026 00:25:28 +0000 Subject: [PATCH 06/18] fix Random dep for use with julia v1.10 --- 
Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 60ddf3a..4a3a6b3 100644 --- a/Project.toml +++ b/Project.toml @@ -22,7 +22,7 @@ ArgCheck = "2" GPUArraysCore = "0.2.0" KernelAbstractions = "0.9.34" Markdown = "1" -Random = "1.11.0" +Random = "1" UnsafeAtomics = "0.3.0" julia = "1.10" oneAPI = "1, 2" From 48cd247b86a55b5fc4c896c151f6bb3f128e47d2 Mon Sep 17 00:00:00 2001 From: fjbarter Date: Tue, 17 Mar 2026 21:28:32 +0000 Subject: [PATCH 07/18] add Bool scalar support with isodd() on a UInt32 --- docs/src/api/rand.md | 6 ++++-- src/rand/rand.jl | 8 ++++++++ src/rand/utilities.jl | 5 ++++- test/rand.jl | 17 +++++++++++++---- 4 files changed, 29 insertions(+), 7 deletions(-) diff --git a/docs/src/api/rand.md b/docs/src/api/rand.md index 2fa468f..d9ffc71 100644 --- a/docs/src/api/rand.md +++ b/docs/src/api/rand.md @@ -4,19 +4,21 @@ Counter-based random generation for CPU and GPU backends with deterministic beha `seed`, algorithm, array shape, and eltype. Use an explicit `CounterRNG(seed; alg=...)` when reproducibility matters. For convenience, -`AK.rand!(x)` creates a fresh `CounterRNG()` on each call using one auto-seeded `Base.rand(UInt64)` -draw, so repeated calls usually produce different outputs. +`AK.rand!(x)` creates a fresh `CounterRNG()` on each call using one auto-seeded +`Base.rand(Random.default_rng(), UInt64)` draw, so repeated calls produce different outputs unless Random.seed!() is used. Supported output element types: - `UInt32`, `UInt64` - `Int32`, `Int64` - `Float32`, `Float64` +- `Bool` The core of the random number generation produces a `UInt` of the requested scalar width. That `UInt` is then either: - Unsigned integers: returned as-is - Signed integers: reinterpreted as a signed integer bit pattern. - Floats: mantissa construction into a uniform grid in `[0, 1)` ([read more](https://lomont.org/posts/2017/unit-random/)). 
+- Bool: `true` if the `UInt` draw is odd (`isodd(u)`), otherwise `false`. Algorithms currently available: - `SplitMix64` ([read more](https://rosettacode.org/wiki/Pseudo-random_numbers/Splitmix64)) diff --git a/src/rand/rand.jl b/src/rand/rand.jl index 20a2fe8..fdd7076 100644 --- a/src/rand/rand.jl +++ b/src/rand/rand.jl @@ -85,6 +85,14 @@ Supported scalar element types are: - `UInt32`, `UInt64` - `Int32`, `Int64` - `Float32`, `Float64` +- `Bool` + +Semantics: +- Unsigned integers: raw random bit patterns of requested width. +- Signed integers: corresponding unsigned patterns reinterpreted as signed. +- Floats: mantissa-based conversion from `UInt32`/`UInt64` into `[0, 1)`, uniform over the + produced mantissa grid (not over all representable floats). +- Bool: `true` if the raw `UInt` draw is odd (`isodd(u)`), otherwise `false`. """ function rand!( diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl index 2623c9d..c9f2cc5 100644 --- a/src/rand/utilities.jl +++ b/src/rand/utilities.jl @@ -18,6 +18,7 @@ const ALLOWED_RAND_SCALARS = Union{ UInt32, UInt64, Int32, Int64, Float32, Float64, + Bool } @@ -27,6 +28,7 @@ const ALLOWED_RAND_SCALARS = Union{ @inline raw_uint_type(::Type{UInt64}) = UInt64 @inline raw_uint_type(::Type{Int64}) = UInt64 @inline raw_uint_type(::Type{Float64}) = UInt64 +@inline raw_uint_type(::Type{Bool}) = UInt32 @inline from_uint(::Type{UInt32}, u::UInt32)::UInt32 = u @@ -35,6 +37,7 @@ const ALLOWED_RAND_SCALARS = Union{ @inline from_uint(::Type{Int64}, u::UInt64)::Int64 = reinterpret(Int64, u) @inline from_uint(::Type{Float32}, u::UInt32)::Float32 = uint32_to_unit_float32(u) @inline from_uint(::Type{Float64}, u::UInt64)::Float64 = uint64_to_unit_float64(u) +@inline from_uint(::Type{Bool}, u::UInt32)::Bool = isodd(u) #= @@ -71,7 +74,7 @@ end @inline function rand_scalar(::AbstractCounterRNG, ::UInt64, ::Type{T}) where {T} throw(ArgumentError( - "Unsupported random scalar type $(T). Supported: UInt32, UInt64, Int32, Int64, Float32, Float64." 
+ "Unsupported random scalar type $(T). Supported: $(ALLOWED_RAND_SCALARS)" )) end diff --git a/test/rand.jl b/test/rand.jl index 1084637..a9d0c46 100644 --- a/test/rand.jl +++ b/test/rand.jl @@ -1,8 +1,8 @@ const RAND_ALGS = (AK.SplitMix64(), AK.Philox(), AK.Threefry()) -const RAND_SCALAR_TYPES_ALL = (UInt32, UInt64, Int32, Int64, Float32, Float64) +const RAND_SCALAR_TYPES_ALL = (UInt32, UInt64, Int32, Int64, Float32, Float64, Bool) const RAND_SCALAR_TYPES_BACKEND = IS_CPU_BACKEND ? RAND_SCALAR_TYPES_ALL : - (UInt32, UInt64, Int32, Int64, Float32) + (UInt32, UInt64, Int32, Int64, Float32, Bool) const RUN_FLOAT64_RAND_TESTS = IS_CPU_BACKEND @@ -67,6 +67,7 @@ end @test AK.raw_uint_type(Float32) === UInt32 @test AK.raw_uint_type(UInt64) === UInt64 @test AK.raw_uint_type(Int64) === UInt64 + @test AK.raw_uint_type(Bool) === UInt32 if RUN_FLOAT64_RAND_TESTS @test AK.raw_uint_type(Float64) === UInt64 end @@ -77,6 +78,8 @@ end @test AK.from_uint( Int64, 0b1111111111111111111111111111111111111111111111111111111111111111 % UInt64 ) == Int64(-1) + @test AK.from_uint(Bool, UInt32(0)) == false + @test AK.from_uint(Bool, UInt32(1)) == true @test AK.uint32_to_unit_float32(UInt32(0)) == 0.0f0 @test 0.0f0 <= AK.uint32_to_unit_float32(typemax(UInt32)) < 1.0f0 @@ -123,7 +126,9 @@ end s1 = AK.rand_scalar(rng, UInt64(1), T) @test s0 isa T @test s1 isa T - @test s0 != s1 + if T !== Bool + @test s0 != s1 + end if T <: AbstractFloat @test zero(T) <= s0 < one(T) @test zero(T) <= s1 < one(T) @@ -136,11 +141,15 @@ end @test AK.rand_scalar(rng, c, Float32) == AK.uint32_to_unit_float32( AK.rand_uint(rng, c, UInt32) ) + @test AK.rand_scalar(rng, c, Bool) == isodd(AK.rand_uint(rng, c, UInt32)) if RUN_FLOAT64_RAND_TESTS @test AK.rand_scalar(rng, c, Float64) == AK.uint64_to_unit_float64( AK.rand_uint(rng, c, UInt64) ) end + bools = [AK.rand_scalar(rng, UInt64(i), Bool) for i in 0:511] + @test any(identity, bools) + @test any(!, bools) @test_throws ArgumentError AK.rand_scalar(rng, UInt64(0), 
UInt16) end @@ -176,7 +185,7 @@ end @test Array(x1) != Array(x2) end - for T in (Float32, UInt64) + for T in (Float32, UInt64, Bool) xnd = array_from_host(zeros(T, 7, 11, 5)) _assert_rand_matches_reference!(rng, xnd; prefer_threads, block_size=128) end From 68e03c3b98f5a703a422a03e0ab2e2e88f1ec53c Mon Sep 17 00:00:00 2001 From: fjbarter Date: Tue, 17 Mar 2026 22:04:13 +0000 Subject: [PATCH 08/18] add support for Float16, UInt8, UInt16, Int8, Int16 (was bored) --- Project.toml | 2 +- docs/src/api/rand.md | 16 ++++++++-------- src/rand/rand.jl | 6 +++--- src/rand/utilities.jl | 27 ++++++++++++++++++++++++--- test/rand.jl | 40 ++++++++++++++++++++++++++++++++++------ 5 files changed, 70 insertions(+), 21 deletions(-) diff --git a/Project.toml b/Project.toml index 4a3a6b3..b0c601d 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "AcceleratedKernels" uuid = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" -version = "0.4.3" authors = ["Andrei-Leonard Nicusan and contributors"] +version = "0.4.3" [deps] ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197" diff --git a/docs/src/api/rand.md b/docs/src/api/rand.md index d9ffc71..93ed339 100644 --- a/docs/src/api/rand.md +++ b/docs/src/api/rand.md @@ -5,18 +5,18 @@ Counter-based random generation for CPU and GPU backends with deterministic beha Use an explicit `CounterRNG(seed; alg=...)` when reproducibility matters. For convenience, `AK.rand!(x)` creates a fresh `CounterRNG()` on each call using one auto-seeded -`Base.rand(Random.default_rng(), UInt64)` draw, so repeated calls produce different outputs unless Random.seed!() is used. +`Random.rand(Random.default_rng(), UInt64)` draw, so repeated calls produce different outputs unless Random.seed!() is used. 
-Supported output element types: -- `UInt32`, `UInt64` -- `Int32`, `Int64` -- `Float32`, `Float64` +Supported element types: +- `UInt8`, `UInt16`, `UInt32`, `UInt64` +- `Int8`, `Int16`, `Int32`, `Int64` +- `Float16`, `Float32`, `Float64` - `Bool` -The core of the random number generation produces a `UInt` of the requested scalar width. +The core of the random number generation produces either a `UInt32` or `UInt64` depending on the width of the requested element type. That `UInt` is then either: -- Unsigned integers: returned as-is -- Signed integers: reinterpreted as a signed integer bit pattern. +- Unsigned integers: returned as-is or truncated if necessary. +- Signed integers: reinterpreted as a signed integer bit pattern and truncated if necessary. - Floats: mantissa construction into a uniform grid in `[0, 1)` ([read more](https://lomont.org/posts/2017/unit-random/)). - Bool: `true` if the `UInt` draw is odd (`isodd(u)`), otherwise `false`. diff --git a/src/rand/rand.jl b/src/rand/rand.jl index fdd7076..dc48f03 100644 --- a/src/rand/rand.jl +++ b/src/rand/rand.jl @@ -82,9 +82,9 @@ Fill `x` in-place with pseudo-random values using a stateless counter-based RNG. counter is exactly `UInt64(i - 1)` in linear indexing order. Supported scalar element types are: -- `UInt32`, `UInt64` -- `Int32`, `Int64` -- `Float32`, `Float64` +- `UInt8`, `UInt16`, `UInt32`, `UInt64` +- `Int8`, `Int16`, `Int32`, `Int64` +- `Float16`, `Float32`, `Float64` - `Bool` Semantics: diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl index c9f2cc5..3d5dab7 100644 --- a/src/rand/utilities.jl +++ b/src/rand/utilities.jl @@ -15,15 +15,20 @@ # Internal scalar eltypes currently supported by rand!. 
const ALLOWED_RAND_SCALARS = Union{ - UInt32, UInt64, - Int32, Int64, - Float32, Float64, + UInt8, UInt16, UInt32, UInt64, + Int8, Int16, Int32, Int64, + Float16, Float32, Float64, Bool } +@inline raw_uint_type(::Type{UInt8}) = UInt32 +@inline raw_uint_type(::Type{UInt16}) = UInt32 @inline raw_uint_type(::Type{UInt32}) = UInt32 +@inline raw_uint_type(::Type{Int8}) = UInt32 +@inline raw_uint_type(::Type{Int16}) = UInt32 @inline raw_uint_type(::Type{Int32}) = UInt32 +@inline raw_uint_type(::Type{Float16}) = UInt32 @inline raw_uint_type(::Type{Float32}) = UInt32 @inline raw_uint_type(::Type{UInt64}) = UInt64 @inline raw_uint_type(::Type{Int64}) = UInt64 @@ -31,10 +36,15 @@ const ALLOWED_RAND_SCALARS = Union{ @inline raw_uint_type(::Type{Bool}) = UInt32 +@inline from_uint(::Type{UInt8}, u::UInt32)::UInt8 = trunc(UInt8, u >> 24) +@inline from_uint(::Type{UInt16}, u::UInt32)::UInt16 = trunc(UInt16, u >> 16) @inline from_uint(::Type{UInt32}, u::UInt32)::UInt32 = u @inline from_uint(::Type{UInt64}, u::UInt64)::UInt64 = u +@inline from_uint(::Type{Int8}, u::UInt32)::Int8 = reinterpret(Int8, trunc(UInt8, u >> 24)) +@inline from_uint(::Type{Int16}, u::UInt32)::Int16 = reinterpret(Int16, trunc(UInt16, u >> 16)) @inline from_uint(::Type{Int32}, u::UInt32)::Int32 = reinterpret(Int32, u) @inline from_uint(::Type{Int64}, u::UInt64)::Int64 = reinterpret(Int64, u) +@inline from_uint(::Type{Float16}, u::UInt32)::Float16 = uint32_to_unit_float16(u) @inline from_uint(::Type{Float32}, u::UInt32)::Float32 = uint32_to_unit_float32(u) @inline from_uint(::Type{Float64}, u::UInt64)::Float64 = uint64_to_unit_float64(u) @inline from_uint(::Type{Bool}, u::UInt32)::Bool = isodd(u) @@ -81,6 +91,17 @@ end +# Convert random UInt32 bits to Float16 in [0, 1) by mantissa construction. +@inline function uint32_to_unit_float16(u::UInt32)::Float16 + # Keep 10 random bits for the mantissa (drop 22 rightmost bits from the UInt32) + # and combine with the bit pattern of Float16(1.0) (sign=0, exponent=15). 
+ bits = UInt16(0x3c00) | UInt16(u >> 22) + + # Interpret as 1.mantissa, then subtract 1 for [0, 1) + return reinterpret(Float16, bits) - Float16(1) +end + + # Convert random UInt32 bits to Float32 in [0, 1) by mantissa construction. @inline function uint32_to_unit_float32(u::UInt32)::Float32 # Keep 23 random bits for the mantissa (drop 9 rightmost bits from the UInt32) diff --git a/test/rand.jl b/test/rand.jl index a9d0c46..9e03e59 100644 --- a/test/rand.jl +++ b/test/rand.jl @@ -1,8 +1,13 @@ const RAND_ALGS = (AK.SplitMix64(), AK.Philox(), AK.Threefry()) -const RAND_SCALAR_TYPES_ALL = (UInt32, UInt64, Int32, Int64, Float32, Float64, Bool) +const RAND_SCALAR_TYPES_ALL = ( + UInt8, UInt16, UInt32, UInt64, + Int8, Int16, Int32, Int64, + Float16, Float32, Float64, + Bool, +) const RAND_SCALAR_TYPES_BACKEND = IS_CPU_BACKEND ? RAND_SCALAR_TYPES_ALL : - (UInt32, UInt64, Int32, Int64, Float32, Bool) + (UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float16, Float32, Bool) const RUN_FLOAT64_RAND_TESTS = IS_CPU_BACKEND @@ -62,8 +67,13 @@ end @test AK._counter_from_index(1) == UInt64(0) @test AK._counter_from_index(17) == UInt64(16) + @test AK.raw_uint_type(UInt8) === UInt32 + @test AK.raw_uint_type(UInt16) === UInt32 @test AK.raw_uint_type(UInt32) === UInt32 + @test AK.raw_uint_type(Int8) === UInt32 + @test AK.raw_uint_type(Int16) === UInt32 @test AK.raw_uint_type(Int32) === UInt32 + @test AK.raw_uint_type(Float16) === UInt32 @test AK.raw_uint_type(Float32) === UInt32 @test AK.raw_uint_type(UInt64) === UInt64 @test AK.raw_uint_type(Int64) === UInt64 @@ -72,15 +82,22 @@ end @test AK.raw_uint_type(Float64) === UInt64 end + @test AK.from_uint(UInt8, UInt32(0xabcdef01)) == UInt8(0xab) + @test AK.from_uint(UInt16, UInt32(0xabcdef01)) == UInt16(0xabcd) @test AK.from_uint(UInt32, 0b1010 % UInt32) == 0b1010 % UInt32 @test AK.from_uint(UInt64, 0b1010 % UInt64) == 0b1010 % UInt64 + @test AK.from_uint(Int8, UInt32(0xff000000)) == Int8(-1) + @test AK.from_uint(Int16, 
UInt32(0xffff0000)) == Int16(-1) @test AK.from_uint(Int32, 0b11111111111111111111111111111111 % UInt32) == Int32(-1) @test AK.from_uint( Int64, 0b1111111111111111111111111111111111111111111111111111111111111111 % UInt64 ) == Int64(-1) + @test AK.from_uint(Float16, UInt32(0)) == Float16(0) @test AK.from_uint(Bool, UInt32(0)) == false @test AK.from_uint(Bool, UInt32(1)) == true + @test AK.uint32_to_unit_float16(UInt32(0)) == Float16(0) + @test Float16(0) <= AK.uint32_to_unit_float16(typemax(UInt32)) < Float16(1) @test AK.uint32_to_unit_float32(UInt32(0)) == 0.0f0 @test 0.0f0 <= AK.uint32_to_unit_float32(typemax(UInt32)) < 1.0f0 if RUN_FLOAT64_RAND_TESTS @@ -126,7 +143,7 @@ end s1 = AK.rand_scalar(rng, UInt64(1), T) @test s0 isa T @test s1 isa T - if T !== Bool + if !(T in (Bool, Float16, UInt8, UInt16, Int8, Int16)) @test s0 != s1 end if T <: AbstractFloat @@ -136,8 +153,19 @@ end end c = UInt64(42) + @test AK.rand_scalar(rng, c, UInt8) == trunc(UInt8, AK.rand_uint(rng, c, UInt32) >> 24) + @test AK.rand_scalar(rng, c, UInt16) == trunc(UInt16, AK.rand_uint(rng, c, UInt32) >> 16) + @test AK.rand_scalar( + rng, c, Int8 + ) == reinterpret(Int8, trunc(UInt8, AK.rand_uint(rng, c, UInt32) >> 24)) + @test AK.rand_scalar( + rng, c, Int16 + ) == reinterpret(Int16, trunc(UInt16, AK.rand_uint(rng, c, UInt32) >> 16)) @test AK.rand_scalar(rng, c, Int32) == reinterpret(Int32, AK.rand_uint(rng, c, UInt32)) @test AK.rand_scalar(rng, c, Int64) == reinterpret(Int64, AK.rand_uint(rng, c, UInt64)) + @test AK.rand_scalar(rng, c, Float16) == AK.uint32_to_unit_float16( + AK.rand_uint(rng, c, UInt32) + ) @test AK.rand_scalar(rng, c, Float32) == AK.uint32_to_unit_float32( AK.rand_uint(rng, c, UInt32) ) @@ -150,7 +178,7 @@ end bools = [AK.rand_scalar(rng, UInt64(i), Bool) for i in 0:511] @test any(identity, bools) @test any(!, bools) - @test_throws ArgumentError AK.rand_scalar(rng, UInt64(0), UInt16) + @test_throws ArgumentError AK.rand_scalar(rng, UInt64(0), UInt128) end @@ -185,7 +213,7 @@ 
end @test Array(x1) != Array(x2) end - for T in (Float32, UInt64, Bool) + for T in (Float16, Float32, UInt64, Bool) xnd = array_from_host(zeros(T, 7, 11, 5)) _assert_rand_matches_reference!(rng, xnd; prefer_threads, block_size=128) end @@ -226,7 +254,7 @@ end @test Array(x1) == Array(ref1) @test Array(x2) == Array(ref2) - x_bad = array_from_host(zeros(UInt16, 16)) + x_bad = zeros(UInt128, 16) @test_throws ArgumentError AK.rand!(x_bad; prefer_threads) @test_throws ArgumentError AK.rand!(AK.CounterRNG(0x1), x_bad; prefer_threads) end From 8677d84d4b15592a31cf5a3e73587fe7c77b454b Mon Sep 17 00:00:00 2001 From: fjbarter Date: Wed, 18 Mar 2026 16:29:00 +0000 Subject: [PATCH 09/18] Add more faithful seed-key mapping for Philox, and improve thoroughness of docs --- docs/src/api/rand.md | 20 ++++++++++++++++++-- src/rand/philox.jl | 4 +--- src/rand/rand.jl | 2 +- src/rand/{splitmix64.jl => splitmix.jl} | 6 ++++++ 4 files changed, 26 insertions(+), 6 deletions(-) rename src/rand/{splitmix64.jl => splitmix.jl} (82%) diff --git a/docs/src/api/rand.md b/docs/src/api/rand.md index 93ed339..997e407 100644 --- a/docs/src/api/rand.md +++ b/docs/src/api/rand.md @@ -25,8 +25,24 @@ Algorithms currently available: - `Philox` ([read more](https://www.thesalmons.org/john/random123/papers/random123sc11.pdf)) - `Threefry` ([read more](https://www.thesalmons.org/john/random123/papers/random123sc11.pdf)) -`Philox` is the default algorithm for `CounterRNG()`, as it is more thoroughly -statistically tested and measured on par with `CUDA.rand!` and `SplitMix64` at ~390 GB/s on an RTX +Statistical-testing note: +- Published/reference versions of `SplitMix64`, `Philox`, and `Threefry` are reported to pass + TestU01 BigCrush. +- These results refer to specific constructions and test setups; wrapper choices (such as + seed/key mapping conventions) can change bitwise output streams. +- These generators are not intended to be cryptographically secure. 
+ +Philox keying note: +- AK uses `Philox2x32` internally (one 32-bit Philox key word). +- Users can pass any non-negative `Integer` seed; AK normalises to `UInt64` then derives the + 32-bit Philox key via a SplitMix-based mapping. +- This is a deliberate wrapper choice for ease of use (simple `seed` API with deterministic + streams), not a change to the Philox round function itself. +- Therefore, AK Philox streams are deterministic and high-quality, but not guaranteed to be + bit-for-bit identical to a raw Random123 Philox stream unless the same seed-to-key mapping and + counter convention are used. + +`Philox` is the default algorithm for `CounterRNG()`, as it is very thorough and very fast; it has been measured on par with `CUDA.rand!` and `SplitMix64` at ~390 GB/s on an Nvidia GeForce RTX 5060 (advertised 448 GB/s), i.e. effectively memory-bound throughput. Examples: diff --git a/src/rand/philox.jl b/src/rand/philox.jl index bdebc8e..37149f1 100644 --- a/src/rand/philox.jl +++ b/src/rand/philox.jl @@ -24,9 +24,7 @@ end x0 = _u32_lo(counter) x1 = _u32_hi(counter) - seed = UInt64(rng.seed) - k0 = _u32_lo(seed) - x1 = xor(x1, _u32_hi(seed)) + k0 = splitmix32_from_u64(UInt64(rng.seed)) @inbounds for _ in 1:PHILOX_ROUNDS x0, x1 = _philox2x32_round(x0, x1, k0) diff --git a/src/rand/rand.jl b/src/rand/rand.jl index dc48f03..d1fcb42 100644 --- a/src/rand/rand.jl +++ b/src/rand/rand.jl @@ -54,7 +54,7 @@ end include("utilities.jl") # Algorithm-specific integer generators -include("splitmix64.jl") +include("splitmix.jl") include("philox.jl") include("threefry.jl") diff --git a/src/rand/splitmix64.jl b/src/rand/splitmix.jl similarity index 82% rename from src/rand/splitmix64.jl rename to src/rand/splitmix.jl index d169b73..8d7992b 100644 --- a/src/rand/splitmix64.jl +++ b/src/rand/splitmix.jl @@ -16,6 +16,12 @@ const SPLITMIX64_MIX_B = UInt64(0x94d049bb133111eb) end +# Derive a 32-bit seed word from a 64-bit seed using SplitMix64 mixing. 
+@inline function splitmix32_from_u64(seed::UInt64)::UInt32 + return _u32_hi(_splitmix64_mix(seed + SPLITMIX64_INCREMENT)) +end + + # Natural SplitMix64 output path: compute 64 random bits directly from one counter @inline function rand_uint( rng::CounterRNG{<:SplitMix64}, From d687e2a84ab7816422273aea86b2a59b626177d8 Mon Sep 17 00:00:00 2001 From: fjbarter Date: Mon, 23 Mar 2026 01:40:44 +0000 Subject: [PATCH 10/18] streaming rng by now including an offset, incremented by length(x) when AK.rand!(x) is called. This gives stream-like behaviour without any on-device state but still faster than CURAND. Add RNGTest in prototyping for BigCrush and SmallCrush (Philox, SM64, and Threefry all now confirmed to pass in the AK implementation) --- docs/src/api/rand.md | 52 ++++++-- prototype/RNGTest/Project.toml | 3 + prototype/RNGTest/README.md | 33 +++++ prototype/RNGTest/run_bigcrush.jl | 26 ++++ prototype/RNGTest/run_smallcrush.jl | 23 ++++ prototype/RNGTest/stream.jl | 80 ++++++++++++ prototype/rand/Project.toml | 1 + src/rand/philox.jl | 14 +- src/rand/rand.jl | 95 ++++++++++---- src/rand/splitmix.jl | 9 +- src/rand/threefry.jl | 13 +- src/rand/utilities.jl | 24 ++-- test/rand.jl | 195 ++++++++++++++++++++++------ 13 files changed, 459 insertions(+), 109 deletions(-) create mode 100644 prototype/RNGTest/Project.toml create mode 100644 prototype/RNGTest/README.md create mode 100644 prototype/RNGTest/run_bigcrush.jl create mode 100644 prototype/RNGTest/run_smallcrush.jl create mode 100644 prototype/RNGTest/stream.jl diff --git a/docs/src/api/rand.md b/docs/src/api/rand.md index 997e407..5c5596e 100644 --- a/docs/src/api/rand.md +++ b/docs/src/api/rand.md @@ -1,11 +1,35 @@ ### Random Number Generation -Counter-based random generation for CPU and GPU backends with deterministic behavior for fixed -`seed`, algorithm, array shape, and eltype. 
+Counter-based random generation for CPU and GPU backends with deterministic stream behavior for +fixed `seed`, algorithm, and call sequence. -Use an explicit `CounterRNG(seed; alg=...)` when reproducibility matters. For convenience, -`AK.rand!(x)` creates a fresh `CounterRNG()` on each call using one auto-seeded -`Random.rand(Random.default_rng(), UInt64)` draw, so repeated calls produce different outputs unless Random.seed!() is used. +`CounterRNG` carries an internal `offset` (starting at `0`) that advances by `length(v)` on each +`AK.rand!(rng, v)` call. This means chunked fills are stream-consistent: +- filling `100` then `100` elements yields the same `200` values as one `200`-element fill. +- calls that share the same `CounterRNG` instance concurrently are not thread-safe. +- call `AK.reset!(rng)` to rewind a mutable offset-bearing RNG back to offset `0x0`. + +`AK.rand!` also accepts custom `AbstractCounterRNG` implementations: +- if they have a mutable `offset` field, streaming advancement is applied +- if they have no `offset` field, each call behaves statelessly from counter `0` +- if they have an immutable `offset` field, that offset is used as a fixed start and is not advanced + +Use an explicit `CounterRNG` when reproducibility is required. For +convenience, +`AK.rand!(v)` creates a fresh `CounterRNG()` on each call using one auto-seeded +`Base.rand(UInt64)` draw, so repeated calls produce different outputs unless Random.seed!() is used. + +`AK.reset!(rng)` rewinds offset to `0x0` for mutable RNGs that have an `offset` field. + +Custom RNGs: +- Define an algorithm type `MyAlg <: AK.CounterRNGAlgorithm`. +- Define an RNG type `MyRNG <: AK.AbstractCounterRNG{MyAlg}` with fields `seed` and `alg`. +- Add a mutable `offset::UInt64` field if you want stream advancement across calls; omit it for stateless calls from counter `0`. 
+- Implement typed `rand_uint` methods: + - `AK.rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt32})::UInt32` + - `AK.rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt64})::UInt64` + +Both widths should be implemented so `AK.rand!` supports all integer/float output types without falling back or error. Supported element types: - `UInt8`, `UInt16`, `UInt32`, `UInt64` @@ -26,10 +50,7 @@ Algorithms currently available: - `Threefry` ([read more](https://www.thesalmons.org/john/random123/papers/random123sc11.pdf)) Statistical-testing note: -- Published/reference versions of `SplitMix64`, `Philox`, and `Threefry` are reported to pass - TestU01 BigCrush. -- These results refer to specific constructions and test setups; wrapper choices (such as - seed/key mapping conventions) can change bitwise output streams. +- In this repository, `SplitMix64`, `Philox`, and `Threefry` have passed TestU01 BigCrush - These generators are not intended to be cryptographically secure. Philox keying note: @@ -42,7 +63,7 @@ Philox keying note: bit-for-bit identical to a raw Random123 Philox stream unless the same seed-to-key mapping and counter convention are used. -`Philox` is the default algorithm for `CounterRNG()`, as it is very thorough and very fast; it has been measured on par with `CUDA.rand!` and `SplitMix64` at ~390 GB/s on an Nvidia GeForce RTX +`Philox` is the default algorithm for `CounterRNG()` because it is thorough and very fast; it has been measured on par with `CUDA.rand!` and `SplitMix64` at ~390 GB/s on an Nvidia GeForce RTX 5060 (advertised 448 GB/s), i.e. effectively memory-bound throughput. 
Examples: @@ -52,8 +73,14 @@ using oneAPI # Reproducible rng = AK.CounterRNG(0x12345678; alg=AK.Philox()) -x = oneArray{Float32}(undef, 1024) -AK.rand!(rng, x) +v = oneArray{Float32}(undef, 1024) +AK.rand!(rng, v) + +# Stream-consistent chunking +v1 = oneArray{Float32}(undef, 100) +v2 = oneArray{Float32}(undef, 100) +AK.rand!(rng, v1) +AK.rand!(rng, v2) # Convenience (fresh auto-seeded RNG on each call) y = oneArray{Float32}(undef, 1024) @@ -62,5 +89,6 @@ AK.rand!(y) ```@docs AcceleratedKernels.CounterRNG +AcceleratedKernels.reset! AcceleratedKernels.rand! ``` diff --git a/prototype/RNGTest/Project.toml b/prototype/RNGTest/Project.toml new file mode 100644 index 0000000..7536db2 --- /dev/null +++ b/prototype/RNGTest/Project.toml @@ -0,0 +1,3 @@ +[deps] +AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" +RNGTest = "97cc5700-e6cb-5ca1-8fb2-7f6b45264ecd" diff --git a/prototype/RNGTest/README.md b/prototype/RNGTest/README.md new file mode 100644 index 0000000..2bc6a7b --- /dev/null +++ b/prototype/RNGTest/README.md @@ -0,0 +1,33 @@ +# AK + RNGTest SmallCrush Prototype + +This folder provides a chunked random stream generator based on `AcceleratedKernels.jl` that can be fed into `RNGTest.jl`. + +The stream is deterministic and effectively unbounded: +- each refill generates `chunk` random `UInt64` values with `AK.rand!` +- each refill advances one persistent `CounterRNG` stream offset +- this is a practical chunked stream for RNGTest callback mode + +`RNGTest.jl` (in this local checkout) expects a callback returning `Float64` in `[0,1]`, so `UInt64` words are mapped to `Float64` via top-53-bit scaling. + +Current status in this harness: `SplitMix64`, `Philox`, and `Threefry` all pass BigCrush using `run_bigcrush.jl`. + +## Run SmallCrush + +From this directory: + +```powershell +julia --project=. run_smallcrush.jl +``` + +## Run BigCrush + +```powershell +julia --project=. 
run_bigcrush.jl +``` + +Notes: +- Configure `ALG`, `SEED`, and `CHUNK` at the top of + `run_smallcrush.jl` / `run_bigcrush.jl`. +- The stream refills directly into host scratch using `AK.rand!` on CPU. +- `chunk` controls refill amortization and memory usage. +- `chunk=100000000` means ~800 MB host scratch (`UInt64`). diff --git a/prototype/RNGTest/run_bigcrush.jl b/prototype/RNGTest/run_bigcrush.jl new file mode 100644 index 0000000..a1298a0 --- /dev/null +++ b/prototype/RNGTest/run_bigcrush.jl @@ -0,0 +1,26 @@ +using RNGTest + +include("stream.jl") + + +const ALG = :philox +const SEED = 0x1234 +const CHUNK = 10_000_000 +const HOST_SCRATCH = Vector{UInt64}(undef, CHUNK) + + +stream = AKUInt64Stream( + HOST_SCRATCH; + seed=SEED, + alg=ALG, + start_counter=UInt64(0), +) +gen = make_rngtest_generator!(stream) +genname = "AK_Vector_$(ALG)_seed$(SEED)" + +println("Beginning BigCrush. This may take hours...") + +RNGTest.bigcrushTestU01(gen, genname) + +println("refills: ", stream.refill_count) +println("numbers consumed (approx): ", (stream.refill_count - 1) * stream.chunk + (stream.idx - 1)) diff --git a/prototype/RNGTest/run_smallcrush.jl b/prototype/RNGTest/run_smallcrush.jl new file mode 100644 index 0000000..db1251c --- /dev/null +++ b/prototype/RNGTest/run_smallcrush.jl @@ -0,0 +1,23 @@ +using RNGTest + +include("stream.jl") + + +const ALG = :philox +const SEED = 0x1234 +const CHUNK = 100_000_000 +const HOST_SCRATCH = Vector{UInt64}(undef, CHUNK) + + +stream = AKUInt64Stream( + HOST_SCRATCH; + seed=SEED, + alg=ALG, + start_counter=UInt64(0), +) +gen = make_rngtest_generator!(stream) +genname = "AK_Vector_$(ALG)_seed$(SEED)" + +println("Beginning SmallCrush...") + +RNGTest.smallcrushTestU01(gen, genname) diff --git a/prototype/RNGTest/stream.jl b/prototype/RNGTest/stream.jl new file mode 100644 index 0000000..ba65ce9 --- /dev/null +++ b/prototype/RNGTest/stream.jl @@ -0,0 +1,80 @@ +import AcceleratedKernels as AK + + +function make_rng(seed::Integer, alg::Symbol; 
offset::Integer=0) + if alg === :philox + return AK.CounterRNG(seed; alg=AK.Philox(), offset=offset) + elseif alg === :threefry + return AK.CounterRNG(seed; alg=AK.Threefry(), offset=offset) + elseif alg === :splitmix64 + return AK.CounterRNG(seed; alg=AK.SplitMix64(), offset=offset) + end + throw(ArgumentError("alg must be :philox, :threefry, or :splitmix64; got $alg")) +end + + +mutable struct AKUInt64Stream{R <: AK.AbstractCounterRNG} + rng::R + chunk::Int + idx::Int + host_scratch::Vector{UInt64} + refill_count::Int +end + + +function AKUInt64Stream( + host_scratch::Vector{UInt64}; + seed::Integer=0x1234, + alg::Symbol=:philox, + start_counter::UInt64=0x0000000000000000, +) + chunk = length(host_scratch) + chunk > 0 || throw(ArgumentError("host_scratch must be non-empty")) + rng = make_rng(seed, alg; offset=start_counter) + + return AKUInt64Stream( + rng, + chunk, + chunk + 1, + host_scratch, + 0, + ) +end + + +@inline _u01_from_u64(u::UInt64)::Float64 = Float64(u >>> 11) * 0x1.0p-53 + + +function _fill_chunk!(s::AKUInt64Stream) + AK.rand!(s.rng, s.host_scratch) + return nothing +end + + +function refill!(s::AKUInt64Stream) + _fill_chunk!(s) + s.idx = 1 + s.refill_count += 1 + return s +end + + +function next_u64!(s::AKUInt64Stream)::UInt64 + if s.idx > s.chunk + refill!(s) + end + @inbounds u = s.host_scratch[s.idx] + s.idx += 1 + return u +end + + +@inline next_float64!(s::AKUInt64Stream)::Float64 = _u01_from_u64(next_u64!(s)) + + +function make_rngtest_generator!(s::AKUInt64Stream) + if s.idx > s.chunk + refill!(s) + end + return () -> next_float64!(s) +end diff --git a/prototype/rand/Project.toml b/prototype/rand/Project.toml index 7e92c89..7757b4d 100644 --- a/prototype/rand/Project.toml +++ b/prototype/rand/Project.toml @@ -2,5 +2,6 @@ AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f" PProf = 
"e4faabce-9ead-11e9-39d9-4379958e3056" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" diff --git a/src/rand/philox.jl b/src/rand/philox.jl index 37149f1..61e5f97 100644 --- a/src/rand/philox.jl +++ b/src/rand/philox.jl @@ -18,13 +18,13 @@ end # Evaluate one Philox block at `counter`, returning two 32-bit lanes `(x0, x1)` @inline function _philox2x32_block( - rng::CounterRNG{<:Philox}, + seed::UInt64, counter::UInt64, )::Tuple{UInt32, UInt32} x0 = _u32_lo(counter) x1 = _u32_hi(counter) - k0 = splitmix32_from_u64(UInt64(rng.seed)) + k0 = splitmix32_from_u64(seed) @inbounds for _ in 1:PHILOX_ROUNDS x0, x1 = _philox2x32_round(x0, x1, k0) @@ -37,21 +37,23 @@ end # Return lane 0 from the single Philox block at `counter` @inline function rand_uint( - rng::CounterRNG{<:Philox}, + seed::UInt64, + alg::Philox, counter::UInt64, ::Type{UInt32}, )::UInt32 - x0, _ = _philox2x32_block(rng, counter) + x0, _ = _philox2x32_block(seed, counter) return x0 end # Build UInt64 from the two lanes `(x0, x1)` of the same Philox block at `counter` @inline function rand_uint( - rng::CounterRNG{<:Philox}, + seed::UInt64, + alg::Philox, counter::UInt64, ::Type{UInt64}, )::UInt64 - x0, x1 = _philox2x32_block(rng, counter) + x0, x1 = _philox2x32_block(seed, counter) return _u64_from_u32s(x0, x1) end diff --git a/src/rand/rand.jl b/src/rand/rand.jl index d1fcb42..229a899 100644 --- a/src/rand/rand.jl +++ b/src/rand/rand.jl @@ -1,50 +1,73 @@ """ - abstract type AbstractCounterRNG end abstract type CounterRNGAlgorithm end + abstract type AbstractCounterRNG{A <: CounterRNGAlgorithm} end RNG interface for counter-based random generation with AcceleratedKernels. """ -abstract type AbstractCounterRNG end abstract type CounterRNGAlgorithm end +abstract type AbstractCounterRNG{A <: CounterRNGAlgorithm} end """ CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox()) -Stateless counter-based RNG configuration for [`rand!`](@ref). +Counter-based RNG for [`rand!`](@ref). 
-`CounterRNG` is immutable and does not hold mutable thread-local or global state. Each generated -value is a pure function of: +`CounterRNG` stores: - `seed` -- logical linear element index - algorithm (`alg`) +- stream `offset` The default algorithm is `Philox()`. `seed` may be any non-negative `Integer`. It is normalised to `UInt64` internally. +`offset` is initialised to `0` by default and advances by `length(v)` after each [`rand!`](@ref) +call. Constructors: -- `CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox())` - Uses an explicit non-negative seed. -- `CounterRNG(; alg::CounterRNGAlgorithm=Philox())` - Auto-seeds once using `Random.rand(Random.default_rng(), UInt64)`. Reusing the same `CounterRNG` instance is deterministic - for fixed seed, algorithm, array shape, and eltype. +- `CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox(), offset::Integer=0)` + Uses an explicit non-negative seed and offset. +- `CounterRNG(; alg::CounterRNGAlgorithm=Philox(), offset::Integer=0)` + Auto-seeds once using `Base.rand(UInt64)`, with default `offset == 0`. 
""" -struct CounterRNG{A <: CounterRNGAlgorithm} <: AbstractCounterRNG +mutable struct CounterRNG{A <: CounterRNGAlgorithm} <: AbstractCounterRNG{A} seed::UInt64 alg::A + offset::UInt64 end -function CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox()) +function CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox(), offset::Integer=0) @argcheck seed >= 0 "Seed must be a non-negative integer" - CounterRNG(UInt64(seed), alg) + @argcheck offset >= 0 "Offset must be a non-negative integer" + CounterRNG(UInt64(seed), alg, UInt64(offset)) end -function CounterRNG(; alg::CounterRNGAlgorithm=Philox()) - CounterRNG(Random.rand(Random.default_rng(), UInt64); alg) +function CounterRNG(; alg::CounterRNGAlgorithm=Philox(), offset::Integer=0) + CounterRNG(Base.rand(UInt64); alg, offset) +end + + +CounterRNG(seed::Integer, alg::CounterRNGAlgorithm) = CounterRNG(seed; alg) + + +""" + reset!(rng::AbstractCounterRNG) + +Reset `rng.offset` to `0x0` for RNGs that support mutable stream offsets. + +This requires `rng` to: +- have an `offset` field +- be mutable +""" +@inline function reset!(rng::AbstractCounterRNG) + @argcheck hasfield(typeof(rng), :offset) "reset! requires an `offset` field" + @argcheck ismutabletype(typeof(rng)) "reset! requires a mutable RNG type" + + rng.offset = UInt64(0) + return rng end @@ -64,8 +87,8 @@ include("threefry.jl") """ rand!( rng::AbstractCounterRNG, - x::AbstractArray{T}, - backend::Backend=get_backend(x); + v::AbstractArray{T}, + backend::Backend=get_backend(v); # CPU settings max_tasks::Int=Threads.nthreads(), @@ -78,8 +101,13 @@ include("threefry.jl") block_size::Int=256, ) -Fill `x` in-place with pseudo-random values using a stateless counter-based RNG. For `x[i]`, the -counter is exactly `UInt64(i - 1)` in linear indexing order. +Fill `v` in-place with pseudo-random values using a counter-based RNG stream. 
For `v[i]`, the +counter is `start_offset + UInt64(i - 1)` in linear indexing order, where `start_offset` is: +- `rng.offset` if `rng` has an `offset` field +- `0` otherwise + +After filling `v`, `rng.offset` advances by `length(v)` only when `rng` has a mutable `offset` +field. Supported scalar element types are: - `UInt8`, `UInt16`, `UInt32`, `UInt64` @@ -97,8 +125,8 @@ Semantics: """ function rand!( rng::AbstractCounterRNG, - x::AbstractArray{T}, - backend::Backend=get_backend(x); + v::AbstractArray{T}, + backend::Backend=get_backend(v); # CPU settings max_tasks::Int=Threads.nthreads(), @@ -110,23 +138,36 @@ function rand!( ) where T @argcheck T <: ALLOWED_RAND_SCALARS "Unsupported eltype $T. Supported: $(ALLOWED_RAND_SCALARS)" + + initial_offset = hasfield(typeof(rng), :offset) ? UInt64(getproperty(rng, :offset)) : UInt64(0) + + # local isbits captures from potentially mutable rng object + seed, alg = rng.seed, rng.alg + foreachindex( - 1:length(x), backend; + v, backend; max_tasks, min_elems, prefer_threads, block_size, ) do i - @inbounds x[i] = rand_scalar(rng, _counter_from_index(i), T) + @inbounds v[i] = rand_scalar(seed, alg, initial_offset + _counter_from_index(i), T) + end + + if hasfield(typeof(rng), :offset) && ismutabletype(typeof(rng)) + # XXX: maybe should be atomic add? would only be needed if AK.rand! were called + # concurrently on the same rng... ?? + rng.offset = initial_offset + UInt64(length(v)) end - return x + + v end function rand!( - x::AbstractArray, + v::AbstractArray, args...; kwargs..., ) - return rand!(CounterRNG(), x, args...; kwargs...) + return rand!(CounterRNG(), v, args...; kwargs...) 
end diff --git a/src/rand/splitmix.jl b/src/rand/splitmix.jl index 8d7992b..cc474be 100644 --- a/src/rand/splitmix.jl +++ b/src/rand/splitmix.jl @@ -24,20 +24,21 @@ end # Natural SplitMix64 output path: compute 64 random bits directly from one counter @inline function rand_uint( - rng::CounterRNG{<:SplitMix64}, + seed::UInt64, + alg::SplitMix64, counter::UInt64, ::Type{UInt64}, )::UInt64 - seed = UInt64(rng.seed) return _splitmix64_mix(counter + seed + SPLITMIX64_INCREMENT) end # UInt32 path is derived from the high 32 bits of the UInt64 SplitMix output @inline function rand_uint( - rng::CounterRNG{<:SplitMix64}, + seed::UInt64, + alg::SplitMix64, counter::UInt64, ::Type{UInt32}, )::UInt32 - return _u32_hi(rand_uint(rng, counter, UInt64)) + return _u32_hi(rand_uint(seed, alg, counter, UInt64)) end diff --git a/src/rand/threefry.jl b/src/rand/threefry.jl index 7326f40..2006886 100644 --- a/src/rand/threefry.jl +++ b/src/rand/threefry.jl @@ -18,13 +18,12 @@ end # Evaluate one Threefry block at `counter`, returning two 32-bit lanes `(x0, x1)` @inline function _threefry2x32_block( - rng::CounterRNG{<:Threefry}, + seed::UInt64, counter::UInt64, )::Tuple{UInt32, UInt32} x0 = _u32_lo(counter) x1 = _u32_hi(counter) - seed = UInt64(rng.seed) k0 = _u32_lo(seed) k1 = _u32_hi(seed) k2 = xor(THREEFRY_PARITY, xor(k0, k1)) @@ -52,21 +51,23 @@ end # Return lane 0 from the single Threefry block at `counter` @inline function rand_uint( - rng::CounterRNG{<:Threefry}, + seed::UInt64, + alg::Threefry, counter::UInt64, ::Type{UInt32}, )::UInt32 - x0, _ = _threefry2x32_block(rng, counter) + x0, _ = _threefry2x32_block(seed, counter) return x0 end # Build UInt64 from the two lanes `(x0, x1)` of the same Threefry block at `counter` @inline function rand_uint( - rng::CounterRNG{<:Threefry}, + seed::UInt64, + alg::Threefry, counter::UInt64, ::Type{UInt64}, )::UInt64 - x0, x1 = _threefry2x32_block(rng, counter) + x0, x1 = _threefry2x32_block(seed, counter) return _u64_from_u32s(x0, x1) end 
diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl index 3d5dab7..bc3da2f 100644 --- a/src/rand/utilities.jl +++ b/src/rand/utilities.jl @@ -51,15 +51,16 @@ const ALLOWED_RAND_SCALARS = Union{ #= -Every RNG algorithm implements rand_uint(rng, counter, UInt32/UInt64). -This fallback provides a clear failure for unsupported RNG types. +Every RNG algorithm implements rand_uint(seed, alg, counter, UInt32/UInt64). +This is the fallback for unsupported RNG algorithms. =# @inline function rand_uint( - rng::AbstractCounterRNG, + ::UInt64, + alg::CounterRNGAlgorithm, ::UInt64, ::Type{UIntType} )::UIntType where {UIntType <: Union{UInt32, UInt64}} - throw(ArgumentError("No rand_uint implementation for RNG: $rng")) + throw(ArgumentError("No rand_uint implementation for RNG algorithm: $(typeof(alg))")) end @@ -70,27 +71,26 @@ Shared scalar generation: 3) convert bits into requested scalar representation =# @inline function rand_scalar( - rng::AbstractCounterRNG, + seed::UInt64, + alg::CounterRNGAlgorithm, counter::UInt64, ::Type{T} )::T where {T <: ALLOWED_RAND_SCALARS} UIntType = raw_uint_type(T) - u = rand_uint(rng, counter, UIntType) + u = rand_uint(seed, alg, counter, UIntType) return from_uint(T, u) end -@inline function rand_scalar(::AbstractCounterRNG, ::UInt64, ::Type{T}) where {T} +@inline function rand_scalar(::UInt64, ::CounterRNGAlgorithm, ::UInt64, ::Type{T}) where {T} throw(ArgumentError( "Unsupported random scalar type $(T). Supported: $(ALLOWED_RAND_SCALARS)" )) end - - # Convert random UInt32 bits to Float16 in [0, 1) by mantissa construction. 
@inline function uint32_to_unit_float16(u::UInt32)::Float16 # Keep 10 random bits for the mantissa (drop 22 rightmost bits from the UInt32) @@ -98,7 +98,7 @@ end bits = UInt16(0x3c00) | UInt16(u >> 22) # Interpret as 1.mantissa, then subtract 1 for [0, 1) - return reinterpret(Float16, bits) - Float16(1) + reinterpret(Float16, bits) - Float16(1) end @@ -109,7 +109,7 @@ end bits = UInt32(0x3f800000) | (u >> 9) # Interpret as 1.mantissa, then subtract 1 for [0, 1) - return reinterpret(Float32, bits) - 1.0f0 + reinterpret(Float32, bits) - 1.0f0 end @@ -120,5 +120,5 @@ end bits = UInt64(0x3ff0000000000000) | (u >> 12) # Interpret as 1.mantissa, then subtract 1 for [0, 1) - return reinterpret(Float64, bits) - 1.0 + reinterpret(Float64, bits) - 1.0 end diff --git a/test/rand.jl b/test/rand.jl index 9e03e59..b7ef167 100644 --- a/test/rand.jl +++ b/test/rand.jl @@ -7,25 +7,31 @@ const RAND_SCALAR_TYPES_ALL = ( ) const RAND_SCALAR_TYPES_BACKEND = IS_CPU_BACKEND ? RAND_SCALAR_TYPES_ALL : - (UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float16, Float32, Bool) + (UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Bool) +const RUN_FLOAT16_RAND_TESTS = IS_CPU_BACKEND const RUN_FLOAT64_RAND_TESTS = IS_CPU_BACKEND _is_unit_interval(v) = all(x -> !isnan(x) && zero(x) <= x < one(x), v) -function _rand_fill_reference!(rng, x::AbstractArray{T}) where {T <: AK.ALLOWED_RAND_SCALARS} +function _rand_fill_reference!( + rng, + x::AbstractArray{T}; + counter_offset::UInt64=UInt64(0), +) where {T <: AK.ALLOWED_RAND_SCALARS} @inbounds for i in eachindex(x) - x[i] = AK.rand_scalar(rng, UInt64(i - one(i)), T) + x[i] = AK.rand_scalar(rng.seed, rng.alg, counter_offset + UInt64(i - one(i)), T) end return x end function _assert_rand_matches_reference!(rng, x; kwargs...) + counter_offset = rng.offset AK.rand!(rng, x; kwargs...) 
ref = zeros(eltype(x), size(x)) - _rand_fill_reference!(rng, ref) + _rand_fill_reference!(rng, ref; counter_offset) @test Array(x) == ref return x end @@ -37,7 +43,11 @@ end @test AK.CounterRNG(UInt32(0x1); alg=AK.Philox()) isa AK.CounterRNG{AK.Philox} @test AK.CounterRNG(UInt16(123); alg=AK.Threefry()) isa AK.CounterRNG{AK.Threefry} @test AK.CounterRNG(UInt32(300)).seed == UInt64(300) + @test AK.CounterRNG(UInt32(300)).offset == UInt64(0) + @test AK.CounterRNG(0x1; offset=17).offset == UInt64(17) + @test AK.CounterRNG(0x1, AK.Philox()).offset == UInt64(0) @test_throws ArgumentError AK.CounterRNG(-1) + @test_throws ArgumentError AK.CounterRNG(1; offset=-1) Random.seed!(0x1234) expected_seed = Random.rand(Random.default_rng(), UInt64) @@ -45,11 +55,80 @@ end rng_auto = AK.CounterRNG() @test rng_auto.seed == expected_seed @test rng_auto.alg isa AK.Philox + @test rng_auto.offset == UInt64(0) + + rng_auto_off = AK.CounterRNG(; offset=42) + @test rng_auto_off.offset == UInt64(42) x1 = array_from_host(zeros(Float32, 1024)) x2 = array_from_host(zeros(Float32, 1024)) AK.rand!(rng_auto, x1; prefer_threads, block_size=64) AK.rand!(rng_auto, x2; prefer_threads, block_size=257) + @test rng_auto.offset == UInt64(2048) + @test Array(x1) != Array(x2) + end + + + @testset "abstract rng offset behavior" begin + mutable struct MutableNoOffsetRNG <: AK.AbstractCounterRNG{AK.Philox} + seed::UInt64 + alg::AK.Philox + end + + mutable struct MutableWithOffsetRNG <: AK.AbstractCounterRNG{AK.Philox} + seed::UInt64 + alg::AK.Philox + offset::UInt64 + end + + struct ImmutableWithOffsetRNG <: AK.AbstractCounterRNG{AK.Philox} + seed::UInt64 + alg::AK.Philox + offset::UInt64 + end + + rng_no_offset = MutableNoOffsetRNG(UInt64(0x1234), AK.Philox()) + x1 = array_from_host(zeros(Float32, 256)) + x2 = array_from_host(zeros(Float32, 256)) + AK.rand!(rng_no_offset, x1; prefer_threads, block_size=64) + AK.rand!(rng_no_offset, x2; prefer_threads, block_size=64) + @test Array(x1) == Array(x2) + + 
rng_stream = MutableWithOffsetRNG(UInt64(0x1234), AK.Philox(), UInt64(0)) + s1 = array_from_host(zeros(Float32, 100)) + s2 = array_from_host(zeros(Float32, 100)) + s12 = array_from_host(zeros(Float32, 200)) + AK.rand!(rng_stream, s1; prefer_threads, block_size=64) + AK.rand!(rng_stream, s2; prefer_threads, block_size=64) + AK.rand!(AK.CounterRNG(UInt64(0x1234); alg=AK.Philox()), s12; prefer_threads, block_size=64) + @test vcat(Array(s1), Array(s2)) == Array(s12) + @test rng_stream.offset == UInt64(200) + + rng_imm = ImmutableWithOffsetRNG(UInt64(0x1234), AK.Philox(), UInt64(17)) + y1 = array_from_host(zeros(Float32, 64)) + y2 = array_from_host(zeros(Float32, 64)) + AK.rand!(rng_imm, y1; prefer_threads, block_size=64) + AK.rand!(rng_imm, y2; prefer_threads, block_size=64) + @test Array(y1) == Array(y2) + + @test AK.reset!(rng_stream) === rng_stream + @test rng_stream.offset == UInt64(0) + @test_throws ArgumentError AK.reset!(rng_no_offset) + @test_throws ArgumentError AK.reset!(rng_imm) + end + + + @testset "reset!" 
begin + rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox()) + x1 = array_from_host(zeros(Float32, 512)) + x2 = array_from_host(zeros(Float32, 512)) + + AK.rand!(rng, x1; prefer_threads, block_size=64) + @test rng.offset == UInt64(512) + @test AK.reset!(rng) === rng + @test rng.offset == UInt64(0) + AK.rand!(rng, x2; prefer_threads, block_size=64) + @test Array(x1) == Array(x2) end @@ -73,7 +152,9 @@ end @test AK.raw_uint_type(Int8) === UInt32 @test AK.raw_uint_type(Int16) === UInt32 @test AK.raw_uint_type(Int32) === UInt32 - @test AK.raw_uint_type(Float16) === UInt32 + if RUN_FLOAT16_RAND_TESTS + @test AK.raw_uint_type(Float16) === UInt32 + end @test AK.raw_uint_type(Float32) === UInt32 @test AK.raw_uint_type(UInt64) === UInt64 @test AK.raw_uint_type(Int64) === UInt64 @@ -92,12 +173,16 @@ end @test AK.from_uint( Int64, 0b1111111111111111111111111111111111111111111111111111111111111111 % UInt64 ) == Int64(-1) - @test AK.from_uint(Float16, UInt32(0)) == Float16(0) + if RUN_FLOAT16_RAND_TESTS + @test AK.from_uint(Float16, UInt32(0)) == Float16(0) + end @test AK.from_uint(Bool, UInt32(0)) == false @test AK.from_uint(Bool, UInt32(1)) == true - @test AK.uint32_to_unit_float16(UInt32(0)) == Float16(0) - @test Float16(0) <= AK.uint32_to_unit_float16(typemax(UInt32)) < Float16(1) + if RUN_FLOAT16_RAND_TESTS + @test AK.uint32_to_unit_float16(UInt32(0)) == Float16(0) + @test Float16(0) <= AK.uint32_to_unit_float16(typemax(UInt32)) < Float16(1) + end @test AK.uint32_to_unit_float32(UInt32(0)) == 0.0f0 @test 0.0f0 <= AK.uint32_to_unit_float32(typemax(UInt32)) < 1.0f0 if RUN_FLOAT64_RAND_TESTS @@ -111,25 +196,26 @@ end for alg in RAND_ALGS rng = AK.CounterRNG(0x123456789abcdef; alg) for U in (UInt32, UInt64) - @test AK.rand_uint(rng, UInt64(0), U) == AK.rand_uint(rng, UInt64(0), U) - @test AK.rand_uint(rng, UInt64(1), U) != AK.rand_uint(rng, UInt64(0), U) + @test AK.rand_uint(rng.seed, rng.alg, UInt64(0), U) == AK.rand_uint(rng.seed, rng.alg, UInt64(0), U) + @test 
AK.rand_uint(rng.seed, rng.alg, UInt64(1), U) != AK.rand_uint(rng.seed, rng.alg, UInt64(0), U) - vals = [AK.rand_uint(rng, UInt64(i), U) for i in 0:511] + vals = [AK.rand_uint(rng.seed, rng.alg, UInt64(i), U) for i in 0:511] @test length(unique(vals)) > 460 end end rng_splitmix = AK.CounterRNG(0x31415926; alg=AK.SplitMix64()) for c in (UInt64(0), UInt64(1), UInt64(17), UInt64(1023)) - @test AK.rand_uint(rng_splitmix, c, UInt32) == AK._u32_hi( - AK.rand_uint(rng_splitmix, c, UInt64) + @test AK.rand_uint(rng_splitmix.seed, rng_splitmix.alg, c, UInt32) == AK._u32_hi( + AK.rand_uint(rng_splitmix.seed, rng_splitmix.alg, c, UInt64) ) end for alg in (AK.Philox(), AK.Threefry()) rng = AK.CounterRNG(0xabcdef1234567890; alg) for c in (UInt64(0), UInt64(1), UInt64(17), UInt64(1023)) - @test AK._u32_lo(AK.rand_uint(rng, c, UInt64)) == AK.rand_uint(rng, c, UInt32) + @test AK._u32_lo(AK.rand_uint(rng.seed, rng.alg, c, UInt64)) == + AK.rand_uint(rng.seed, rng.alg, c, UInt32) end end end @@ -139,10 +225,11 @@ end rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox()) for T in RAND_SCALAR_TYPES_BACKEND - s0 = AK.rand_scalar(rng, UInt64(0), T) - s1 = AK.rand_scalar(rng, UInt64(1), T) + s0 = AK.rand_scalar(rng.seed, rng.alg, UInt64(0), T) + s1 = AK.rand_scalar(rng.seed, rng.alg, UInt64(1), T) @test s0 isa T @test s1 isa T + @test s0 == AK.rand_scalar(rng.seed, rng.alg, UInt64(0), T) if !(T in (Bool, Float16, UInt8, UInt16, Int8, Int16)) @test s0 != s1 end @@ -153,32 +240,37 @@ end end c = UInt64(42) - @test AK.rand_scalar(rng, c, UInt8) == trunc(UInt8, AK.rand_uint(rng, c, UInt32) >> 24) - @test AK.rand_scalar(rng, c, UInt16) == trunc(UInt16, AK.rand_uint(rng, c, UInt32) >> 16) - @test AK.rand_scalar( - rng, c, Int8 - ) == reinterpret(Int8, trunc(UInt8, AK.rand_uint(rng, c, UInt32) >> 24)) - @test AK.rand_scalar( - rng, c, Int16 - ) == reinterpret(Int16, trunc(UInt16, AK.rand_uint(rng, c, UInt32) >> 16)) - @test AK.rand_scalar(rng, c, Int32) == reinterpret(Int32, AK.rand_uint(rng, 
c, UInt32)) - @test AK.rand_scalar(rng, c, Int64) == reinterpret(Int64, AK.rand_uint(rng, c, UInt64)) - @test AK.rand_scalar(rng, c, Float16) == AK.uint32_to_unit_float16( - AK.rand_uint(rng, c, UInt32) - ) - @test AK.rand_scalar(rng, c, Float32) == AK.uint32_to_unit_float32( - AK.rand_uint(rng, c, UInt32) + @test AK.rand_scalar(rng.seed, rng.alg, c, UInt8) == + trunc(UInt8, AK.rand_uint(rng.seed, rng.alg, c, UInt32) >> 24) + @test AK.rand_scalar(rng.seed, rng.alg, c, UInt16) == + trunc(UInt16, AK.rand_uint(rng.seed, rng.alg, c, UInt32) >> 16) + @test AK.rand_scalar(rng.seed, rng.alg, c, Int8) == + reinterpret(Int8, trunc(UInt8, AK.rand_uint(rng.seed, rng.alg, c, UInt32) >> 24)) + @test AK.rand_scalar(rng.seed, rng.alg, c, Int16) == + reinterpret(Int16, trunc(UInt16, AK.rand_uint(rng.seed, rng.alg, c, UInt32) >> 16)) + @test AK.rand_scalar(rng.seed, rng.alg, c, Int32) == + reinterpret(Int32, AK.rand_uint(rng.seed, rng.alg, c, UInt32)) + @test AK.rand_scalar(rng.seed, rng.alg, c, Int64) == + reinterpret(Int64, AK.rand_uint(rng.seed, rng.alg, c, UInt64)) + if RUN_FLOAT16_RAND_TESTS + @test AK.rand_scalar(rng.seed, rng.alg, c, Float16) == AK.uint32_to_unit_float16( + AK.rand_uint(rng.seed, rng.alg, c, UInt32) + ) + end + @test AK.rand_scalar(rng.seed, rng.alg, c, Float32) == AK.uint32_to_unit_float32( + AK.rand_uint(rng.seed, rng.alg, c, UInt32) ) - @test AK.rand_scalar(rng, c, Bool) == isodd(AK.rand_uint(rng, c, UInt32)) + @test AK.rand_scalar(rng.seed, rng.alg, c, Bool) == + isodd(AK.rand_uint(rng.seed, rng.alg, c, UInt32)) if RUN_FLOAT64_RAND_TESTS - @test AK.rand_scalar(rng, c, Float64) == AK.uint64_to_unit_float64( - AK.rand_uint(rng, c, UInt64) + @test AK.rand_scalar(rng.seed, rng.alg, c, Float64) == AK.uint64_to_unit_float64( + AK.rand_uint(rng.seed, rng.alg, c, UInt64) ) end - bools = [AK.rand_scalar(rng, UInt64(i), Bool) for i in 0:511] + bools = [AK.rand_scalar(rng.seed, rng.alg, UInt64(i), Bool) for i in 0:511] @test any(identity, bools) @test any(!, bools) 
- @test_throws ArgumentError AK.rand_scalar(rng, UInt64(0), UInt128) + @test_throws ArgumentError AK.rand_scalar(rng.seed, rng.alg, UInt64(0), UInt128) end @@ -199,21 +291,37 @@ end for T in RAND_SCALAR_TYPES_BACKEND x1 = array_from_host(zeros(T, 2048)) x2 = array_from_host(zeros(T, 2048)) - AK.rand!(rng, x1; prefer_threads, block_size=64) - AK.rand!(rng, x2; prefer_threads, block_size=257) + rng1 = AK.CounterRNG(rng.seed; alg=rng.alg) + rng2 = AK.CounterRNG(rng.seed; alg=rng.alg) + AK.rand!(rng1, x1; prefer_threads, block_size=64) + AK.rand!(rng2, x2; prefer_threads, block_size=257) @test Array(x1) == Array(x2) end - rng2 = AK.CounterRNG(rng.seed + UInt64(1); alg=rng.alg) for T in RAND_SCALAR_TYPES_BACKEND + rng1 = AK.CounterRNG(rng.seed; alg=rng.alg) + rng2 = AK.CounterRNG(rng.seed + UInt64(1); alg=rng.alg) x1 = array_from_host(zeros(T, 2048)) x2 = array_from_host(zeros(T, 2048)) - AK.rand!(rng, x1; prefer_threads, block_size=64) + AK.rand!(rng1, x1; prefer_threads, block_size=64) AK.rand!(rng2, x2; prefer_threads, block_size=64) @test Array(x1) != Array(x2) end - for T in (Float16, Float32, UInt64, Bool) + begin + rng_stream = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox()) + rng_once = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox()) + x1 = array_from_host(zeros(Float32, 100)) + x2 = array_from_host(zeros(Float32, 100)) + x12 = array_from_host(zeros(Float32, 200)) + AK.rand!(rng_stream, x1; prefer_threads, block_size=64) + AK.rand!(rng_stream, x2; prefer_threads, block_size=64) + AK.rand!(rng_once, x12; prefer_threads, block_size=64) + @test vcat(Array(x1), Array(x2)) == Array(x12) + @test rng_stream.offset == UInt64(200) + end + + for T in (RUN_FLOAT16_RAND_TESTS ? 
(Float16, Float32, UInt64, Bool) : (Float32, UInt64, Bool)) xnd = array_from_host(zeros(T, 7, 11, 5)) _assert_rand_matches_reference!(rng, xnd; prefer_threads, block_size=128) end @@ -229,7 +337,10 @@ end prefer_threads=true ) ref_view = zeros(T, length(view_x)) - _rand_fill_reference!(rng, ref_view) + _rand_fill_reference!( + rng, ref_view; + counter_offset=rng.offset - UInt64(length(view_x)), + ) @test collect(view_x) == ref_view end end From 71a62d47f57653429b03a5621cf710511a0be49d Mon Sep 17 00:00:00 2001 From: fjbarter Date: Tue, 24 Mar 2026 20:02:41 +0000 Subject: [PATCH 11/18] style update + remove AbstractCounterRNG type in favour of purely allowing CounterRNG which now must have an offset. if needed, AK.reset!(::CounterRNG) can be called to reset a streaming CounterRNG every time AK.rand! is called, if needed --- docs/src/api/rand.md | 5 +-- prototype/RNGTest/stream.jl | 2 +- prototype/rand/test_rand.jl | 8 ++--- src/rand/philox.jl | 2 +- src/rand/rand.jl | 38 +++++++--------------- src/rand/threefry.jl | 1 + src/rand/utilities.jl | 65 ++++++++++++++++++++----------------- test/rand.jl | 6 ++-- test/runtests.jl | 8 ++++- 9 files changed, 66 insertions(+), 69 deletions(-) diff --git a/docs/src/api/rand.md b/docs/src/api/rand.md index 5c5596e..03ed7f3 100644 --- a/docs/src/api/rand.md +++ b/docs/src/api/rand.md @@ -9,7 +9,7 @@ fixed `seed`, algorithm, and call sequence. - calls that share the same `CounterRNG` instance concurrently are not thread-safe. - call `AK.reset!(rng)` to rewind a mutable offset-bearing RNG back to offset `0x0`. 
-`AK.rand!` also accepts custom `AbstractCounterRNG` implementations: +`AK.rand!` also accepts custom `CounterRNG` implementations: - if they have a mutable `offset` field, streaming advancement is applied - if they have no `offset` field, each call behaves statelessly from counter `0` - if they have an immutable `offset` field, that offset is used as a fixed start and is not advanced @@ -23,7 +23,7 @@ convenience, Custom RNGs: - Define an algorithm type `MyAlg <: AK.CounterRNGAlgorithm`. -- Define an RNG type `MyRNG <: AK.AbstractCounterRNG{MyAlg}` with fields `seed` and `alg`. +- Define a `CounterRNG` with fields `seed` and `alg`. - Add a mutable `offset::UInt64` field if you want stream advancement across calls; omit it for stateless calls from counter `0`. - Implement typed `rand_uint` methods: - `AK.rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt32})::UInt32` @@ -89,6 +89,7 @@ AK.rand!(y) ```@docs AcceleratedKernels.CounterRNG +AcceleratedKernels.CounterRNGAlgorithm AcceleratedKernels.reset! AcceleratedKernels.rand! 
``` diff --git a/prototype/RNGTest/stream.jl b/prototype/RNGTest/stream.jl index ba65ce9..a383645 100644 --- a/prototype/RNGTest/stream.jl +++ b/prototype/RNGTest/stream.jl @@ -13,7 +13,7 @@ function make_rng(seed::Integer, alg::Symbol; offset::Integer=0) end -mutable struct AKUInt64Stream{R <: AK.AbstractCounterRNG} +mutable struct AKUInt64Stream{R} rng::R chunk::Int idx::Int diff --git a/prototype/rand/test_rand.jl b/prototype/rand/test_rand.jl index 329214d..ec3ce65 100644 --- a/prototype/rand/test_rand.jl +++ b/prototype/rand/test_rand.jl @@ -43,13 +43,13 @@ is_unit_interval(v) = all(x -> 0.0f0 <= x <= 1.0f0, v) # warmup compile run_cuda_rand!(x_cuda) -run_ak_rand_gpu!(RNG_SPLITMIX, x_splitmix) +# run_ak_rand_gpu!(RNG_SPLITMIX, x_splitmix) run_ak_rand_gpu!(RNG_PHILOX, x_philox) run_ak_rand_gpu!(RNG_THREEFRY, x_threefry) run_ak_rand_cpu!(RNG_SPLITMIX, x_cpu) @assert is_unit_interval(Array(x_cuda)) -@assert is_unit_interval(Array(x_splitmix)) +# @assert is_unit_interval(Array(x_splitmix)) @assert is_unit_interval(Array(x_philox)) @assert is_unit_interval(Array(x_threefry)) @assert is_unit_interval(x_cpu) @@ -60,8 +60,8 @@ println("CPU threads: ", Threads.nthreads()) println("\nCUDA.rand! benchmark (CuArray{Float32}, in-place)") display(@benchmark run_cuda_rand!($x_cuda)) -println("\nAK.rand! SplitMix64 benchmark (GPU, CuArray{Float32})") -display(@benchmark run_ak_rand_gpu!($RNG_SPLITMIX, $x_splitmix)) +# println("\nAK.rand! SplitMix64 benchmark (GPU, CuArray{Float32})") +# display(@benchmark run_ak_rand_gpu!($RNG_SPLITMIX, $x_splitmix)) println("\nAK.rand! 
Philox benchmark (GPU, CuArray{Float32})") display(@benchmark run_ak_rand_gpu!($RNG_PHILOX, $x_philox)) diff --git a/src/rand/philox.jl b/src/rand/philox.jl index 61e5f97..0de9ff4 100644 --- a/src/rand/philox.jl +++ b/src/rand/philox.jl @@ -4,7 +4,7 @@ struct Philox <: CounterRNGAlgorithm end # Philox magic numbers const PHILOX_M0 = UInt32(0xD256D193) const PHILOX_W0 = UInt32(0x9E3779B9) -const PHILOX_ROUNDS = 10 +const PHILOX_ROUNDS = 7 @inline function _philox2x32_round(x0::UInt32, x1::UInt32, k0::UInt32) diff --git a/src/rand/rand.jl b/src/rand/rand.jl index 229a899..b9db756 100644 --- a/src/rand/rand.jl +++ b/src/rand/rand.jl @@ -1,12 +1,4 @@ -""" - abstract type CounterRNGAlgorithm end - abstract type AbstractCounterRNG{A <: CounterRNGAlgorithm} end - -RNG interface for counter-based random generation with AcceleratedKernels. -""" - abstract type CounterRNGAlgorithm end -abstract type AbstractCounterRNG{A <: CounterRNGAlgorithm} end """ @@ -31,11 +23,12 @@ Constructors: - `CounterRNG(; alg::CounterRNGAlgorithm=Philox(), offset::Integer=0)` Auto-seeds once using `Base.rand(UInt64)`, with default `offset == 0`. """ -mutable struct CounterRNG{A <: CounterRNGAlgorithm} <: AbstractCounterRNG{A} - seed::UInt64 - alg::A +mutable struct CounterRNG{A <: CounterRNGAlgorithm} + const seed::UInt64 + const alg::A offset::UInt64 end +#TODO: need to figure out a nice way to allow custom counter RNGs function CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox(), offset::Integer=0) @@ -54,7 +47,7 @@ CounterRNG(seed::Integer, alg::CounterRNGAlgorithm) = CounterRNG(seed; alg) """ - reset!(rng::AbstractCounterRNG) + reset!(rng::CounterRNG) Reset `rng.offset` to `0x0` for RNGs that support mutable stream offsets. @@ -62,7 +55,7 @@ This requires `rng` to: - have an `offset` field - be mutable """ -@inline function reset!(rng::AbstractCounterRNG) +@inline function reset!(rng::CounterRNG) @argcheck hasfield(typeof(rng), :offset) "reset! 
requires an `offset` field" @argcheck ismutabletype(typeof(rng)) "reset! requires a mutable RNG type" @@ -86,7 +79,7 @@ include("threefry.jl") """ rand!( - rng::AbstractCounterRNG, + rng::CounterRNG, v::AbstractArray{T}, backend::Backend=get_backend(v); @@ -124,7 +117,7 @@ Semantics: """ function rand!( - rng::AbstractCounterRNG, + rng::CounterRNG, v::AbstractArray{T}, backend::Backend=get_backend(v); @@ -139,26 +132,17 @@ function rand!( @argcheck T <: ALLOWED_RAND_SCALARS "Unsupported eltype $T. Supported: $(ALLOWED_RAND_SCALARS)" - initial_offset = hasfield(typeof(rng), :offset) ? UInt64(getproperty(rng, :offset)) : UInt64(0) - - # local isbits captures from potentially mutable rng object + # Local isbits captures from potentially mutable rng object seed, alg = rng.seed, rng.alg foreachindex( v, backend; - max_tasks, - min_elems, - prefer_threads, - block_size, + max_tasks, min_elems, prefer_threads, block_size, ) do i @inbounds v[i] = rand_scalar(seed, alg, initial_offset + _counter_from_index(i), T) end - if hasfield(typeof(rng), :offset) && ismutabletype(typeof(rng)) - # XXX: maybe should be atomic add? would only be needed if AK.rand! were called - # concurrently on the same rng... ?? 
- rng.offset = initial_offset + UInt64(length(v)) - end + rng.offset += UInt64(length(v)) v end diff --git a/src/rand/threefry.jl b/src/rand/threefry.jl index 2006886..c9052b1 100644 --- a/src/rand/threefry.jl +++ b/src/rand/threefry.jl @@ -21,6 +21,7 @@ end seed::UInt64, counter::UInt64, )::Tuple{UInt32, UInt32} + x0 = _u32_lo(counter) x1 = _u32_hi(counter) diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl index bc3da2f..f2159c5 100644 --- a/src/rand/utilities.jl +++ b/src/rand/utilities.jl @@ -1,15 +1,17 @@ # lo: rightmost 32 bits, hi: leftmost 32 bits @inline _u32_lo(x::UInt64)::UInt32 = UInt32(x & UInt64(0xffffffff)) @inline _u32_hi(x::UInt64)::UInt32 = UInt32(x >> 32) + +# Construct UInt64 by bit concatenation of two UInt32s @inline _u64_from_u32s(lo::UInt32, hi::UInt32)::UInt64 = (UInt64(hi) << 32) | UInt64(lo) -# leftmost 32 bits of a*b cast to UInt64s +# Leftmost 32 bits of a*b cast to UInt64s @inline _mulhi_u32(a::UInt32, b::UInt32)::UInt32 = UInt32((UInt64(a) * UInt64(b)) >> 32) # 32-bit rotate left by r positions @inline _rotl32(x::UInt32, r::UInt32)::UInt32 = (x << r) | (x >> (UInt32(32) - r)) - +# Get counter used for CounterRNG from element index @inline _counter_from_index(i)::UInt64 = UInt64(i - one(i)) @@ -22,32 +24,32 @@ const ALLOWED_RAND_SCALARS = Union{ } -@inline raw_uint_type(::Type{UInt8}) = UInt32 -@inline raw_uint_type(::Type{UInt16}) = UInt32 -@inline raw_uint_type(::Type{UInt32}) = UInt32 -@inline raw_uint_type(::Type{Int8}) = UInt32 -@inline raw_uint_type(::Type{Int16}) = UInt32 -@inline raw_uint_type(::Type{Int32}) = UInt32 -@inline raw_uint_type(::Type{Float16}) = UInt32 -@inline raw_uint_type(::Type{Float32}) = UInt32 -@inline raw_uint_type(::Type{UInt64}) = UInt64 -@inline raw_uint_type(::Type{Int64}) = UInt64 -@inline raw_uint_type(::Type{Float64}) = UInt64 -@inline raw_uint_type(::Type{Bool}) = UInt32 - - -@inline from_uint(::Type{UInt8}, u::UInt32)::UInt8 = trunc(UInt8, u >> 24) -@inline from_uint(::Type{UInt16}, 
u::UInt32)::UInt16 = trunc(UInt16, u >> 16) -@inline from_uint(::Type{UInt32}, u::UInt32)::UInt32 = u -@inline from_uint(::Type{UInt64}, u::UInt64)::UInt64 = u -@inline from_uint(::Type{Int8}, u::UInt32)::Int8 = reinterpret(Int8, trunc(UInt8, u >> 24)) -@inline from_uint(::Type{Int16}, u::UInt32)::Int16 = reinterpret(Int16, trunc(UInt16, u >> 16)) -@inline from_uint(::Type{Int32}, u::UInt32)::Int32 = reinterpret(Int32, u) -@inline from_uint(::Type{Int64}, u::UInt64)::Int64 = reinterpret(Int64, u) -@inline from_uint(::Type{Float16}, u::UInt32)::Float16 = uint32_to_unit_float16(u) -@inline from_uint(::Type{Float32}, u::UInt32)::Float32 = uint32_to_unit_float32(u) -@inline from_uint(::Type{Float64}, u::UInt64)::Float64 = uint64_to_unit_float64(u) -@inline from_uint(::Type{Bool}, u::UInt32)::Bool = isodd(u) +@inline _rand_scalar_uint_type(::Type{UInt8}) = UInt32 +@inline _rand_scalar_uint_type(::Type{UInt16}) = UInt32 +@inline _rand_scalar_uint_type(::Type{UInt32}) = UInt32 +@inline _rand_scalar_uint_type(::Type{Int8}) = UInt32 +@inline _rand_scalar_uint_type(::Type{Int16}) = UInt32 +@inline _rand_scalar_uint_type(::Type{Int32}) = UInt32 +@inline _rand_scalar_uint_type(::Type{Float16}) = UInt32 +@inline _rand_scalar_uint_type(::Type{Float32}) = UInt32 +@inline _rand_scalar_uint_type(::Type{UInt64}) = UInt64 +@inline _rand_scalar_uint_type(::Type{Int64}) = UInt64 +@inline _rand_scalar_uint_type(::Type{Float64}) = UInt64 +@inline _rand_scalar_uint_type(::Type{Bool}) = UInt32 + + +@inline _rand_scalar_from_uint(::Type{UInt8}, u::UInt32)::UInt8 = trunc(UInt8, u >> 24) +@inline _rand_scalar_from_uint(::Type{UInt16}, u::UInt32)::UInt16 = trunc(UInt16, u >> 16) +@inline _rand_scalar_from_uint(::Type{UInt32}, u::UInt32)::UInt32 = u +@inline _rand_scalar_from_uint(::Type{UInt64}, u::UInt64)::UInt64 = u +@inline _rand_scalar_from_uint(::Type{Int8}, u::UInt32)::Int8 = reinterpret(Int8, trunc(UInt8, u >> 24)) +@inline _rand_scalar_from_uint(::Type{Int16}, u::UInt32)::Int16 = 
reinterpret(Int16, trunc(UInt16, u >> 16)) +@inline _rand_scalar_from_uint(::Type{Int32}, u::UInt32)::Int32 = reinterpret(Int32, u) +@inline _rand_scalar_from_uint(::Type{Int64}, u::UInt64)::Int64 = reinterpret(Int64, u) +@inline _rand_scalar_from_uint(::Type{Float16}, u::UInt32)::Float16 = uint32_to_unit_float16(u) +@inline _rand_scalar_from_uint(::Type{Float32}, u::UInt32)::Float32 = uint32_to_unit_float32(u) +@inline _rand_scalar_from_uint(::Type{Float64}, u::UInt64)::Float64 = uint64_to_unit_float64(u) +@inline _rand_scalar_from_uint(::Type{Bool}, u::UInt32)::Bool = isodd(u) #= @@ -77,10 +79,10 @@ Shared scalar generation: ::Type{T} )::T where {T <: ALLOWED_RAND_SCALARS} - UIntType = raw_uint_type(T) + UIntType = _rand_scalar_uint_type(T) u = rand_uint(seed, alg, counter, UIntType) - return from_uint(T, u) + return _rand_scalar_from_uint(T, u) end @@ -93,6 +95,7 @@ end # Convert random UInt32 bits to Float16 in [0, 1) by mantissa construction. @inline function uint32_to_unit_float16(u::UInt32)::Float16 + # Keep 10 random bits for the mantissa (drop 22 rightmost bits from the UInt32) # and combine with the bit pattern of Float16(1.0) (sign=0, exponent=15). bits = UInt16(0x3c00) | UInt16(u >> 22) @@ -104,6 +107,7 @@ end # Convert random UInt32 bits to Float32 in [0, 1) by mantissa construction. @inline function uint32_to_unit_float32(u::UInt32)::Float32 + # Keep 23 random bits for the mantissa (drop 9 rightmost bits from the UInt32) # and combine with the bit pattern of 1.0f0 (sign=0, exponent=127). bits = UInt32(0x3f800000) | (u >> 9) @@ -115,6 +119,7 @@ end # Convert random UInt64 bits to Float64 in [0, 1) by mantissa construction. @inline function uint64_to_unit_float64(u::UInt64)::Float64 + # Keep 52 random bits for the mantissa (drop 12 rightmost bits from the UInt64) # and combine with the bit pattern of 1.0 (sign=0, exponent=1023). 
bits = UInt64(0x3ff0000000000000) | (u >> 12) diff --git a/test/rand.jl b/test/rand.jl index b7ef167..0495b86 100644 --- a/test/rand.jl +++ b/test/rand.jl @@ -70,18 +70,18 @@ end @testset "abstract rng offset behavior" begin - mutable struct MutableNoOffsetRNG <: AK.AbstractCounterRNG{AK.Philox} + mutable struct MutableNoOffsetRNG seed::UInt64 alg::AK.Philox end - mutable struct MutableWithOffsetRNG <: AK.AbstractCounterRNG{AK.Philox} + mutable struct MutableWithOffsetRNG seed::UInt64 alg::AK.Philox offset::UInt64 end - struct ImmutableWithOffsetRNG <: AK.AbstractCounterRNG{AK.Philox} + struct ImmutableWithOffsetRNG seed::UInt64 alg::AK.Philox offset::UInt64 diff --git a/test/runtests.jl b/test/runtests.jl index a2707b6..d1a2d69 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -17,7 +17,13 @@ if "--CUDA" in ARGS const BACKEND = CUDABackend() TEST_DL[] = true elseif "--oneAPI" in ARGS - Pkg.add("oneAPI") + if Sys.iswindows() + # oneAPI v2.6.x can throw `UndefVarError: NEO_jll not defined` on native Windows. + # Pin to the latest known-good minor series until upstream fixes are available. + Pkg.add(name="oneAPI", version="2.5") + else + Pkg.add("oneAPI") + end using oneAPI oneAPI.versioninfo() const BACKEND = oneAPIBackend() From 42a434e6c0465593651f84e334838bf53b9327d6 Mon Sep 17 00:00:00 2001 From: fjbarter Date: Tue, 24 Mar 2026 20:23:31 +0000 Subject: [PATCH 12/18] update tests and docs to match new CounterRNG interface --- docs/src/api/rand.md | 15 +++--- src/rand/rand.jl | 19 ++------ test/rand.jl | 109 ++++++++++++++++++++----------------------- 3 files changed, 61 insertions(+), 82 deletions(-) diff --git a/docs/src/api/rand.md b/docs/src/api/rand.md index 03ed7f3..aab3e5d 100644 --- a/docs/src/api/rand.md +++ b/docs/src/api/rand.md @@ -7,27 +7,24 @@ fixed `seed`, algorithm, and call sequence. `AK.rand!(rng, v)` call. 
This means chunked fills are stream-consistent: - filling `100` then `100` elements yields the same `200` values as one `200`-element fill. - calls that share the same `CounterRNG` instance concurrently are not thread-safe. -- call `AK.reset!(rng)` to rewind a mutable offset-bearing RNG back to offset `0x0`. +- call `AK.reset!(rng)` to rewind a `CounterRNG` offset back to `0x0`. -`AK.rand!` also accepts custom `CounterRNG` implementations: -- if they have a mutable `offset` field, streaming advancement is applied -- if they have no `offset` field, each call behaves statelessly from counter `0` -- if they have an immutable `offset` field, that offset is used as a fixed start and is not advanced +`AK.rand!(rng, v)` accepts `rng::AK.CounterRNG`. +Passing other RNG container types is not supported and will throw a `MethodError`. Use an explicit `CounterRNG` when reproducibility is required. For convenience, `AK.rand!(v)` creates a fresh `CounterRNG()` on each call using one auto-seeded `Base.rand(UInt64)` draw, so repeated calls produce different outputs unless Random.seed!() is used. -`AK.reset!(rng)` rewinds offset to `0x0` for mutable RNGs that have an `offset` field. +`AK.reset!(rng::AK.CounterRNG)` rewinds `rng.offset` to `0x0`. -Custom RNGs: +Custom algorithms: - Define an algorithm type `MyAlg <: AK.CounterRNGAlgorithm`. -- Define a `CounterRNG` with fields `seed` and `alg`. -- Add a mutable `offset::UInt64` field if you want stream advancement across calls; omit it for stateless calls from counter `0`. - Implement typed `rand_uint` methods: - `AK.rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt32})::UInt32` - `AK.rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt64})::UInt64` +- Use your algorithm via `AK.CounterRNG(seed; alg=MyAlg(), offset=...)`. Both widths should be implemented so `AK.rand!` supports all integer/float output types without falling back or error. 
diff --git a/src/rand/rand.jl b/src/rand/rand.jl index b9db756..833bcae 100644 --- a/src/rand/rand.jl +++ b/src/rand/rand.jl @@ -28,7 +28,6 @@ mutable struct CounterRNG{A <: CounterRNGAlgorithm} const alg::A offset::UInt64 end -#TODO: need to figure out a nice way to allow custom counter RNGs function CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox(), offset::Integer=0) @@ -49,16 +48,9 @@ CounterRNG(seed::Integer, alg::CounterRNGAlgorithm) = CounterRNG(seed; alg) """ reset!(rng::CounterRNG) -Reset `rng.offset` to `0x0` for RNGs that support mutable stream offsets. - -This requires `rng` to: -- have an `offset` field -- be mutable +Reset `rng.offset` to `0x0`. """ @inline function reset!(rng::CounterRNG) - @argcheck hasfield(typeof(rng), :offset) "reset! requires an `offset` field" - @argcheck ismutabletype(typeof(rng)) "reset! requires a mutable RNG type" - rng.offset = UInt64(0) return rng end @@ -95,12 +87,9 @@ include("threefry.jl") ) Fill `v` in-place with pseudo-random values using a counter-based RNG stream. For `v[i]`, the -counter is `start_offset + UInt64(i - 1)` in linear indexing order, where `start_offset` is: -- `rng.offset` if `rng` has an `offset` field -- `0` otherwise +counter is `rng.offset + UInt64(i - 1)` in linear indexing order. -After filling `v`, `rng.offset` advances by `length(v)` only when `rng` has a mutable `offset` -field. +After filling `v`, `rng.offset` advances by `length(v)`. Supported scalar element types are: - `UInt8`, `UInt16`, `UInt32`, `UInt64` @@ -133,7 +122,7 @@ function rand!( @argcheck T <: ALLOWED_RAND_SCALARS "Unsupported eltype $T. 
Supported: $(ALLOWED_RAND_SCALARS)" # Local isbits captures from potentially mutable rng object - seed, alg = rng.seed, rng.alg + seed, alg, initial_offset = rng.seed, rng.alg, rng.offset foreachindex( v, backend; diff --git a/test/rand.jl b/test/rand.jl index 0495b86..90e285a 100644 --- a/test/rand.jl +++ b/test/rand.jl @@ -69,52 +69,45 @@ end end - @testset "abstract rng offset behavior" begin - mutable struct MutableNoOffsetRNG - seed::UInt64 - alg::AK.Philox - end - - mutable struct MutableWithOffsetRNG - seed::UInt64 - alg::AK.Philox - offset::UInt64 - end - - struct ImmutableWithOffsetRNG - seed::UInt64 - alg::AK.Philox - offset::UInt64 - end - - rng_no_offset = MutableNoOffsetRNG(UInt64(0x1234), AK.Philox()) - x1 = array_from_host(zeros(Float32, 256)) - x2 = array_from_host(zeros(Float32, 256)) - AK.rand!(rng_no_offset, x1; prefer_threads, block_size=64) - AK.rand!(rng_no_offset, x2; prefer_threads, block_size=64) - @test Array(x1) == Array(x2) - - rng_stream = MutableWithOffsetRNG(UInt64(0x1234), AK.Philox(), UInt64(0)) + @testset "counter rng offset behavior" begin + rng_stream = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox(), offset=UInt64(17)) s1 = array_from_host(zeros(Float32, 100)) s2 = array_from_host(zeros(Float32, 100)) s12 = array_from_host(zeros(Float32, 200)) AK.rand!(rng_stream, s1; prefer_threads, block_size=64) + @test rng_stream.offset == UInt64(117) AK.rand!(rng_stream, s2; prefer_threads, block_size=64) - AK.rand!(AK.CounterRNG(UInt64(0x1234); alg=AK.Philox()), s12; prefer_threads, block_size=64) + @test rng_stream.offset == UInt64(217) + + rng_once = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox(), offset=UInt64(17)) + AK.rand!(rng_once, s12; prefer_threads, block_size=64) @test vcat(Array(s1), Array(s2)) == Array(s12) - @test rng_stream.offset == UInt64(200) + @test rng_once.offset == UInt64(217) + + empty = array_from_host(zeros(Float32, 0)) + stream_offset = rng_stream.offset + AK.rand!(rng_stream, empty; prefer_threads, block_size=64) + 
@test rng_stream.offset == stream_offset + + @test AK.reset!(rng_stream) === rng_stream + @test rng_stream.offset == UInt64(0) - rng_imm = ImmutableWithOffsetRNG(UInt64(0x1234), AK.Philox(), UInt64(17)) y1 = array_from_host(zeros(Float32, 64)) y2 = array_from_host(zeros(Float32, 64)) - AK.rand!(rng_imm, y1; prefer_threads, block_size=64) - AK.rand!(rng_imm, y2; prefer_threads, block_size=64) + AK.rand!(rng_stream, y1; prefer_threads, block_size=64) + AK.rand!(AK.CounterRNG(UInt64(0x1234); alg=AK.Philox()), y2; prefer_threads, block_size=64) @test Array(y1) == Array(y2) - @test AK.reset!(rng_stream) === rng_stream - @test rng_stream.offset == UInt64(0) - @test_throws ArgumentError AK.reset!(rng_no_offset) - @test_throws ArgumentError AK.reset!(rng_imm) + mutable struct DummyRNG + seed::UInt64 + alg::AK.Philox + offset::UInt64 + end + + rng_dummy = DummyRNG(UInt64(0x1234), AK.Philox(), UInt64(0)) + x = array_from_host(zeros(Float32, 16)) + @test_throws MethodError AK.rand!(rng_dummy, x; prefer_threads, block_size=64) + @test_throws MethodError AK.reset!(rng_dummy) end @@ -146,38 +139,38 @@ end @test AK._counter_from_index(1) == UInt64(0) @test AK._counter_from_index(17) == UInt64(16) - @test AK.raw_uint_type(UInt8) === UInt32 - @test AK.raw_uint_type(UInt16) === UInt32 - @test AK.raw_uint_type(UInt32) === UInt32 - @test AK.raw_uint_type(Int8) === UInt32 - @test AK.raw_uint_type(Int16) === UInt32 - @test AK.raw_uint_type(Int32) === UInt32 + @test AK._rand_scalar_uint_type(UInt8) === UInt32 + @test AK._rand_scalar_uint_type(UInt16) === UInt32 + @test AK._rand_scalar_uint_type(UInt32) === UInt32 + @test AK._rand_scalar_uint_type(Int8) === UInt32 + @test AK._rand_scalar_uint_type(Int16) === UInt32 + @test AK._rand_scalar_uint_type(Int32) === UInt32 if RUN_FLOAT16_RAND_TESTS - @test AK.raw_uint_type(Float16) === UInt32 + @test AK._rand_scalar_uint_type(Float16) === UInt32 end - @test AK.raw_uint_type(Float32) === UInt32 - @test AK.raw_uint_type(UInt64) === UInt64 - @test 
AK.raw_uint_type(Int64) === UInt64 - @test AK.raw_uint_type(Bool) === UInt32 + @test AK._rand_scalar_uint_type(Float32) === UInt32 + @test AK._rand_scalar_uint_type(UInt64) === UInt64 + @test AK._rand_scalar_uint_type(Int64) === UInt64 + @test AK._rand_scalar_uint_type(Bool) === UInt32 if RUN_FLOAT64_RAND_TESTS - @test AK.raw_uint_type(Float64) === UInt64 + @test AK._rand_scalar_uint_type(Float64) === UInt64 end - @test AK.from_uint(UInt8, UInt32(0xabcdef01)) == UInt8(0xab) - @test AK.from_uint(UInt16, UInt32(0xabcdef01)) == UInt16(0xabcd) - @test AK.from_uint(UInt32, 0b1010 % UInt32) == 0b1010 % UInt32 - @test AK.from_uint(UInt64, 0b1010 % UInt64) == 0b1010 % UInt64 - @test AK.from_uint(Int8, UInt32(0xff000000)) == Int8(-1) - @test AK.from_uint(Int16, UInt32(0xffff0000)) == Int16(-1) - @test AK.from_uint(Int32, 0b11111111111111111111111111111111 % UInt32) == Int32(-1) - @test AK.from_uint( + @test AK._rand_scalar_from_uint(UInt8, UInt32(0xabcdef01)) == UInt8(0xab) + @test AK._rand_scalar_from_uint(UInt16, UInt32(0xabcdef01)) == UInt16(0xabcd) + @test AK._rand_scalar_from_uint(UInt32, 0b1010 % UInt32) == 0b1010 % UInt32 + @test AK._rand_scalar_from_uint(UInt64, 0b1010 % UInt64) == 0b1010 % UInt64 + @test AK._rand_scalar_from_uint(Int8, UInt32(0xff000000)) == Int8(-1) + @test AK._rand_scalar_from_uint(Int16, UInt32(0xffff0000)) == Int16(-1) + @test AK._rand_scalar_from_uint(Int32, 0b11111111111111111111111111111111 % UInt32) == Int32(-1) + @test AK._rand_scalar_from_uint( Int64, 0b1111111111111111111111111111111111111111111111111111111111111111 % UInt64 ) == Int64(-1) if RUN_FLOAT16_RAND_TESTS - @test AK.from_uint(Float16, UInt32(0)) == Float16(0) + @test AK._rand_scalar_from_uint(Float16, UInt32(0)) == Float16(0) end - @test AK.from_uint(Bool, UInt32(0)) == false - @test AK.from_uint(Bool, UInt32(1)) == true + @test AK._rand_scalar_from_uint(Bool, UInt32(0)) == false + @test AK._rand_scalar_from_uint(Bool, UInt32(1)) == true if RUN_FLOAT16_RAND_TESTS @test 
AK.uint32_to_unit_float16(UInt32(0)) == Float16(0) From a9582a021b9592708bccf939fdf0fee36b59fc3c Mon Sep 17 00:00:00 2001 From: fjbarter Date: Wed, 25 Mar 2026 02:39:47 +0000 Subject: [PATCH 13/18] initial randn! --- docs/src/api/rand.md | 10 ++ prototype/rand/plot/Project.toml | 3 + prototype/rand/randn.jl | 53 +++++++ src/rand/rand.jl | 5 +- src/rand/randn.jl | 258 +++++++++++++++++++++++++++++++ test/randn.jl | 228 +++++++++++++++++++++++++++ test/runtests.jl | 1 + 7 files changed, 557 insertions(+), 1 deletion(-) create mode 100644 prototype/rand/plot/Project.toml create mode 100644 prototype/rand/randn.jl create mode 100644 src/rand/randn.jl create mode 100644 test/randn.jl diff --git a/docs/src/api/rand.md b/docs/src/api/rand.md index aab3e5d..2ac9d91 100644 --- a/docs/src/api/rand.md +++ b/docs/src/api/rand.md @@ -34,6 +34,11 @@ Supported element types: - `Float16`, `Float32`, `Float64` - `Bool` +`AK.randn!` fills arrays with standard normal samples and currently supports: +- `Float16`, `Float32`, `Float64` + +`AK.randn!` uses Box-Muller with open-interval uniforms in `(0, 1)` from a branch-free midpoint mapping. + The core of the random number generation produces either a `UInt32` or `UInt64` depending on the width of the requested element type. That `UInt` is then either: - Unsigned integers: returned as-is or truncated if necessary. @@ -82,6 +87,10 @@ AK.rand!(rng, v2) # Convenience (fresh auto-seeded RNG on each call) y = oneArray{Float32}(undef, 1024) AK.rand!(y) + +# Standard normal samples +z = oneArray{Float32}(undef, 1024) +AK.randn!(rng, z) ``` ```@docs @@ -89,4 +98,5 @@ AcceleratedKernels.CounterRNG AcceleratedKernels.CounterRNGAlgorithm AcceleratedKernels.reset! AcceleratedKernels.rand! +AcceleratedKernels.randn! 
``` diff --git a/prototype/rand/plot/Project.toml b/prototype/rand/plot/Project.toml new file mode 100644 index 0000000..a95f271 --- /dev/null +++ b/prototype/rand/plot/Project.toml @@ -0,0 +1,3 @@ +[deps] +AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" +Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" diff --git a/prototype/rand/randn.jl b/prototype/rand/randn.jl new file mode 100644 index 0000000..a21cc08 --- /dev/null +++ b/prototype/rand/randn.jl @@ -0,0 +1,53 @@ +using BenchmarkTools +using CUDA + +import AcceleratedKernels as AK + + +const N = 100_000_000 +const GPU_BLOCK_SIZE = 256 + +const RNG_PHILOX = AK.CounterRNG(0x12345678; alg=AK.Philox()) + +TestType = Float32 + +x_cuda = CuArray{TestType}(undef, N) +x_philox = CuArray{TestType}(undef, N) +x_cpu = Vector{TestType}(undef, N) + + +function run_cuda_randn!(x) + CUDA.randn!(x) + CUDA.synchronize() + return x +end + + +function run_ak_randn_gpu!(rng, x) + AK.randn!(rng, x; block_size=GPU_BLOCK_SIZE) + AK.synchronize(AK.get_backend(x)) + return x +end + + +function run_ak_randn_cpu!(rng, x) + AK.randn!(rng, x) + return x +end + +# warmup compile +run_cuda_randn!(x_cuda) +run_ak_randn_gpu!(RNG_PHILOX, x_philox) + +println("N = ", N) +println("CPU threads: ", Threads.nthreads()) + +println("\nCUDA.randn! benchmark (CuArray{$TestType}, in-place)") +display(@benchmark run_cuda_randn!($x_cuda)) + +println("\nAK.randn! Philox benchmark (GPU, CuArray{$TestType})") +display(@benchmark run_ak_randn_gpu!($RNG_PHILOX, $x_philox)) + +# println("\nAK.randn! benchmark (CPU, Vector{$TestType}, Philox)") +# display(@benchmark run_ak_randn_cpu!($RNG_PHILOX, $x_cpu)) + diff --git a/src/rand/rand.jl b/src/rand/rand.jl index 833bcae..bafd0bd 100644 --- a/src/rand/rand.jl +++ b/src/rand/rand.jl @@ -66,6 +66,9 @@ include("splitmix.jl") include("philox.jl") include("threefry.jl") +# Normally distributed scalar generators and randn! 
+include("randn.jl") + @@ -121,7 +124,7 @@ function rand!( @argcheck T <: ALLOWED_RAND_SCALARS "Unsupported eltype $T. Supported: $(ALLOWED_RAND_SCALARS)" - # Local isbits captures from potentially mutable rng object + # Local isbits captures from mutable rng object seed, alg, initial_offset = rng.seed, rng.alg, rng.offset foreachindex( diff --git a/src/rand/randn.jl b/src/rand/randn.jl new file mode 100644 index 0000000..f6acb0e --- /dev/null +++ b/src/rand/randn.jl @@ -0,0 +1,258 @@ +const ALLOWED_RANDN_SCALARS = Union{Float16, Float32, Float64} + +const U24_MAX_SAFE_MIDPOINT = UInt32(0x00fffffe) # 2^24 - 2 +const U53_MAX_SAFE_MIDPOINT = UInt64(0x001ffffffffffffe) # 2^53 - 2 +const MIDPOINT_SCALE_F32 = ldexp(Float32(1), -24) # 2^-24 +const MIDPOINT_SCALE_F64 = ldexp(Float64(1), -53) # 2^-53 + + + + +#= +The below Float constructions are not duplicates of those in utilities.jl - they are needed to +ensure an interval of (0, 1) as opposed to [0, 1). Achieving this purely logically with midpoint +mapping means we can avoid a check for producing a 0 (which would normally cause a redraw). +Avoiding 0 is essential for Box-Muller due to the logarithm functions. +=# + + +# Convert random UInt32 bits to Float32 in (0, 1) using midpoint mapping on a 24-bit grid. +@inline function uint32_to_open_unit_float32_midpoint(u::UInt32)::Float32 + # `min` keeps the top midpoint below one after Float32 rounding. + k = min(u >> 8, U24_MAX_SAFE_MIDPOINT) + return (Float32(k) + 0.5f0) * MIDPOINT_SCALE_F32 +end + + +# Convert random UInt64 bits to Float64 in (0, 1) using midpoint mapping on a 53-bit grid. +@inline function uint64_to_open_unit_float64_midpoint(u::UInt64)::Float64 + # `min` keeps the top midpoint below one after Float64 rounding. + k = min(u >> 11, U53_MAX_SAFE_MIDPOINT) + return (Float64(k) + 0.5) * MIDPOINT_SCALE_F64 +end + + +# Float16 path reuses Float32 midpoint sampling for robust math in Box-Muller. 
+@inline function rand_open01( + seed::UInt64, + alg::CounterRNGAlgorithm, + counter::UInt64, + ::Type{Float16}, +)::Float16 + return Float16(rand_open01(seed, alg, counter, Float32)) +end + + +@inline function rand_open01( + seed::UInt64, + alg::CounterRNGAlgorithm, + counter::UInt64, + ::Type{Float32}, +)::Float32 + return uint32_to_open_unit_float32_midpoint(rand_uint(seed, alg, counter, UInt32)) +end + + +@inline function rand_open01( + seed::UInt64, + alg::CounterRNGAlgorithm, + counter::UInt64, + ::Type{Float64}, +)::Float64 + return uint64_to_open_unit_float64_midpoint(rand_uint(seed, alg, counter, UInt64)) +end + + +@inline function rand_open01(::UInt64, ::CounterRNGAlgorithm, ::UInt64, ::Type{T}) where {T} + throw(ArgumentError( + "Unsupported open-interval random type $(T). Supported: $(ALLOWED_RANDN_SCALARS)" + )) +end + + +@inline function randn_pair( + seed::UInt64, + alg::CounterRNGAlgorithm, + pair_counter::UInt64, + ::Type{Float16}, +)::Tuple{Float16, Float16} + z0, z1 = randn_pair(seed, alg, pair_counter, Float32) + return Float16(z0), Float16(z1) +end + + +@inline function randn_pair( + seed::UInt64, + alg::CounterRNGAlgorithm, + pair_counter::UInt64, + ::Type{Float32}, +)::Tuple{Float32, Float32} + u = rand_uint(seed, alg, pair_counter, UInt64) + u1 = uint32_to_open_unit_float32_midpoint(_u32_lo(u)) + u2 = uint32_to_open_unit_float32_midpoint(_u32_hi(u)) + radius = sqrt(-2.0f0 * log(u1)) + theta = Float32(2pi) * u2 + stheta, ctheta = sincos(theta) + return radius * ctheta, radius * stheta +end + + +@inline function randn_pair( + seed::UInt64, + alg::CounterRNGAlgorithm, + pair_counter::UInt64, + ::Type{Float64}, +)::Tuple{Float64, Float64} + c0 = pair_counter << 1 + u1 = rand_open01(seed, alg, c0, Float64) + u2 = rand_open01(seed, alg, c0 + UInt64(1), Float64) + radius = sqrt(-2.0 * log(u1)) + theta = Float64(2pi) * u2 + stheta, ctheta = sincos(theta) + return radius * ctheta, radius * stheta +end + + +@inline function randn_pair(::UInt64, 
::CounterRNGAlgorithm, ::UInt64, ::Type{T}) where {T} + throw(ArgumentError( + "Unsupported normal random type $(T). Supported: $(ALLOWED_RANDN_SCALARS)" + )) +end + + +@inline function randn_scalar( + seed::UInt64, + alg::CounterRNGAlgorithm, + normal_counter::UInt64, + ::Type{T}, +)::T where {T <: ALLOWED_RANDN_SCALARS} + pair_counter = normal_counter >> 1 + z0, z1 = randn_pair(seed, alg, pair_counter, T) + return iszero(normal_counter & UInt64(0x1)) ? z0 : z1 +end + + +@inline function randn_scalar(::UInt64, ::CounterRNGAlgorithm, ::UInt64, ::Type{T}) where {T} + throw(ArgumentError( + "Unsupported normal random scalar type $(T). Supported: $(ALLOWED_RANDN_SCALARS)" + )) +end + + +""" + randn!( + rng::CounterRNG, + v::AbstractArray{T}, + backend::Backend=get_backend(v); + + # CPU settings + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + + # Implementation choice + prefer_threads::Bool=true, + + # GPU settings + block_size::Int=256, + ) where {T <: AbstractFloat} + +Fill `v` in-place with pseudo-random samples from a standard normal distribution. + +For `v[i]`, the normal stream counter is `rng.offset + UInt64(i - 1)` in linear indexing order. +Values are generated using Box-Muller from midpoint-open uniforms in `(0, 1)`. + +After filling `v`, `rng.offset` advances by `length(v)`. +""" +function randn!( + rng::CounterRNG, + v::AbstractArray{T}, + backend::Backend=get_backend(v); + + # CPU settings + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + prefer_threads::Bool=true, + + # GPU settings + block_size::Int=256, +) where T + + @argcheck T <: ALLOWED_RANDN_SCALARS "Unsupported eltype $T. Supported: $(ALLOWED_RANDN_SCALARS)" + + isempty(v) && return v + + # Local isbits captures from mutable rng object. + seed, alg, initial_offset = rng.seed, rng.alg, rng.offset + len = length(v) + pair_start = initial_offset >> 1 + + # Even stream offset is the common path and maps pair `i` to output indices `(2i-1, 2i)`. 
+ if iszero(initial_offset & UInt64(0x1)) + pair_count = cld(len, 2) + pair_indices = Base.OneTo(pair_count) + + # Fully branch-free hot path when both offset and length are even. + if iseven(len) + foreachindex( + pair_indices, backend; + max_tasks, min_elems, prefer_threads, block_size, + ) do i + pair_counter = pair_start + _counter_from_index(i) + z0, z1 = randn_pair(seed, alg, pair_counter, T) + i0 = (i << 1) - 1 + @inbounds v[i0] = z0 + @inbounds v[i0 + 1] = z1 + end + else + foreachindex( + pair_indices, backend; + max_tasks, min_elems, prefer_threads, block_size, + ) do i + pair_counter = pair_start + _counter_from_index(i) + z0, z1 = randn_pair(seed, alg, pair_counter, T) + i0 = (i << 1) - 1 + @inbounds v[i0] = z0 + i1 = i0 + 1 + + if i1 <= len + @inbounds v[i1] = z1 + end + end + end + else + # Odd stream offset shifts pair `i` to `(2i-2, 2i-1)`; only the first z0 is out of range. + pair_count = cld(len + 1, 2) + pair_indices = Base.OneTo(pair_count) + + foreachindex( + pair_indices, backend; + max_tasks, min_elems, prefer_threads, block_size, + ) do i + pair_counter = pair_start + _counter_from_index(i) + z0, z1 = randn_pair(seed, alg, pair_counter, T) + i0 = (i << 1) - 2 + + if i0 >= 1 + @inbounds v[i0] = z0 + end + + i1 = i0 + 1 + if i1 <= len + @inbounds v[i1] = z1 + end + end + end + + rng.offset += UInt64(len) + + v +end + + +function randn!( + v::AbstractArray, + args...; + kwargs..., +) + return randn!(CounterRNG(), v, args...; kwargs...) +end diff --git a/test/randn.jl b/test/randn.jl new file mode 100644 index 0000000..c0e9c56 --- /dev/null +++ b/test/randn.jl @@ -0,0 +1,228 @@ +const RANDN_ALGS = (AK.SplitMix64(), AK.Philox(), AK.Threefry()) +const RANDN_FLOAT_TYPES_BACKEND = IS_CPU_BACKEND ? 
(Float16, Float32, Float64) : (Float32,) + + +_is_finite(v) = all(isfinite, v) + + +function _randn_fill_reference!( + rng, + x::AbstractArray{T}; + counter_offset::UInt64=UInt64(0), +) where {T <: AK.ALLOWED_RANDN_SCALARS} + @inbounds for i in eachindex(x) + x[i] = AK.randn_scalar(rng.seed, rng.alg, counter_offset + UInt64(i - one(i)), T) + end + return x +end + + +function _assert_randn_matches_reference!(rng, x; kwargs...) + counter_offset = rng.offset + AK.randn!(rng, x; kwargs...) + ref = zeros(eltype(x), size(x)) + _randn_fill_reference!(rng, ref; counter_offset) + @test Array(x) == ref + return x +end + + +@testset "randn" begin + @testset "open interval helpers" begin + @test 0.0f0 < AK.uint32_to_open_unit_float32_midpoint(UInt32(0)) < 1.0f0 + @test 0.0f0 < AK.uint32_to_open_unit_float32_midpoint(typemax(UInt32)) < 1.0f0 + @test 0.0 < AK.uint64_to_open_unit_float64_midpoint(UInt64(0)) < 1.0 + @test 0.0 < AK.uint64_to_open_unit_float64_midpoint(typemax(UInt64)) < 1.0 + end + + + @testset "rand_open01 and randn_scalar" begin + seed = UInt64(0x123456789abcdef) + for alg in RANDN_ALGS + for c in (UInt64(0), UInt64(1), UInt64(17), UInt64(1023)) + u32 = AK.rand_open01(seed, alg, c, Float32) + @test 0.0f0 < u32 < 1.0f0 + if IS_CPU_BACKEND + u64 = AK.rand_open01(seed, alg, c, Float64) + @test 0.0 < u64 < 1.0 + end + end + + for T in RANDN_FLOAT_TYPES_BACKEND + s0 = AK.randn_scalar(seed, alg, UInt64(42), T) + s1 = AK.randn_scalar(seed, alg, UInt64(43), T) + @test s0 isa T + @test s1 isa T + @test isfinite(s0) + @test isfinite(s1) + @test s0 == AK.randn_scalar(seed, alg, UInt64(42), T) + @test s1 == AK.randn_scalar(seed, alg, UInt64(43), T) + + p0, p1 = AK.randn_pair(seed, alg, UInt64(21), T) + @test AK.randn_scalar(seed, alg, UInt64(42), T) == p0 + @test AK.randn_scalar(seed, alg, UInt64(43), T) == p1 + end + end + + @test_throws ArgumentError AK.randn_scalar(seed, AK.Philox(), UInt64(0), UInt32) + end + + + @testset "randn! 
explicit rng" begin + lengths = (0, 1, 31, 32, 33, 257, 1024) + + for alg in RANDN_ALGS + rng = AK.CounterRNG(0x123456789abcdef; alg) + + for T in RANDN_FLOAT_TYPES_BACKEND + for len in lengths + x = array_from_host(zeros(T, len)) + _assert_randn_matches_reference!(rng, x; prefer_threads, block_size=64) + @test _is_finite(Array(x)) + end + end + + for T in RANDN_FLOAT_TYPES_BACKEND + x1 = array_from_host(zeros(T, 2048)) + x2 = array_from_host(zeros(T, 2048)) + rng1 = AK.CounterRNG(rng.seed; alg=rng.alg) + rng2 = AK.CounterRNG(rng.seed; alg=rng.alg) + AK.randn!(rng1, x1; prefer_threads, block_size=64) + AK.randn!(rng2, x2; prefer_threads, block_size=257) + @test Array(x1) == Array(x2) + end + + for T in RANDN_FLOAT_TYPES_BACKEND + rng1 = AK.CounterRNG(rng.seed; alg=rng.alg) + rng2 = AK.CounterRNG(rng.seed + UInt64(1); alg=rng.alg) + x1 = array_from_host(zeros(T, 2048)) + x2 = array_from_host(zeros(T, 2048)) + AK.randn!(rng1, x1; prefer_threads, block_size=64) + AK.randn!(rng2, x2; prefer_threads, block_size=64) + @test Array(x1) != Array(x2) + end + end + end + + + @testset "counter rng offset behavior" begin + rng_stream = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox(), offset=UInt64(17)) + s1 = array_from_host(zeros(Float32, 99)) + s2 = array_from_host(zeros(Float32, 101)) + s12 = array_from_host(zeros(Float32, 200)) + AK.randn!(rng_stream, s1; prefer_threads, block_size=64) + @test rng_stream.offset == UInt64(116) + AK.randn!(rng_stream, s2; prefer_threads, block_size=64) + @test rng_stream.offset == UInt64(217) + + rng_once = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox(), offset=UInt64(17)) + AK.randn!(rng_once, s12; prefer_threads, block_size=64) + @test vcat(Array(s1), Array(s2)) == Array(s12) + @test rng_once.offset == UInt64(217) + + empty = array_from_host(zeros(Float32, 0)) + stream_offset = rng_stream.offset + AK.randn!(rng_stream, empty; prefer_threads, block_size=64) + @test rng_stream.offset == stream_offset + + @test AK.reset!(rng_stream) === 
rng_stream + @test rng_stream.offset == UInt64(0) + + y1 = array_from_host(zeros(Float32, 64)) + y2 = array_from_host(zeros(Float32, 64)) + AK.randn!(rng_stream, y1; prefer_threads, block_size=64) + AK.randn!(AK.CounterRNG(UInt64(0x1234); alg=AK.Philox()), y2; prefer_threads, block_size=64) + @test Array(y1) == Array(y2) + end + + + @testset "reset!" begin + rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox()) + x1 = array_from_host(zeros(Float32, 512)) + x2 = array_from_host(zeros(Float32, 512)) + + AK.randn!(rng, x1; prefer_threads, block_size=64) + @test rng.offset == UInt64(512) + @test AK.reset!(rng) === rng + @test rng.offset == UInt64(0) + AK.randn!(rng, x2; prefer_threads, block_size=64) + + @test Array(x1) == Array(x2) + end + + + @testset "randn! n-dimensional and views" begin + rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox()) + + for T in RANDN_FLOAT_TYPES_BACKEND + xnd = array_from_host(zeros(T, 7, 11, 5)) + _assert_randn_matches_reference!(rng, xnd; prefer_threads, block_size=128) + end + + if IS_CPU_BACKEND + for T in RANDN_FLOAT_TYPES_BACKEND + base = zeros(T, 64) + view_x = @view base[2:2:end] + AK.randn!( + rng, view_x; + max_tasks=Threads.nthreads(), + min_elems=1, + prefer_threads=true + ) + ref_view = zeros(T, length(view_x)) + _randn_fill_reference!( + rng, ref_view; + counter_offset=rng.offset - UInt64(length(view_x)), + ) + @test collect(view_x) == ref_view + end + end + end + + + @testset "randn! 
convenience" begin + ref1 = array_from_host(zeros(Float32, 1024)) + ref2 = array_from_host(zeros(Float32, 1024)) + x1 = array_from_host(zeros(Float32, 1024)) + x2 = array_from_host(zeros(Float32, 1024)) + + Random.seed!(0xabcdef) + seed1 = Random.rand(Random.default_rng(), UInt64) + AK.randn!(AK.CounterRNG(seed1; alg=AK.Philox()), ref1; prefer_threads, block_size=64) + seed2 = Random.rand(Random.default_rng(), UInt64) + AK.randn!(AK.CounterRNG(seed2; alg=AK.Philox()), ref2; prefer_threads, block_size=64) + + Random.seed!(0xabcdef) + AK.randn!(x1; prefer_threads, block_size=64) + AK.randn!(x2; prefer_threads, block_size=64) + @test Array(x1) == Array(ref1) + @test Array(x2) == Array(ref2) + + x_bad = zeros(UInt32, 16) + @test_throws ArgumentError AK.randn!(x_bad; prefer_threads) + @test_throws ArgumentError AK.randn!(AK.CounterRNG(0x1), x_bad; prefer_threads) + end + + + @testset "moments sanity" begin + n = 200_000 + rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox()) + + for T in RANDN_FLOAT_TYPES_BACKEND + x = array_from_host(zeros(T, n)) + AK.randn!(rng, x; prefer_threads, block_size=128) + xa = Float64.(Array(x)) + + m = sum(xa) / length(xa) + v = sum((xi - m)^2 for xi in xa) / length(xa) + + if T === Float16 + @test abs(m) < 0.1 + @test abs(v - one(v)) < 0.15 + else + @test abs(m) < 0.01 + @test abs(v - one(v)) < 0.03 + end + end + end +end diff --git a/test/runtests.jl b/test/runtests.jl index d1a2d69..5d3a6ad 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -76,6 +76,7 @@ include("partition.jl") include("looping.jl") include("map.jl") include("rand.jl") +include("randn.jl") include("sort.jl") include("reduce.jl") include("accumulate.jl") From 61aa11dc582ac474df2e05bda3cbad7d6f34c208 Mon Sep 17 00:00:00 2001 From: fjbarter Date: Thu, 26 Mar 2026 13:23:44 +0000 Subject: [PATCH 14/18] ensure compile-time initial index bias, now beating CUDA for both odd and even offsets + array lengths. 
Added rand() and randn() convenience constructors using KernelAbstractions.allocate based on the provided backend, Type, and dims --- prototype/rand/Project.toml | 1 + prototype/rand/randn.jl | 2 +- src/rand/rand.jl | 56 ++++++++++++ src/rand/randn.jl | 167 ++++++++++++++++++++++++------------ src/rand/utilities.jl | 19 ++++ test/rand.jl | 40 +++++++++ test/randn.jl | 118 +++++++++++++++++-------- 7 files changed, 309 insertions(+), 94 deletions(-) diff --git a/prototype/rand/Project.toml b/prototype/rand/Project.toml index 7757b4d..675e6c7 100644 --- a/prototype/rand/Project.toml +++ b/prototype/rand/Project.toml @@ -3,5 +3,6 @@ AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" PProf = "e4faabce-9ead-11e9-39d9-4379958e3056" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" diff --git a/prototype/rand/randn.jl b/prototype/rand/randn.jl index a21cc08..8370d91 100644 --- a/prototype/rand/randn.jl +++ b/prototype/rand/randn.jl @@ -7,7 +7,7 @@ import AcceleratedKernels as AK const N = 100_000_000 const GPU_BLOCK_SIZE = 256 -const RNG_PHILOX = AK.CounterRNG(0x12345678; alg=AK.Philox()) +const RNG_PHILOX = AK.CounterRNG(0x12345678; alg=AK.Philox(), offset=0x0) TestType = Float32 diff --git a/src/rand/rand.jl b/src/rand/rand.jl index bafd0bd..c36a61e 100644 --- a/src/rand/rand.jl +++ b/src/rand/rand.jl @@ -147,3 +147,59 @@ function rand!( ) return rand!(CounterRNG(), v, args...; kwargs...) end + + +""" + rand( + rng::CounterRNG, + backend::Backend, + ::Type{T}, + dims::Integer...; + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + prefer_threads::Bool=true, + block_size::Int=256, + ) where T + +Allocate an array of element type `T` on `backend` with shape `dims`, fill it in-place via +[`rand!`](@ref), and return it.
+""" +function rand( + rng::CounterRNG, + backend::Backend, + ::Type{T}, + dims::Integer...; + + # CPU settings + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + prefer_threads::Bool=true, + + # GPU settings + block_size::Int=256, +) where T + return _allocate_and_fill( + rand!, rng, backend, T, dims...; + max_tasks, min_elems, prefer_threads, block_size, + ) +end + + +function rand( + backend::Backend, + ::Type{T}, + dims::Integer...; + + # CPU settings + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + prefer_threads::Bool=true, + + # GPU settings + block_size::Int=256, +) where T + return rand( + CounterRNG(), backend, T, dims...; + max_tasks, min_elems, prefer_threads, block_size, + ) +end diff --git a/src/rand/randn.jl b/src/rand/randn.jl index f6acb0e..f3cce14 100644 --- a/src/rand/randn.jl +++ b/src/rand/randn.jl @@ -139,6 +139,59 @@ end end +# `Val{ODD}` keeps parity in the type domain so each specialization (`ODD==0` / `ODD==1`) +# can fold index bias at compile time. +# - `Val{0}` => even-offset pair writes at indices `(2i-1, 2i)` so bias is `-1` +# - `Val{1}` => odd-offset pair writes at indices `(2i, 2i+1)` after prefix handling so bias is `0` +@inline _randn_i0_bias(::Val{0}) = -1 +@inline _randn_i0_bias(::Val{1}) = 0 + + +@inline function _randn_core!( + v::AbstractArray{T}, seed, alg, initial_offset, + backend, max_tasks, min_elems, prefer_threads, block_size, + ::Val{ODD}, +) where {T, ODD} + + len = length(v) + prefix_len = ODD + + # If offset is odd, need to individually handle the first element. + prefix_len == 1 && @allowscalar @inbounds v[1] = randn_scalar(seed, alg, initial_offset, T) + + # Stream is now even-aligned, so can foreachindex through the pairs. + pair_start = (initial_offset + UInt64(prefix_len)) >> 1 + + # Capture `Val(ODD)` into the closure so bias stays a compile-time constant inside the loop. 
+ odd_val = Val(ODD) + i0_bias = _randn_i0_bias(odd_val) + remaining_len = len - prefix_len + pair_count = remaining_len >> 1 + + if pair_count > 0 + foreachindex( + Base.OneTo(pair_count), backend; + max_tasks, min_elems, prefer_threads, block_size, + ) do i + pair_counter = pair_start + _counter_from_index(i) + z0, z1 = randn_pair(seed, alg, pair_counter, T) + i0 = (i << 1) + _randn_i0_bias(odd_val) + @inbounds v[i0] = z0 + @inbounds v[i0 + 1] = z1 + end + end + + # If an extra element remains after pair writing, fill it individually. + tail_index = (pair_count << 1) + i0_bias + 2 + if tail_index <= len + tail_counter = initial_offset + UInt64(tail_index - 1) + @allowscalar @inbounds v[tail_index] = randn_scalar(seed, alg, tail_counter, T) + end + + return v +end + + """ randn!( rng::CounterRNG, @@ -183,67 +236,19 @@ function randn!( # Local isbits captures from mutable rng object. seed, alg, initial_offset = rng.seed, rng.alg, rng.offset - len = length(v) - pair_start = initial_offset >> 1 - - # Even stream offset is the common path and maps pair `i` to output indices `(2i-1, 2i)`. - if iszero(initial_offset & UInt64(0x1)) - pair_count = cld(len, 2) - pair_indices = Base.OneTo(pair_count) - - # Fully branch-free hot path when both offset and length are even. 
- if iseven(len) - foreachindex( - pair_indices, backend; - max_tasks, min_elems, prefer_threads, block_size, - ) do i - pair_counter = pair_start + _counter_from_index(i) - z0, z1 = randn_pair(seed, alg, pair_counter, T) - i0 = (i << 1) - 1 - @inbounds v[i0] = z0 - @inbounds v[i0 + 1] = z1 - end - else - foreachindex( - pair_indices, backend; - max_tasks, min_elems, prefer_threads, block_size, - ) do i - pair_counter = pair_start + _counter_from_index(i) - z0, z1 = randn_pair(seed, alg, pair_counter, T) - i0 = (i << 1) - 1 - @inbounds v[i0] = z0 - i1 = i0 + 1 - - if i1 <= len - @inbounds v[i1] = z1 - end - end - end - else - # Odd stream offset shifts pair `i` to `(2i-2, 2i-1)`; only the first z0 is out of range. - pair_count = cld(len + 1, 2) - pair_indices = Base.OneTo(pair_count) - - foreachindex( - pair_indices, backend; - max_tasks, min_elems, prefer_threads, block_size, - ) do i - pair_counter = pair_start + _counter_from_index(i) - z0, z1 = randn_pair(seed, alg, pair_counter, T) - i0 = (i << 1) - 2 - if i0 >= 1 - @inbounds v[i0] = z0 - end + core_args = ( + v, seed, alg, initial_offset, backend, max_tasks, min_elems, prefer_threads, block_size + ) - i1 = i0 + 1 - if i1 <= len - @inbounds v[i1] = z1 - end - end + # Dispatch depending on required initial index bias + if iseven(initial_offset) + _randn_core!(core_args..., Val(0)) + else + _randn_core!(core_args..., Val(1)) end - rng.offset += UInt64(len) + rng.offset += UInt64(length(v)) v end @@ -256,3 +261,51 @@ function randn!( ) return randn!(CounterRNG(), v, args...; kwargs...) end + + +""" + randn( + rng::CounterRNG, + backend::Backend, + ::Type{T}, + dims::Integer...; + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + prefer_threads::Bool=true, + block_size::Int=256, + ) where T + +Allocate an array of element type `T` on `backend` with shape `dims`, fill it in-place via +[`randn!`](@ref), and return it. 
+""" +function randn( + rng::CounterRNG, + backend::Backend, + ::Type{T}, + dims::Integer...; + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + prefer_threads::Bool=true, + block_size::Int=256, +) where T + return _allocate_and_fill( + randn!, rng, backend, T, dims...; + max_tasks, min_elems, prefer_threads, block_size, + ) +end + + +function randn( + backend::Backend, + ::Type{T}, + dims::Integer...; + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + prefer_threads::Bool=true, + block_size::Int=256, +) where T + return randn( + CounterRNG(), backend, T, dims...; + max_tasks, min_elems, prefer_threads, block_size, + ) +end diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl index f2159c5..b2c60db 100644 --- a/src/rand/utilities.jl +++ b/src/rand/utilities.jl @@ -15,6 +15,25 @@ @inline _counter_from_index(i)::UInt64 = UInt64(i - one(i)) +# Shared allocation + fill helper for rand/randn convenience constructors. +@inline function _allocate_and_fill( + fill!, + rng::CounterRNG, + backend::Backend, + ::Type{T}, + dims::Integer...; + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + prefer_threads::Bool=true, + block_size::Int=256, +) where {T} + dims_int = Base.map(Int, dims) + v = KernelAbstractions.allocate(backend, T, dims_int) + fill!(rng, v, backend; max_tasks, min_elems, prefer_threads, block_size) + return v +end + + # Internal scalar eltypes currently supported by rand!. 
const ALLOWED_RAND_SCALARS = Union{ UInt8, UInt16, UInt32, UInt64, diff --git a/test/rand.jl b/test/rand.jl index 90e285a..d3eacca 100644 --- a/test/rand.jl +++ b/test/rand.jl @@ -362,4 +362,44 @@ end @test_throws ArgumentError AK.rand!(x_bad; prefer_threads) @test_throws ArgumentError AK.rand!(AK.CounterRNG(0x1), x_bad; prefer_threads) end + + + @testset "rand allocation convenience" begin + rng = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox()) + y = AK.rand(rng, BACKEND, Float32, Int32(6), UInt16(7); prefer_threads, block_size=64) + @test size(y) == (6, 7) + @test eltype(y) === Float32 + @test _is_unit_interval(Array(y)) + @test rng.offset == UInt64(length(y)) + + rng_alloc = AK.CounterRNG(UInt64(0x55); alg=AK.Philox()) + rng_fill = AK.CounterRNG(UInt64(0x55); alg=AK.Philox()) + y_alloc = AK.rand(rng_alloc, BACKEND, Float32, 128; prefer_threads, block_size=64) + y_fill = array_from_host(zeros(Float32, 128)) + AK.rand!(rng_fill, y_fill; prefer_threads, block_size=64) + @test Array(y_alloc) == Array(y_fill) + @test rng_alloc.offset == rng_fill.offset == UInt64(128) + + # Warm-up first call path so one-time compilation/backend init does not perturb RNG checks. + AK.rand(BACKEND, Float32, 1; prefer_threads, block_size=64) + + # Auto-seeded constructor should match explicit seed capture from default RNG. + Random.seed!(0x9abc) + seed = Random.rand(Random.default_rng(), UInt64) + ref = AK.rand(AK.CounterRNG(seed; alg=AK.Philox()), BACKEND, Float32, 64; prefer_threads, block_size=64) + Random.seed!(0x9abc) + x = AK.rand(BACKEND, Float32, 64; prefer_threads, block_size=64) + @test Array(x) == Array(ref) + + # Reseeding should reproduce the same auto-seeded draw. 
+ Random.seed!(0x7777) + x1 = AK.rand(BACKEND, Float32, 64; prefer_threads, block_size=64) + Random.seed!(0x7777) + x2 = AK.rand(BACKEND, Float32, 64; prefer_threads, block_size=64) + @test Array(x1) == Array(x2) + + @test_throws ArgumentError AK.rand(AK.CounterRNG(0x1), BACKEND, UInt128, 16; prefer_threads) + @test_throws MethodError AK.rand(AK.CounterRNG(0x1), BACKEND, Float32, 16; prefer_threads, bad=:kwarg) + @test_throws MethodError AK.rand(BACKEND, Float32, 16; prefer_threads, bad=:kwarg) + end end diff --git a/test/randn.jl b/test/randn.jl index c0e9c56..82261c8 100644 --- a/test/randn.jl +++ b/test/randn.jl @@ -1,8 +1,12 @@ const RANDN_ALGS = (AK.SplitMix64(), AK.Philox(), AK.Threefry()) const RANDN_FLOAT_TYPES_BACKEND = IS_CPU_BACKEND ? (Float16, Float32, Float64) : (Float32,) +const RANDN_LENGTHS = (0, 1, 2, 31, 32, 33, 257, 1024) -_is_finite(v) = all(isfinite, v) +_all_finite(v) = all(isfinite, v) +_randn_reference_atol(::Type{Float16}) = 16 * eps(Float16) +_randn_reference_atol(::Type{Float32}) = 64 * eps(Float32) +_randn_reference_atol(::Type{Float64}) = 64 * eps(Float64) function _randn_fill_reference!( @@ -22,28 +26,40 @@ function _assert_randn_matches_reference!(rng, x; kwargs...) AK.randn!(rng, x; kwargs...) ref = zeros(eltype(x), size(x)) _randn_fill_reference!(rng, ref; counter_offset) - @test Array(x) == ref + xa = Array(x) + + if IS_CPU_BACKEND + @test xa == ref + else + # randn uses Box-Muller (`log`, `sqrt`, `sincos`), and GPU libm implementations are not + # bit-identical to CPU scalar libm. Stream/counter mapping is still deterministic, but the + # final Float32 values can differ by a few ULP, so we use a tight absolute tolerance here. 
+ atol = _randn_reference_atol(eltype(xa)) + @test all(isapprox.(xa, ref; rtol=zero(atol), atol)) + end + return x end @testset "randn" begin - @testset "open interval helpers" begin + @testset "scalar helpers" begin @test 0.0f0 < AK.uint32_to_open_unit_float32_midpoint(UInt32(0)) < 1.0f0 @test 0.0f0 < AK.uint32_to_open_unit_float32_midpoint(typemax(UInt32)) < 1.0f0 - @test 0.0 < AK.uint64_to_open_unit_float64_midpoint(UInt64(0)) < 1.0 - @test 0.0 < AK.uint64_to_open_unit_float64_midpoint(typemax(UInt64)) < 1.0 - end + if IS_CPU_BACKEND + @test 0.0 < AK.uint64_to_open_unit_float64_midpoint(UInt64(0)) < 1.0 + @test 0.0 < AK.uint64_to_open_unit_float64_midpoint(typemax(UInt64)) < 1.0 + end - @testset "rand_open01 and randn_scalar" begin seed = UInt64(0x123456789abcdef) for alg in RANDN_ALGS - for c in (UInt64(0), UInt64(1), UInt64(17), UInt64(1023)) - u32 = AK.rand_open01(seed, alg, c, Float32) + for counter in (UInt64(0), UInt64(1), UInt64(17), UInt64(1023)) + u32 = AK.rand_open01(seed, alg, counter, Float32) @test 0.0f0 < u32 < 1.0f0 + if IS_CPU_BACKEND - u64 = AK.rand_open01(seed, alg, c, Float64) + u64 = AK.rand_open01(seed, alg, counter, Float64) @test 0.0 < u64 < 1.0 end end @@ -51,6 +67,7 @@ end for T in RANDN_FLOAT_TYPES_BACKEND s0 = AK.randn_scalar(seed, alg, UInt64(42), T) s1 = AK.randn_scalar(seed, alg, UInt64(43), T) + @test s0 isa T @test s1 isa T @test isfinite(s0) @@ -59,26 +76,25 @@ end @test s1 == AK.randn_scalar(seed, alg, UInt64(43), T) p0, p1 = AK.randn_pair(seed, alg, UInt64(21), T) - @test AK.randn_scalar(seed, alg, UInt64(42), T) == p0 - @test AK.randn_scalar(seed, alg, UInt64(43), T) == p1 + @test p0 == AK.randn_scalar(seed, alg, UInt64(42), T) + @test p1 == AK.randn_scalar(seed, alg, UInt64(43), T) end end + @test_throws ArgumentError AK.rand_open01(seed, AK.Philox(), UInt64(0), UInt32) @test_throws ArgumentError AK.randn_scalar(seed, AK.Philox(), UInt64(0), UInt32) end @testset "randn! 
explicit rng" begin - lengths = (0, 1, 31, 32, 33, 257, 1024) - for alg in RANDN_ALGS rng = AK.CounterRNG(0x123456789abcdef; alg) for T in RANDN_FLOAT_TYPES_BACKEND - for len in lengths + for len in RANDN_LENGTHS x = array_from_host(zeros(T, len)) _assert_randn_matches_reference!(rng, x; prefer_threads, block_size=64) - @test _is_finite(Array(x)) + @test _all_finite(Array(x)) end end @@ -87,16 +103,18 @@ end x2 = array_from_host(zeros(T, 2048)) rng1 = AK.CounterRNG(rng.seed; alg=rng.alg) rng2 = AK.CounterRNG(rng.seed; alg=rng.alg) + AK.randn!(rng1, x1; prefer_threads, block_size=64) AK.randn!(rng2, x2; prefer_threads, block_size=257) @test Array(x1) == Array(x2) end for T in RANDN_FLOAT_TYPES_BACKEND - rng1 = AK.CounterRNG(rng.seed; alg=rng.alg) - rng2 = AK.CounterRNG(rng.seed + UInt64(1); alg=rng.alg) x1 = array_from_host(zeros(T, 2048)) x2 = array_from_host(zeros(T, 2048)) + rng1 = AK.CounterRNG(rng.seed; alg=rng.alg) + rng2 = AK.CounterRNG(rng.seed + UInt64(1); alg=rng.alg) + AK.randn!(rng1, x1; prefer_threads, block_size=64) AK.randn!(rng2, x2; prefer_threads, block_size=64) @test Array(x1) != Array(x2) @@ -105,11 +123,12 @@ end end - @testset "counter rng offset behavior" begin + @testset "offset and reset semantics" begin rng_stream = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox(), offset=UInt64(17)) s1 = array_from_host(zeros(Float32, 99)) s2 = array_from_host(zeros(Float32, 101)) s12 = array_from_host(zeros(Float32, 200)) + AK.randn!(rng_stream, s1; prefer_threads, block_size=64) @test rng_stream.offset == UInt64(116) AK.randn!(rng_stream, s2; prefer_threads, block_size=64) @@ -136,22 +155,7 @@ end end - @testset "reset!" 
begin - rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox()) - x1 = array_from_host(zeros(Float32, 512)) - x2 = array_from_host(zeros(Float32, 512)) - - AK.randn!(rng, x1; prefer_threads, block_size=64) - @test rng.offset == UInt64(512) - @test AK.reset!(rng) === rng - @test rng.offset == UInt64(0) - AK.randn!(rng, x2; prefer_threads, block_size=64) - - @test Array(x1) == Array(x2) - end - - - @testset "randn! n-dimensional and views" begin + @testset "shapes and views" begin rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox()) for T in RANDN_FLOAT_TYPES_BACKEND @@ -163,12 +167,14 @@ end for T in RANDN_FLOAT_TYPES_BACKEND base = zeros(T, 64) view_x = @view base[2:2:end] + AK.randn!( rng, view_x; max_tasks=Threads.nthreads(), min_elems=1, - prefer_threads=true + prefer_threads=true, ) + ref_view = zeros(T, length(view_x)) _randn_fill_reference!( rng, ref_view; @@ -204,6 +210,46 @@ end end + @testset "randn allocation convenience" begin + rng = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox()) + y = AK.randn(rng, BACKEND, Float32, Int32(6), UInt16(7); prefer_threads, block_size=64) + @test size(y) == (6, 7) + @test eltype(y) === Float32 + @test _all_finite(Array(y)) + @test rng.offset == UInt64(length(y)) + + rng_alloc = AK.CounterRNG(UInt64(0x55); alg=AK.Philox()) + rng_fill = AK.CounterRNG(UInt64(0x55); alg=AK.Philox()) + y_alloc = AK.randn(rng_alloc, BACKEND, Float32, 128; prefer_threads, block_size=64) + y_fill = array_from_host(zeros(Float32, 128)) + AK.randn!(rng_fill, y_fill; prefer_threads, block_size=64) + @test Array(y_alloc) == Array(y_fill) + @test rng_alloc.offset == rng_fill.offset == UInt64(128) + + # Warm-up first call path so one-time compilation/backend init does not perturb RNG checks. + AK.randn(BACKEND, Float32, 1; prefer_threads, block_size=64) + + # Auto-seeded constructor should match explicit seed capture from default RNG. 
+ Random.seed!(0x9abc) + seed = Random.rand(Random.default_rng(), UInt64) + ref = AK.randn(AK.CounterRNG(seed; alg=AK.Philox()), BACKEND, Float32, 64; prefer_threads, block_size=64) + Random.seed!(0x9abc) + x = AK.randn(BACKEND, Float32, 64; prefer_threads, block_size=64) + @test Array(x) == Array(ref) + + # Reseeding should reproduce the same auto-seeded draw. + Random.seed!(0x7777) + x1 = AK.randn(BACKEND, Float32, 64; prefer_threads, block_size=64) + Random.seed!(0x7777) + x2 = AK.randn(BACKEND, Float32, 64; prefer_threads, block_size=64) + @test Array(x1) == Array(x2) + + @test_throws ArgumentError AK.randn(AK.CounterRNG(0x1), BACKEND, UInt32, 16; prefer_threads) + @test_throws MethodError AK.randn(AK.CounterRNG(0x1), BACKEND, Float32, 16; prefer_threads, bad=:kwarg) + @test_throws MethodError AK.randn(BACKEND, Float32, 16; prefer_threads, bad=:kwarg) + end + + @testset "moments sanity" begin n = 200_000 rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox()) From 9abe3881f5afb0ef56b9a794960609a5fd26adc0 Mon Sep 17 00:00:00 2001 From: fjbarter Date: Thu, 26 Mar 2026 17:11:45 +0000 Subject: [PATCH 15/18] attempt to fix oneAPI test precompilation hang with Julia v1.10 by disabling package images --- .buildkite/pipeline.yml | 13 ++++++------- src/rand/rand.jl | 17 +++++++++++++++-- src/rand/randn.jl | 39 +++++++++++++++++++++++++++------------ src/rand/utilities.jl | 13 +++++-------- 4 files changed, 53 insertions(+), 29 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 43cfba3..c0e5830 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -88,14 +88,13 @@ steps: - JuliaCI/julia#v1: version: "1.10" command: | - julia -e 'using Pkg + julia --pkgimages=no -e 'using Pkg - println("--- :julia: Instantiating environment") - Pkg.add("oneAPI") - Pkg.develop(path=".") - - println("+++ :julia: Running tests") - Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])' + println("--- :julia: Instantiating environment") + 
Pkg.add("oneAPI") + Pkg.develop(path=".") + println("+++ :julia: Running tests") + Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])' agents: queue: "juliagpu" intel: "*" diff --git a/src/rand/rand.jl b/src/rand/rand.jl index c36a61e..af3fdef 100644 --- a/src/rand/rand.jl +++ b/src/rand/rand.jl @@ -1,3 +1,10 @@ +const ALLOWED_RAND_SCALARS = Union{ + UInt8, UInt16, UInt32, UInt64, + Int8, Int16, Int32, Int64, + Float16, Float32, Float64, + Bool +} + abstract type CounterRNGAlgorithm end @@ -92,7 +99,8 @@ include("randn.jl") Fill `v` in-place with pseudo-random values using a counter-based RNG stream. For `v[i]`, the counter is `rng.offset + UInt64(i - 1)` in linear indexing order. -After filling `v`, `rng.offset` advances by `length(v)`. +After filling `v`, `rng.offset` advances by `length(v)`. It can be called without `rng`, in which +case the default `CounterRNG` is used. Supported scalar element types are: - `UInt8`, `UInt16`, `UInt32`, `UInt64` @@ -155,9 +163,13 @@ end backend::Backend, ::Type{T}, dims::Integer...; + + # CPU settings max_tasks::Int=Threads.nthreads(), min_elems::Int=1, prefer_threads::Bool=true, + + # GPU settings block_size::Int=256, ) where T @@ -178,7 +190,8 @@ function rand( # GPU settings block_size::Int=256, ) where T - return _allocate_and_fill( + @argcheck T <: ALLOWED_RAND_SCALARS "Unsupported eltype $T. 
Supported: $(ALLOWED_RAND_SCALARS)" + return _allocate_and_fill_rand( rand!, rng, backend, T, dims...; max_tasks, min_elems, prefer_threads, block_size, ) diff --git a/src/rand/randn.jl b/src/rand/randn.jl index f3cce14..8d16c77 100644 --- a/src/rand/randn.jl +++ b/src/rand/randn.jl @@ -1,9 +1,11 @@ -const ALLOWED_RANDN_SCALARS = Union{Float16, Float32, Float64} +const ALLOWED_RANDN_SCALARS = Union{ + Float16, Float32, Float64 +} -const U24_MAX_SAFE_MIDPOINT = UInt32(0x00fffffe) # 2^24 - 2 -const U53_MAX_SAFE_MIDPOINT = UInt64(0x001ffffffffffffe) # 2^53 - 2 -const MIDPOINT_SCALE_F32 = ldexp(Float32(1), -24) # 2^-24 -const MIDPOINT_SCALE_F64 = ldexp(Float64(1), -53) # 2^-53 +const OPEN01_MAX_MIDPOINT_INDEX_F32 = UInt32(0x00fffffe) +const OPEN01_MAX_MIDPOINT_INDEX_F64 = UInt64(0x001ffffffffffffe) +const OPEN01_MIDPOINT_SCALE_F32 = ldexp(Float32(1), -24) +const OPEN01_MIDPOINT_SCALE_F64 = ldexp(Float64(1), -53) @@ -19,16 +21,16 @@ Avoiding 0 is essential for Box-Muller due to the logarithm functions. # Convert random UInt32 bits to Float32 in (0, 1) using midpoint mapping on a 24-bit grid. @inline function uint32_to_open_unit_float32_midpoint(u::UInt32)::Float32 # `min` keeps the top midpoint below one after Float32 rounding. - k = min(u >> 8, U24_MAX_SAFE_MIDPOINT) - return (Float32(k) + 0.5f0) * MIDPOINT_SCALE_F32 + k = min(u >> 8, OPEN01_MAX_MIDPOINT_INDEX_F32) + return (Float32(k) + 0.5f0) * OPEN01_MIDPOINT_SCALE_F32 end # Convert random UInt64 bits to Float64 in (0, 1) using midpoint mapping on a 53-bit grid. @inline function uint64_to_open_unit_float64_midpoint(u::UInt64)::Float64 # `min` keeps the top midpoint below one after Float64 rounding. 
- k = min(u >> 11, U53_MAX_SAFE_MIDPOINT) - return (Float64(k) + 0.5) * MIDPOINT_SCALE_F64 + k = min(u >> 11, OPEN01_MAX_MIDPOINT_INDEX_F64) + return (Float64(k) + 0.5) * OPEN01_MIDPOINT_SCALE_F64 end @@ -201,8 +203,6 @@ end # CPU settings max_tasks::Int=Threads.nthreads(), min_elems::Int=1, - - # Implementation choice prefer_threads::Bool=true, # GPU settings @@ -215,6 +215,8 @@ For `v[i]`, the normal stream counter is `rng.offset + UInt64(i - 1)` in linear Values are generated using Box-Muller from midpoint-open uniforms in `(0, 1)`. After filling `v`, `rng.offset` advances by `length(v)`. + +It can be called without an `rng`, in which case the default `CounterRNG` will be used. """ function randn!( rng::CounterRNG, @@ -269,9 +271,13 @@ end backend::Backend, ::Type{T}, dims::Integer...; + + # CPU settings max_tasks::Int=Threads.nthreads(), min_elems::Int=1, prefer_threads::Bool=true, + + # GPU settings block_size::Int=256, ) where T @@ -283,12 +289,17 @@ function randn( backend::Backend, ::Type{T}, dims::Integer...; + + # CPU settings max_tasks::Int=Threads.nthreads(), min_elems::Int=1, prefer_threads::Bool=true, + + # GPU settings block_size::Int=256, ) where T - return _allocate_and_fill( + @argcheck T <: ALLOWED_RANDN_SCALARS "Unsupported eltype $T. Supported: $(ALLOWED_RANDN_SCALARS)" + return _allocate_and_fill_rand( randn!, rng, backend, T, dims...; max_tasks, min_elems, prefer_threads, block_size, ) @@ -299,9 +310,13 @@ function randn( backend::Backend, ::Type{T}, dims::Integer...; + + # CPU settings max_tasks::Int=Threads.nthreads(), min_elems::Int=1, prefer_threads::Bool=true, + + # GPU settings block_size::Int=256, ) where T return randn( diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl index b2c60db..04f6929 100644 --- a/src/rand/utilities.jl +++ b/src/rand/utilities.jl @@ -16,15 +16,19 @@ # Shared allocation + fill helper for rand/randn convenience constructors. 
-@inline function _allocate_and_fill( +@inline function _allocate_and_fill_rand( fill!, rng::CounterRNG, backend::Backend, ::Type{T}, dims::Integer...; + + # CPU settings max_tasks::Int=Threads.nthreads(), min_elems::Int=1, prefer_threads::Bool=true, + + # GPU settings block_size::Int=256, ) where {T} dims_int = Base.map(Int, dims) @@ -34,13 +38,6 @@ end -# Internal scalar eltypes currently supported by rand!. -const ALLOWED_RAND_SCALARS = Union{ - UInt8, UInt16, UInt32, UInt64, - Int8, Int16, Int32, Int64, - Float16, Float32, Float64, - Bool -} @inline _rand_scalar_uint_type(::Type{UInt8}) = UInt32 From 841bafb0c4e2ad0cf81af96c0a6e49caff7d64fd Mon Sep 17 00:00:00 2001 From: fjbarter Date: Thu, 26 Mar 2026 17:43:40 +0000 Subject: [PATCH 16/18] fix Threefry UInt32 device arithmetic to avoid breaking on Metal --- src/rand/threefry.jl | 19 ++++++++++--------- src/rand/utilities.jl | 2 +- test/randn.jl | 8 -------- 3 files changed, 11 insertions(+), 18 deletions(-) diff --git a/src/rand/threefry.jl b/src/rand/threefry.jl index c9052b1..f6ef632 100644 --- a/src/rand/threefry.jl +++ b/src/rand/threefry.jl @@ -9,9 +9,9 @@ const THREEFRY_ROTATIONS = ( const THREEFRY_ROUNDS = 20 -@inline function _threefry_key_word(k0::UInt32, k1::UInt32, k2::UInt32, idx::Int)::UInt32 - idx == 0 && return k0 - idx == 1 && return k1 +@inline function _threefry_key_word(k0::UInt32, k1::UInt32, k2::UInt32, idx::UInt32)::UInt32 + idx == UInt32(0) && return k0 + idx == UInt32(1) && return k1 return k2 end @@ -33,16 +33,17 @@ end x1 += k1 @inbounds for round in 0:(THREEFRY_ROUNDS - 1) - rot = THREEFRY_ROTATIONS[(round & 0x7) + 1] + round_u32 = UInt32(round) + rot = THREEFRY_ROTATIONS[Int((round_u32 & UInt32(0x7)) + UInt32(1))] x0 += x1 x1 = xor(_rotl32(x1, rot), x0) - if (round & 0x3) == 3 - s = (round >>> 2) + 1 - i0 = s % 3 - i1 = (s + 1) % 3 + if (round_u32 & UInt32(0x3)) == UInt32(0x3) + s = (round_u32 >>> 2) + UInt32(1) + i0 = s % UInt32(3) + i1 = (s + UInt32(1)) % UInt32(3) x0 += 
_threefry_key_word(k0, k1, k2, i0) - x1 += _threefry_key_word(k0, k1, k2, i1) + UInt32(s) + x1 += _threefry_key_word(k0, k1, k2, i1) + s end end diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl index 04f6929..556ca7b 100644 --- a/src/rand/utilities.jl +++ b/src/rand/utilities.jl @@ -9,7 +9,7 @@ @inline _mulhi_u32(a::UInt32, b::UInt32)::UInt32 = UInt32((UInt64(a) * UInt64(b)) >> 32) # 32-bit rotate left by r positions -@inline _rotl32(x::UInt32, r::UInt32)::UInt32 = (x << r) | (x >> (UInt32(32) - r)) +@inline _rotl32(x::UInt32, r::UInt32)::UInt32 = bitrotate(x, Int32(r)) # Get counter used for CounterRNG from element index @inline _counter_from_index(i)::UInt64 = UInt64(i - one(i)) diff --git a/test/randn.jl b/test/randn.jl index 82261c8..0d239b2 100644 --- a/test/randn.jl +++ b/test/randn.jl @@ -90,14 +90,6 @@ end for alg in RANDN_ALGS rng = AK.CounterRNG(0x123456789abcdef; alg) - for T in RANDN_FLOAT_TYPES_BACKEND - for len in RANDN_LENGTHS - x = array_from_host(zeros(T, len)) - _assert_randn_matches_reference!(rng, x; prefer_threads, block_size=64) - @test _all_finite(Array(x)) - end - end - for T in RANDN_FLOAT_TYPES_BACKEND x1 = array_from_host(zeros(T, 2048)) x2 = array_from_host(zeros(T, 2048)) From 39e089ffb5a09518b248f5bf085d3830f8b3c069 Mon Sep 17 00:00:00 2001 From: fjbarter Date: Fri, 27 Mar 2026 01:09:45 +0000 Subject: [PATCH 17/18] expand rand/randn convenience APIs and align docs/tests. Move randn open-interval helpers into utilities.jl and switch naming of internals. Simplify rand!/randn! wrappers and add convenience constructors for omitted rng, backend, or type, with backend-dependent defaults (Float64 on CPU, Float32 otherwise). Add explicit zero-arg guards so rand()/randn() require at least one dimension. Update rand.md with concise convenience semantics and examples, including type-only no-rng calls, plus doc entries for rand/randn. 
Expand tests to cover default-type dispatch, CPU fallback routes, typed no-rng overloads, and invalid-signature/kwarg throw behavior while preserving deterministic CounterRNG offset progression. --- docs/src/api/rand.md | 31 ++++++++- prototype/rand/Project.toml | 3 +- prototype/rand/plot/Project.toml | 3 - prototype/rand/randn.jl | 2 - prototype/rand/test_rand.jl | 25 +++---- src/rand/rand.jl | 42 +++++------ src/rand/randn.jl | 116 +++++-------------------------- src/rand/utilities.jl | 67 ++++++++++++++++++ test/rand.jl | 83 ++++++++++++++++++++++ test/randn.jl | 105 +++++++++++++++++++++++++--- test/runtests.jl | 8 +-- 11 files changed, 325 insertions(+), 160 deletions(-) delete mode 100644 prototype/rand/plot/Project.toml diff --git a/docs/src/api/rand.md b/docs/src/api/rand.md index 2ac9d91..d0d4113 100644 --- a/docs/src/api/rand.md +++ b/docs/src/api/rand.md @@ -3,6 +3,10 @@ Counter-based random generation for CPU and GPU backends with deterministic stream behavior for fixed `seed`, algorithm, and call sequence. +Both in-place and allocation forms are supported: +- Uniform: `AK.rand!`, `AK.rand` +- Standard normal: `AK.randn!`, `AK.randn` + `CounterRNG` carries an internal `offset` (starting at `0`) that advances by `length(v)` on each `AK.rand!(rng, v)` call. This means chunked fills are stream-consistent: - filling `100` then `100` elements yields the same `200` values as one `200`-element fill. @@ -16,9 +20,16 @@ Use an explicit `CounterRNG` when reproducibility is required. For convenience, `AK.rand!(v)` creates a fresh `CounterRNG()` on each call using one auto-seeded `Base.rand(UInt64)` draw, so repeated calls produce different outputs unless Random.seed!() is used. +Likewise, `AK.rand(backend, args...)` creates a fresh auto-seeded `CounterRNG()` on each call. `AK.reset!(rng::AK.CounterRNG)` rewinds `rng.offset` to `0x0`. +Allocation convenience: +- Canonical forms are `AK.rand(rng, backend, T, dims...)` and `AK.randn(rng, backend, T, dims...)`. 
+- Defaults are shared: omit `rng` -> fresh `CounterRNG()`; omit `backend` -> CPU backend; omit `T` -> `Float64` on CPU backend and `Float32` otherwise. +- Common shorthands include `AK.rand(dims...)`, `AK.rand(T, dims...)`, `AK.rand(backend, dims...)`, and the corresponding `AK.randn(...)` variants. +- For explicit `rng`, both `AK.rand` and `AK.randn` advance `rng.offset` by `prod(dims)`. + Custom algorithms: - Define an algorithm type `MyAlg <: AK.CounterRNGAlgorithm`. - Implement typed `rand_uint` methods: @@ -39,6 +50,9 @@ Supported element types: `AK.randn!` uses Box-Muller with open-interval uniforms in `(0, 1)` from a branch-free midpoint mapping. +`AK.randn!(v)` and `AK.randn(backend, args...)` create a fresh auto-seeded `CounterRNG()` on each +call, so repeated calls produce different outputs unless `Random.seed!()` is used. + The core of the random number generation produces either a `UInt32` or `UInt64` depending on the width of the requested element type. That `UInt` is then either: - Unsigned integers: returned as-is or truncated if necessary. 
@@ -72,6 +86,7 @@ Examples: ```julia import AcceleratedKernels as AK using oneAPI +using ROCArray # Reproducible rng = AK.CounterRNG(0x12345678; alg=AK.Philox()) @@ -88,9 +103,19 @@ AK.rand!(rng, v2) y = oneArray{Float32}(undef, 1024) AK.rand!(y) -# Standard normal samples -z = oneArray{Float32}(undef, 1024) +# Allocation form +y_cpu_auto = AK.rand(1024) # defaults to CPU, Vector{Float64} +y_oneArray = AK.rand(oneAPIBackend(), Float32, 1024) # fresh RNG, allocate and fill oneArray +y_cpu_typed = AK.rand(rng, Float16, 1024) # CPU backend, explicit type, explicit RNG + +# Standard normal filling +z = ROCArray{Float32}(undef, 1024) AK.randn!(rng, z) + +# Standard normal allocation form +z_cpu_auto = AK.randn(1024) # defaults to CPU, Vector{Float64} +z_ROCArray = AK.randn(oneAPIBackend(), 1024) # allocate and fill ROCArray{Float32} +z_cpu_typed = AK.randn(rng, Float16, 1024) # CPU backend, explicit type, explicit RNG ``` ```@docs @@ -98,5 +123,7 @@ AcceleratedKernels.CounterRNG AcceleratedKernels.CounterRNGAlgorithm AcceleratedKernels.reset! AcceleratedKernels.rand! +AcceleratedKernels.rand AcceleratedKernels.randn! 
+AcceleratedKernels.randn ``` diff --git a/prototype/rand/Project.toml b/prototype/rand/Project.toml index 675e6c7..d1926b1 100644 --- a/prototype/rand/Project.toml +++ b/prototype/rand/Project.toml @@ -1,8 +1,7 @@ [deps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" -PProf = "e4faabce-9ead-11e9-39d9-4379958e3056" -Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" diff --git a/prototype/rand/plot/Project.toml b/prototype/rand/plot/Project.toml deleted file mode 100644 index a95f271..0000000 --- a/prototype/rand/plot/Project.toml +++ /dev/null @@ -1,3 +0,0 @@ -[deps] -AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" -Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" diff --git a/prototype/rand/randn.jl b/prototype/rand/randn.jl index 8370d91..d552b5f 100644 --- a/prototype/rand/randn.jl +++ b/prototype/rand/randn.jl @@ -48,6 +48,4 @@ display(@benchmark run_cuda_randn!($x_cuda)) println("\nAK.randn! Philox benchmark (GPU, CuArray{$TestType})") display(@benchmark run_ak_randn_gpu!($RNG_PHILOX, $x_philox)) -# println("\nAK.randn! 
benchmark (CPU, Vector{$TestType}, Philox)") -# display(@benchmark run_ak_randn_cpu!($RNG_PHILOX, $x_cpu)) diff --git a/prototype/rand/test_rand.jl b/prototype/rand/test_rand.jl index ec3ce65..5a7be0f 100644 --- a/prototype/rand/test_rand.jl +++ b/prototype/rand/test_rand.jl @@ -11,11 +11,13 @@ const RNG_SPLITMIX = AK.CounterRNG(0x12345678; alg=AK.SplitMix64()) const RNG_PHILOX = AK.CounterRNG(0x12345678; alg=AK.Philox()) const RNG_THREEFRY = AK.CounterRNG(0x12345678; alg=AK.Threefry()) -x_cuda = CuArray{Float32}(undef, N) -x_splitmix = CuArray{Float32}(undef, N) -x_philox = CuArray{Float32}(undef, N) -x_threefry = CuArray{Float32}(undef, N) -x_cpu = Vector{Float32}(undef, N) +TestType = Float32 + +x_cuda = CuArray{TestType}(undef, N) +x_splitmix = CuArray{TestType}(undef, N) +x_philox = CuArray{TestType}(undef, N) +x_threefry = CuArray{TestType}(undef, N) +x_cpu = Vector{TestType}(undef, N) function run_cuda_rand!(x) @@ -43,13 +45,11 @@ is_unit_interval(v) = all(x -> 0.0f0 <= x <= 1.0f0, v) # warmup compile run_cuda_rand!(x_cuda) -# run_ak_rand_gpu!(RNG_SPLITMIX, x_splitmix) run_ak_rand_gpu!(RNG_PHILOX, x_philox) run_ak_rand_gpu!(RNG_THREEFRY, x_threefry) run_ak_rand_cpu!(RNG_SPLITMIX, x_cpu) @assert is_unit_interval(Array(x_cuda)) -# @assert is_unit_interval(Array(x_splitmix)) @assert is_unit_interval(Array(x_philox)) @assert is_unit_interval(Array(x_threefry)) @assert is_unit_interval(x_cpu) @@ -57,18 +57,15 @@ run_ak_rand_cpu!(RNG_SPLITMIX, x_cpu) println("N = ", N) println("CPU threads: ", Threads.nthreads()) -println("\nCUDA.rand! benchmark (CuArray{Float32}, in-place)") +println("\nCUDA.rand! benchmark (CuArray{$TestType}, in-place)") display(@benchmark run_cuda_rand!($x_cuda)) -# println("\nAK.rand! SplitMix64 benchmark (GPU, CuArray{Float32})") -# display(@benchmark run_ak_rand_gpu!($RNG_SPLITMIX, $x_splitmix)) - -println("\nAK.rand! Philox benchmark (GPU, CuArray{Float32})") +println("\nAK.rand! 
Philox benchmark (GPU, CuArray{$TestType})") display(@benchmark run_ak_rand_gpu!($RNG_PHILOX, $x_philox)) -println("\nAK.rand! Threefry benchmark (GPU, CuArray{Float32})") +println("\nAK.rand! Threefry benchmark (GPU, CuArray{$TestType})") display(@benchmark run_ak_rand_gpu!($RNG_THREEFRY, $x_threefry)) -println("\nAK.rand! benchmark (CPU, Vector{Float32}, SplitMix64)") +println("\nAK.rand! benchmark (CPU, Vector{$TestType}, SplitMix64)") display(@benchmark run_ak_rand_cpu!($RNG_SPLITMIX, $x_cpu)) diff --git a/src/rand/rand.jl b/src/rand/rand.jl index af3fdef..06d3619 100644 --- a/src/rand/rand.jl +++ b/src/rand/rand.jl @@ -148,13 +148,7 @@ function rand!( end -function rand!( - v::AbstractArray, - args...; - kwargs..., -) - return rand!(CounterRNG(), v, args...; kwargs...) -end +rand!(v::AbstractArray, args...; kwargs...) = rand!(CounterRNG(), v, args...; kwargs...) """ @@ -175,6 +169,11 @@ end Allocate an array of element type `T` on `backend` with shape `dims`, fill it in-place via [`rand!`](@ref), and return it. + +Convenience overloads: +- `rng` omitted: uses a fresh `CounterRNG()`. +- `backend` omitted: defaults to `CPU_BACKEND`. +- `T` omitted: defaults by backend (`Float64` on CPU backend, `Float32` otherwise). """ function rand( rng::CounterRNG, @@ -198,21 +197,16 @@ function rand( end -function rand( - backend::Backend, - ::Type{T}, - dims::Integer...; - - # CPU settings - max_tasks::Int=Threads.nthreads(), - min_elems::Int=1, - prefer_threads::Bool=true, - - # GPU settings - block_size::Int=256, -) where T - return rand( - CounterRNG(), backend, T, dims...; - max_tasks, min_elems, prefer_threads, block_size, - ) +function rand(rng::CounterRNG, backend::Backend, dims::Integer...; kwargs...) + DefaultScalarType = (backend == CPU_BACKEND) ? Float64 : Float32 + rand(rng, backend, DefaultScalarType, dims...; kwargs...) end + + +rand(rng::CounterRNG, args...; kwargs...) = rand(rng, CPU_BACKEND, args...; kwargs...) +rand(backend::Backend, args...; kwargs...) 
= rand(CounterRNG(), backend, args...; kwargs...) +rand(::Type{T}, dims::Integer...; kwargs...) where {T} = rand(CPU_BACKEND, T, dims...; kwargs...) +rand(dims::Integer...; kwargs...) = rand(CPU_BACKEND, dims...; kwargs...) +rand(; kwargs...) = throw(ArgumentError("rand requires at least one dimension")) + + diff --git a/src/rand/randn.jl b/src/rand/randn.jl index 8d16c77..b2b5dbb 100644 --- a/src/rand/randn.jl +++ b/src/rand/randn.jl @@ -2,76 +2,6 @@ const ALLOWED_RANDN_SCALARS = Union{ Float16, Float32, Float64 } -const OPEN01_MAX_MIDPOINT_INDEX_F32 = UInt32(0x00fffffe) -const OPEN01_MAX_MIDPOINT_INDEX_F64 = UInt64(0x001ffffffffffffe) -const OPEN01_MIDPOINT_SCALE_F32 = ldexp(Float32(1), -24) -const OPEN01_MIDPOINT_SCALE_F64 = ldexp(Float64(1), -53) - - - - -#= -The below Float constructions are not duplicates of those in utilities.jl - they are needed to -ensure an interval of (0, 1) as opposed to [0, 1). Achieving this purely logically with midpoint -mapping means we can avoid a check for producing a 0 (which would normally cause a redraw). -Avoiding 0 is essential for Box-Muller due to the logarithm functions. -=# - - -# Convert random UInt32 bits to Float32 in (0, 1) using midpoint mapping on a 24-bit grid. -@inline function uint32_to_open_unit_float32_midpoint(u::UInt32)::Float32 - # `min` keeps the top midpoint below one after Float32 rounding. - k = min(u >> 8, OPEN01_MAX_MIDPOINT_INDEX_F32) - return (Float32(k) + 0.5f0) * OPEN01_MIDPOINT_SCALE_F32 -end - - -# Convert random UInt64 bits to Float64 in (0, 1) using midpoint mapping on a 53-bit grid. -@inline function uint64_to_open_unit_float64_midpoint(u::UInt64)::Float64 - # `min` keeps the top midpoint below one after Float64 rounding. - k = min(u >> 11, OPEN01_MAX_MIDPOINT_INDEX_F64) - return (Float64(k) + 0.5) * OPEN01_MIDPOINT_SCALE_F64 -end - - -# Float16 path reuses Float32 midpoint sampling for robust math in Box-Muller. 
-@inline function rand_open01( - seed::UInt64, - alg::CounterRNGAlgorithm, - counter::UInt64, - ::Type{Float16}, -)::Float16 - return Float16(rand_open01(seed, alg, counter, Float32)) -end - - -@inline function rand_open01( - seed::UInt64, - alg::CounterRNGAlgorithm, - counter::UInt64, - ::Type{Float32}, -)::Float32 - return uint32_to_open_unit_float32_midpoint(rand_uint(seed, alg, counter, UInt32)) -end - - -@inline function rand_open01( - seed::UInt64, - alg::CounterRNGAlgorithm, - counter::UInt64, - ::Type{Float64}, -)::Float64 - return uint64_to_open_unit_float64_midpoint(rand_uint(seed, alg, counter, UInt64)) -end - - -@inline function rand_open01(::UInt64, ::CounterRNGAlgorithm, ::UInt64, ::Type{T}) where {T} - throw(ArgumentError( - "Unsupported open-interval random type $(T). Supported: $(ALLOWED_RANDN_SCALARS)" - )) -end - - @inline function randn_pair( seed::UInt64, alg::CounterRNGAlgorithm, @@ -90,8 +20,8 @@ end ::Type{Float32}, )::Tuple{Float32, Float32} u = rand_uint(seed, alg, pair_counter, UInt64) - u1 = uint32_to_open_unit_float32_midpoint(_u32_lo(u)) - u2 = uint32_to_open_unit_float32_midpoint(_u32_hi(u)) + u1 = _uint32_to_open_unit_float32_midpoint(_u32_lo(u)) + u2 = _uint32_to_open_unit_float32_midpoint(_u32_hi(u)) radius = sqrt(-2.0f0 * log(u1)) theta = Float32(2pi) * u2 stheta, ctheta = sincos(theta) @@ -106,8 +36,8 @@ end ::Type{Float64}, )::Tuple{Float64, Float64} c0 = pair_counter << 1 - u1 = rand_open01(seed, alg, c0, Float64) - u2 = rand_open01(seed, alg, c0 + UInt64(1), Float64) + u1 = rand_float_open01(seed, alg, c0, Float64) + u2 = rand_float_open01(seed, alg, c0 + UInt64(1), Float64) radius = sqrt(-2.0 * log(u1)) theta = Float64(2pi) * u2 stheta, ctheta = sincos(theta) @@ -256,13 +186,7 @@ function randn!( end -function randn!( - v::AbstractArray, - args...; - kwargs..., -) - return randn!(CounterRNG(), v, args...; kwargs...) -end +randn!(v::AbstractArray, args...; kwargs...) = randn!(CounterRNG(), v, args...; kwargs...) 
""" @@ -283,6 +207,11 @@ end Allocate an array of element type `T` on `backend` with shape `dims`, fill it in-place via [`randn!`](@ref), and return it. + +Convenience overloads: +- `rng` omitted: uses a fresh `CounterRNG()`. +- `backend` omitted: defaults to `CPU_BACKEND`. +- `T` omitted: defaults by backend (`Float64` on CPU backend, `Float32` otherwise). """ function randn( rng::CounterRNG, @@ -306,21 +235,14 @@ function randn( end -function randn( - backend::Backend, - ::Type{T}, - dims::Integer...; +function randn(rng::CounterRNG, backend::Backend, dims::Integer...; kwargs...) + DefaultScalarType = (backend == CPU_BACKEND) ? Float64 : Float32 + randn(rng, backend, DefaultScalarType, dims...; kwargs...) +end - # CPU settings - max_tasks::Int=Threads.nthreads(), - min_elems::Int=1, - prefer_threads::Bool=true, - # GPU settings - block_size::Int=256, -) where T - return randn( - CounterRNG(), backend, T, dims...; - max_tasks, min_elems, prefer_threads, block_size, - ) -end +randn(rng::CounterRNG, args...; kwargs...) = randn(rng, CPU_BACKEND, args...; kwargs...) +randn(backend::Backend, args...; kwargs...) = randn(CounterRNG(), backend, args...; kwargs...) +randn(::Type{T}, dims::Integer...; kwargs...) where {T} = randn(CPU_BACKEND, T, dims...; kwargs...) +randn(dims::Integer...; kwargs...) = randn(CPU_BACKEND, dims...; kwargs...) +randn(; kwargs...) 
= throw(ArgumentError("randn requires at least one dimension")) diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl index 556ca7b..9e8ccf5 100644 --- a/src/rand/utilities.jl +++ b/src/rand/utilities.jl @@ -143,3 +143,70 @@ end # Interpret as 1.mantissa, then subtract 1 for [0, 1) reinterpret(Float64, bits) - 1.0 end + + + + + +### Helpers for randn ### + + +# Midpoint-mapped open-interval Float sampling in (0, 1), used for Box-Muller +const OPEN01_MAX_MIDPOINT_INDEX_F32 = UInt32(0x00fffffe) +const OPEN01_MAX_MIDPOINT_INDEX_F64 = UInt64(0x001ffffffffffffe) +const OPEN01_MIDPOINT_SCALE_F32 = ldexp(Float32(1), -24) +const OPEN01_MIDPOINT_SCALE_F64 = ldexp(Float64(1), -53) + + +# Convert random UInt32 bits to Float32 in (0, 1) using midpoint mapping on a 24-bit grid. +@inline function _uint32_to_open_unit_float32_midpoint(u::UInt32)::Float32 + # `min` keeps the top midpoint below one after Float32 rounding. + k = min(u >> 8, OPEN01_MAX_MIDPOINT_INDEX_F32) + return (Float32(k) + 0.5f0) * OPEN01_MIDPOINT_SCALE_F32 +end + + +# Convert random UInt64 bits to Float64 in (0, 1) using midpoint mapping on a 53-bit grid. +@inline function _uint64_to_open_unit_float64_midpoint(u::UInt64)::Float64 + # `min` keeps the top midpoint below one after Float64 rounding. + k = min(u >> 11, OPEN01_MAX_MIDPOINT_INDEX_F64) + return (Float64(k) + 0.5) * OPEN01_MIDPOINT_SCALE_F64 +end + + +# Float16 path reuses Float32 midpoint sampling for robust math in Box-Muller. 
+@inline function rand_float_open01( + seed::UInt64, + alg::CounterRNGAlgorithm, + counter::UInt64, + ::Type{Float16}, +)::Float16 + return Float16(rand_float_open01(seed, alg, counter, Float32)) +end + + +@inline function rand_float_open01( + seed::UInt64, + alg::CounterRNGAlgorithm, + counter::UInt64, + ::Type{Float32}, +)::Float32 + return _uint32_to_open_unit_float32_midpoint(rand_uint(seed, alg, counter, UInt32)) +end + + +@inline function rand_float_open01( + seed::UInt64, + alg::CounterRNGAlgorithm, + counter::UInt64, + ::Type{Float64}, +)::Float64 + return _uint64_to_open_unit_float64_midpoint(rand_uint(seed, alg, counter, UInt64)) +end + + +@inline function rand_float_open01(::UInt64, ::CounterRNGAlgorithm, ::UInt64, ::Type{T}) where {T} + throw(ArgumentError( + "Unsupported open-interval random type $(T). Supported: Union{Float16, Float32, Float64}" + )) +end diff --git a/test/rand.jl b/test/rand.jl index d3eacca..a6b62c6 100644 --- a/test/rand.jl +++ b/test/rand.jl @@ -365,6 +365,8 @@ end @testset "rand allocation convenience" begin + default_alloc_type = IS_CPU_BACKEND ? 
Float64 : Float32 + rng = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox()) y = AK.rand(rng, BACKEND, Float32, Int32(6), UInt16(7); prefer_threads, block_size=64) @test size(y) == (6, 7) @@ -372,6 +374,16 @@ end @test _is_unit_interval(Array(y)) @test rng.offset == UInt64(length(y)) + rng_default = AK.CounterRNG(UInt64(0x99); alg=AK.Philox()) + rng_default_ref = AK.CounterRNG(UInt64(0x99); alg=AK.Philox()) + y_default = AK.rand(rng_default, BACKEND, 128; prefer_threads, block_size=64) + y_default_ref = AK.rand( + rng_default_ref, BACKEND, default_alloc_type, 128; prefer_threads, block_size=64 + ) + @test eltype(y_default) === default_alloc_type + @test Array(y_default) == Array(y_default_ref) + @test rng_default.offset == rng_default_ref.offset == UInt64(128) + rng_alloc = AK.CounterRNG(UInt64(0x55); alg=AK.Philox()) rng_fill = AK.CounterRNG(UInt64(0x55); alg=AK.Philox()) y_alloc = AK.rand(rng_alloc, BACKEND, Float32, 128; prefer_threads, block_size=64) @@ -380,6 +392,26 @@ end @test Array(y_alloc) == Array(y_fill) @test rng_alloc.offset == rng_fill.offset == UInt64(128) + rng_cpu_default = AK.CounterRNG(UInt64(0x66); alg=AK.Philox()) + rng_cpu_default_ref = AK.CounterRNG(UInt64(0x66); alg=AK.Philox()) + y_cpu_default = AK.rand(rng_cpu_default, 128; prefer_threads, block_size=64) + y_cpu_default_ref = AK.rand( + rng_cpu_default_ref, AK.get_backend([]), 128; prefer_threads, block_size=64 + ) + @test eltype(y_cpu_default) === Float64 + @test Array(y_cpu_default) == Array(y_cpu_default_ref) + @test rng_cpu_default.offset == rng_cpu_default_ref.offset == UInt64(128) + + rng_cpu_typed = AK.CounterRNG(UInt64(0x77); alg=AK.Philox()) + rng_cpu_typed_ref = AK.CounterRNG(UInt64(0x77); alg=AK.Philox()) + y_cpu_typed = AK.rand(rng_cpu_typed, Float32, 128; prefer_threads, block_size=64) + y_cpu_typed_ref = AK.rand( + rng_cpu_typed_ref, AK.get_backend([]), Float32, 128; prefer_threads, block_size=64 + ) + @test eltype(y_cpu_typed) === Float32 + @test Array(y_cpu_typed) == 
Array(y_cpu_typed_ref) + @test rng_cpu_typed.offset == rng_cpu_typed_ref.offset == UInt64(128) + # Warm-up first call path so one-time compilation/backend init does not perturb RNG checks. AK.rand(BACKEND, Float32, 1; prefer_threads, block_size=64) @@ -391,6 +423,54 @@ end x = AK.rand(BACKEND, Float32, 64; prefer_threads, block_size=64) @test Array(x) == Array(ref) + # Auto-seeded convenience without explicit type should use backend-dependent default type. + Random.seed!(0x4242) + seed_default = Random.rand(Random.default_rng(), UInt64) + ref_default = AK.rand( + AK.CounterRNG(seed_default; alg=AK.Philox()), + BACKEND, + default_alloc_type, + 64; + prefer_threads, + block_size=64, + ) + Random.seed!(0x4242) + x_default = AK.rand(BACKEND, 64; prefer_threads, block_size=64) + @test eltype(x_default) === default_alloc_type + @test Array(x_default) == Array(ref_default) + + # Convenience without backend should default to CPU backend and Float64. + Random.seed!(0x4545) + seed_cpu_default = Random.rand(Random.default_rng(), UInt64) + ref_cpu_default = AK.rand( + AK.CounterRNG(seed_cpu_default; alg=AK.Philox()), + AK.get_backend([]), + Float64, + 64; + prefer_threads, + block_size=64, + ) + Random.seed!(0x4545) + x_cpu_default = AK.rand(64; prefer_threads, block_size=64) + @test eltype(x_cpu_default) === Float64 + @test Array(x_cpu_default) == Array(ref_cpu_default) + + # Type-only convenience should default to CPU backend. + Random.seed!(0x5656) + seed_cpu_typed = Random.rand(Random.default_rng(), UInt64) + ref_cpu_typed = AK.rand( + AK.CounterRNG(seed_cpu_typed; alg=AK.Philox()), + AK.get_backend([]), + Float32, + 64; + prefer_threads, + block_size=64, + ) + Random.seed!(0x5656) + x_cpu_typed_no_rng = AK.rand(Float32, 64; prefer_threads, block_size=64) + @test eltype(x_cpu_typed_no_rng) === Float32 + @test Array(x_cpu_typed_no_rng) == Array(ref_cpu_typed) + # Reseeding should reproduce the same auto-seeded draw. 
Random.seed!(0x7777) x1 = AK.rand(BACKEND, Float32, 64; prefer_threads, block_size=64) @@ -401,5 +481,8 @@ end @test_throws ArgumentError AK.rand(AK.CounterRNG(0x1), BACKEND, UInt128, 16; prefer_threads) @test_throws MethodError AK.rand(AK.CounterRNG(0x1), BACKEND, Float32, 16; prefer_threads, bad=:kwarg) @test_throws MethodError AK.rand(BACKEND, Float32, 16; prefer_threads, bad=:kwarg) + @test_throws MethodError AK.rand(BACKEND, 16; prefer_threads, bad=:kwarg) + @test_throws MethodError AK.rand(16; prefer_threads, bad=:kwarg) + @test_throws ArgumentError AK.rand() end end diff --git a/test/randn.jl b/test/randn.jl index 0d239b2..2d7b5ff 100644 --- a/test/randn.jl +++ b/test/randn.jl @@ -44,22 +44,22 @@ end @testset "randn" begin @testset "scalar helpers" begin - @test 0.0f0 < AK.uint32_to_open_unit_float32_midpoint(UInt32(0)) < 1.0f0 - @test 0.0f0 < AK.uint32_to_open_unit_float32_midpoint(typemax(UInt32)) < 1.0f0 + @test 0.0f0 < AK._uint32_to_open_unit_float32_midpoint(UInt32(0)) < 1.0f0 + @test 0.0f0 < AK._uint32_to_open_unit_float32_midpoint(typemax(UInt32)) < 1.0f0 if IS_CPU_BACKEND - @test 0.0 < AK.uint64_to_open_unit_float64_midpoint(UInt64(0)) < 1.0 - @test 0.0 < AK.uint64_to_open_unit_float64_midpoint(typemax(UInt64)) < 1.0 + @test 0.0 < AK._uint64_to_open_unit_float64_midpoint(UInt64(0)) < 1.0 + @test 0.0 < AK._uint64_to_open_unit_float64_midpoint(typemax(UInt64)) < 1.0 end seed = UInt64(0x123456789abcdef) for alg in RANDN_ALGS for counter in (UInt64(0), UInt64(1), UInt64(17), UInt64(1023)) - u32 = AK.rand_open01(seed, alg, counter, Float32) + u32 = AK.rand_float_open01(seed, alg, counter, Float32) @test 0.0f0 < u32 < 1.0f0 if IS_CPU_BACKEND - u64 = AK.rand_open01(seed, alg, counter, Float64) + u64 = AK.rand_float_open01(seed, alg, counter, Float64) @test 0.0 < u64 < 1.0 end end @@ -81,7 +81,7 @@ end end end - @test_throws ArgumentError AK.rand_open01(seed, AK.Philox(), UInt64(0), UInt32) + @test_throws ArgumentError AK.rand_float_open01(seed, AK.Philox(), 
UInt64(0), UInt32) @test_throws ArgumentError AK.randn_scalar(seed, AK.Philox(), UInt64(0), UInt32) end @@ -203,6 +203,8 @@ end @testset "randn allocation convenience" begin + default_alloc_type = IS_CPU_BACKEND ? Float64 : Float32 + rng = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox()) y = AK.randn(rng, BACKEND, Float32, Int32(6), UInt16(7); prefer_threads, block_size=64) @test size(y) == (6, 7) @@ -210,6 +212,16 @@ end @test _all_finite(Array(y)) @test rng.offset == UInt64(length(y)) + rng_default = AK.CounterRNG(UInt64(0x99); alg=AK.Philox()) + rng_default_ref = AK.CounterRNG(UInt64(0x99); alg=AK.Philox()) + y_default = AK.randn(rng_default, BACKEND, 128; prefer_threads, block_size=64) + y_default_ref = AK.randn( + rng_default_ref, BACKEND, default_alloc_type, 128; prefer_threads, block_size=64 + ) + @test eltype(y_default) === default_alloc_type + @test Array(y_default) == Array(y_default_ref) + @test rng_default.offset == rng_default_ref.offset == UInt64(128) + rng_alloc = AK.CounterRNG(UInt64(0x55); alg=AK.Philox()) rng_fill = AK.CounterRNG(UInt64(0x55); alg=AK.Philox()) y_alloc = AK.randn(rng_alloc, BACKEND, Float32, 128; prefer_threads, block_size=64) @@ -218,17 +230,87 @@ end @test Array(y_alloc) == Array(y_fill) @test rng_alloc.offset == rng_fill.offset == UInt64(128) + rng_cpu_default = AK.CounterRNG(UInt64(0x66); alg=AK.Philox()) + rng_cpu_default_ref = AK.CounterRNG(UInt64(0x66); alg=AK.Philox()) + y_cpu_default = AK.randn(rng_cpu_default, 128; prefer_threads, block_size=64) + y_cpu_default_ref = AK.randn( + rng_cpu_default_ref, AK.get_backend([]), 128; prefer_threads, block_size=64 + ) + @test eltype(y_cpu_default) === Float64 + @test Array(y_cpu_default) == Array(y_cpu_default_ref) + @test rng_cpu_default.offset == rng_cpu_default_ref.offset == UInt64(128) + + rng_cpu_typed = AK.CounterRNG(UInt64(0x77); alg=AK.Philox()) + rng_cpu_typed_ref = AK.CounterRNG(UInt64(0x77); alg=AK.Philox()) + y_cpu_typed = AK.randn(rng_cpu_typed, Float32, 128; 
prefer_threads, block_size=64) + y_cpu_typed_ref = AK.randn( + rng_cpu_typed_ref, AK.get_backend([]), Float32, 128; prefer_threads, block_size=64 + ) + @test eltype(y_cpu_typed) === Float32 + @test Array(y_cpu_typed) == Array(y_cpu_typed_ref) + @test rng_cpu_typed.offset == rng_cpu_typed_ref.offset == UInt64(128) + # Warm-up first call path so one-time compilation/backend init does not perturb RNG checks. AK.randn(BACKEND, Float32, 1; prefer_threads, block_size=64) # Auto-seeded constructor should match explicit seed capture from default RNG. Random.seed!(0x9abc) seed = Random.rand(Random.default_rng(), UInt64) - ref = AK.randn(AK.CounterRNG(seed; alg=AK.Philox()), BACKEND, Float32, 64; prefer_threads, block_size=64) + ref = AK.randn(AK.CounterRNG( + seed; alg=AK.Philox()), BACKEND, Float32, 64; prefer_threads, block_size=64 + ) Random.seed!(0x9abc) x = AK.randn(BACKEND, Float32, 64; prefer_threads, block_size=64) @test Array(x) == Array(ref) + # Auto-seeded convenience without explicit type should use backend-dependent default type. + Random.seed!(0x4242) + seed_default = Random.rand(Random.default_rng(), UInt64) + ref_default = AK.randn( + AK.CounterRNG(seed_default; alg=AK.Philox()), + BACKEND, + default_alloc_type, + 64; + prefer_threads, + block_size=64, + ) + Random.seed!(0x4242) + x_default = AK.randn(BACKEND, 64; prefer_threads, block_size=64) + @test eltype(x_default) === default_alloc_type + @test Array(x_default) == Array(ref_default) + + # Convenience without backend should default to CPU backend and Float64. 
+ Random.seed!(0x4545) + seed_cpu_default = Random.rand(Random.default_rng(), UInt64) + ref_cpu_default = AK.randn( + AK.CounterRNG(seed_cpu_default; alg=AK.Philox()), + AK.get_backend([]), + Float64, + 64; + prefer_threads, + block_size=64, + ) + Random.seed!(0x4545) + x_cpu_default = AK.randn(64; prefer_threads, block_size=64) + @test eltype(x_cpu_default) === Float64 + @test Array(x_cpu_default) == Array(ref_cpu_default) + + # Type-only convenience should default to CPU backend. + Random.seed!(0x5656) + seed_cpu_typed = Random.rand(Random.default_rng(), UInt64) + ref_cpu_typed = AK.randn( + AK.CounterRNG(seed_cpu_typed; alg=AK.Philox()), + AK.get_backend([]), + Float32, + 64; + prefer_threads, + block_size=64, + ) + Random.seed!(0x5656) + x_cpu_typed_no_rng = AK.randn(Float32, 64; prefer_threads, block_size=64) + @test eltype(x_cpu_typed_no_rng) === Float32 + @test Array(x_cpu_typed_no_rng) == Array(ref_cpu_typed) + # Reseeding should reproduce the same auto-seeded draw. Random.seed!(0x7777) x1 = AK.randn(BACKEND, Float32, 64; prefer_threads, block_size=64) @@ -237,8 +319,13 @@ end @test Array(x1) == Array(x2) @test_throws ArgumentError AK.randn(AK.CounterRNG(0x1), BACKEND, UInt32, 16; prefer_threads) - @test_throws MethodError AK.randn(AK.CounterRNG(0x1), BACKEND, Float32, 16; prefer_threads, bad=:kwarg) + @test_throws MethodError AK.randn( + AK.CounterRNG(0x1), BACKEND, Float32, 16; prefer_threads, bad=:kwarg + ) @test_throws MethodError AK.randn(BACKEND, Float32, 16; prefer_threads, bad=:kwarg) + @test_throws MethodError AK.randn(BACKEND, 16; prefer_threads, bad=:kwarg) + @test_throws MethodError AK.randn(16; prefer_threads, bad=:kwarg) + @test_throws ArgumentError AK.randn() end diff --git a/test/runtests.jl b/test/runtests.jl index 5d3a6ad..707afee 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -17,13 +17,7 @@ if "--CUDA" in ARGS const BACKEND = CUDABackend() TEST_DL[] = true elseif "--oneAPI" in ARGS - if Sys.iswindows() - # oneAPI v2.6.x can 
throw `UndefVarError: NEO_jll not defined` on native Windows. - # Pin to the latest known-good minor series until upstream fixes are available. - Pkg.add(name="oneAPI", version="2.5") - else - Pkg.add("oneAPI") - end + Pkg.add("oneAPI") using oneAPI oneAPI.versioninfo() const BACKEND = oneAPIBackend() From 9595086ed35d6005627eeaf8733c305ee20c0a76 Mon Sep 17 00:00:00 2001 From: fjbarter Date: Sat, 28 Mar 2026 00:08:13 +0000 Subject: [PATCH 18/18] document rand_uint and CounterRNGAlgorithm properly, improve RNG docs page --- docs/src/api/rand.md | 169 +++++++++++++++++++++++------------- prototype/rand/test_rand.jl | 26 ++---- src/rand/rand.jl | 9 ++ src/rand/utilities.jl | 28 ++++++ 4 files changed, 153 insertions(+), 79 deletions(-) diff --git a/docs/src/api/rand.md b/docs/src/api/rand.md index d0d4113..381a797 100644 --- a/docs/src/api/rand.md +++ b/docs/src/api/rand.md @@ -1,92 +1,141 @@ ### Random Number Generation -Counter-based random generation for CPU and GPU backends with deterministic stream behavior for +Counter-based random generation for CPU and GPU backends with deterministic stream behaviour for a fixed `seed`, algorithm, and call sequence. Both in-place and allocation forms are supported: - Uniform: `AK.rand!`, `AK.rand` - Standard normal: `AK.randn!`, `AK.randn` -`CounterRNG` carries an internal `offset` (starting at `0`) that advances by `length(v)` on each -`AK.rand!(rng, v)` call. This means chunked fills are stream-consistent: +`CounterRNG` stores: +- `seed::UInt64` +- algorithm `alg` +- stream `offset::UInt64` + +The offset starts at `0` and advances by the number of generated values after each call. For +`AK.rand!(rng, v)` and `AK.randn!(rng, v)`, element `v[i]` is generated from logical counter +`rng.offset + UInt64(i - 1)` in linear indexing order. + +This gives stream-consistent chunking: - filling `100` then `100` elements yields the same `200` values as one `200`-element fill. 
-- calls that share the same `CounterRNG` instance concurrently are not thread-safe. -- call `AK.reset!(rng)` to rewind a `CounterRNG` offset back to `0x0`. +- `AK.reset!(rng)` rewinds `rng.offset` to `0x0`. -`AK.rand!(rng, v)` accepts `rng::AK.CounterRNG`. -Passing other RNG container types is not supported and will throw a `MethodError`. +Calls that share the same `CounterRNG` instance concurrently are not thread-safe and may race on +`offset`. -Use an explicit `CounterRNG` when reproducibility is required. For -convenience, -`AK.rand!(v)` creates a fresh `CounterRNG()` on each call using one auto-seeded -`Base.rand(UInt64)` draw, so repeated calls produce different outputs unless Random.seed!() is used. -Likewise, `AK.rand(backend, args...)` creates a fresh auto-seeded `CounterRNG()` on each call. +`AK.rand!` and `AK.randn!` accept `rng::AK.CounterRNG`. Passing other RNG container types is not +supported and will throw a `MethodError`. -`AK.reset!(rng::AK.CounterRNG)` rewinds `rng.offset` to `0x0`. +#### Auto-seeded convenience behaviour -Allocation convenience: -- Canonical forms are `AK.rand(rng, backend, T, dims...)` and `AK.randn(rng, backend, T, dims...)`. -- Defaults are shared: omit `rng` -> fresh `CounterRNG()`; omit `backend` -> CPU backend; omit `T` -> `Float64` on CPU backend and `Float32` otherwise. -- Common shorthands include `AK.rand(dims...)`, `AK.rand(T, dims...)`, `AK.rand(backend, dims...)`, and the corresponding `AK.randn(...)` variants. -- For explicit `rng`, both `AK.rand` and `AK.randn` advance `rng.offset` by `prod(dims)`. +Use an explicit `CounterRNG` when reproducibility is required. -Custom algorithms: -- Define an algorithm type `MyAlg <: AK.CounterRNGAlgorithm`. 
-- Implement typed `rand_uint` methods: - - `AK.rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt32})::UInt32` - - `AK.rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt64})::UInt64` -- Use your algorithm via `AK.CounterRNG(seed; alg=MyAlg(), offset=...)`. +For convenience, calls without an explicit `rng` construct a fresh `CounterRNG()` on each call, +using one auto-seeded `Base.rand(UInt64)` draw. Therefore repeated bare calls intentionally produce +different outputs unless the global RNG is first re-seeded to the same state (e.g. `Random.seed!(1234)`). + +Examples: +- `AK.rand!(v)` +- `AK.randn!(v)` +- `AK.rand(backend, args...)` +- `AK.randn(backend, args...)` + +These do **not** continue a shared stream across calls unless you pass the same explicit +`CounterRNG`. -Both widths should be implemented so `AK.rand!` supports all integer/float output types without falling back or error. +#### Allocation forms -Supported element types: +Canonical forms: +- `AK.rand(rng, backend, T, dims...)` +- `AK.randn(rng, backend, T, dims...)` + +Shared defaults: +- omit `rng` -> fresh `CounterRNG()` +- omit `backend` -> CPU backend +- omit `T` -> `Float64` on CPU backend, `Float32` otherwise + +Common shorthands include: +- `AK.rand(dims...)` +- `AK.rand(T, dims...)` +- `AK.rand(backend, dims...)` +- and the corresponding `AK.randn(...)` variants + +For explicit `rng`, both `AK.rand` and `AK.randn` advance `rng.offset` by the number of generated +elements, i.e. `prod(dims)`. + +#### Supported element types + +`AK.rand!` / `AK.rand` support: - `UInt8`, `UInt16`, `UInt32`, `UInt64` - `Int8`, `Int16`, `Int32`, `Int64` - `Float16`, `Float32`, `Float64` - `Bool` -`AK.randn!` fills arrays with standard normal samples and currently supports: +`AK.randn!` / `AK.randn` currently support: - `Float16`, `Float32`, `Float64` -`AK.randn!` uses Box-Muller with open-interval uniforms in `(0, 1)` from a branch-free midpoint mapping.
+#### Value generation semantics + +The core generator produces either a `UInt32` or `UInt64`, depending on the requested output type. +That raw unsigned value is then mapped as follows: +- Unsigned integers: returned directly, or truncated if narrower +- Signed integers: the corresponding unsigned bit pattern reinterpreted as signed, then truncated if narrower +- Floats: mantissa construction onto a uniform grid in `[0, 1)` ([read more](https://lomont.org/posts/2017/unit-random/)) +- Bool: `true` if the raw `UInt` draw is odd (`isodd(u)`), otherwise `false` -`AK.randn!(v)` and `AK.randn(backend, args...)` create a fresh auto-seeded `CounterRNG()` on each -call, so repeated calls produce different outputs unless `Random.seed!()` is used. +`AK.randn!` uses Box-Muller with midpoint-mapped open-interval uniforms in `(0, 1)`. -The core of the random number generation produces either a `UInt32` or `UInt64` depending on the width of the requested element type. -That `UInt` is then either: -- Unsigned integers: returned as-is or truncated if necessary. -- Signed integers: reinterpreted as a signed integer bit pattern and truncated if necessary. -- Floats: mantissa construction into a uniform grid in `[0, 1)` ([read more](https://lomont.org/posts/2017/unit-random/)). -- Bool: `true` if the `UInt` draw is odd (`isodd(u)`), otherwise `false`. +#### Algorithms currently available -Algorithms currently available: - `SplitMix64` ([read more](https://rosettacode.org/wiki/Pseudo-random_numbers/Splitmix64)) - `Philox` ([read more](https://www.thesalmons.org/john/random123/papers/random123sc11.pdf)) - `Threefry` ([read more](https://www.thesalmons.org/john/random123/papers/random123sc11.pdf)) -Statistical-testing note: +`Philox` is the default algorithm for `CounterRNG()`. + +#### Statistical testing and security + - In this repository, `SplitMix64`, `Philox`, and `Threefry` have passed TestU01 BigCrush -- These generators are not intended to be cryptographically secure. 
+- These generators are not intended to be cryptographically secure + +#### Philox keying note + +AK uses `Philox2x32` internally, which has a single 32-bit Philox key word. + +Users may pass any non-negative `Integer` seed with `seed <= typemax(UInt64)`; AK converts it to +`UInt64` and derives the 32-bit Philox key using SplitMix. This wrapper choice is deliberate for +ease of use and deterministic streams, not a change to the Philox round function itself. -Philox keying note: -- AK uses `Philox2x32` internally (one 32-bit Philox key word). -- Users can pass any non-negative `Integer` seed; AK normalises to `UInt64` then derives the - 32-bit Philox key via a SplitMix-based mapping. -- This is a deliberate wrapper choice for ease of use (simple `seed` API with deterministic - streams), not a change to the Philox round function itself. -- Therefore, AK Philox streams are deterministic and high-quality, but not guaranteed to be - bit-for-bit identical to a raw Random123 Philox stream unless the same seed-to-key mapping and - counter convention are used. +Therefore, AK Philox streams are deterministic and high-quality, but are not guaranteed to be +bit-for-bit identical to a raw Random123 Philox stream unless the same seed-to-key mapping and +counter convention are used. -`Philox` is the default algorithm for `CounterRNG()` because it is thorough and very fast; it has been measured on par with `CUDA.rand!` and `SplitMix64` at ~390 GB/s on an Nvidia GeForce RTX -5060 (advertised 448 GB/s), i.e. effectively memory-bound throughput. 
+#### Custom algorithms + +To define a custom counter RNG: +- define an algorithm type `MyAlg <: AK.CounterRNGAlgorithm` +- implement: + - `AK.rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt32})::UInt32` + - `AK.rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt64})::UInt64` + +Then use it via: +- `AK.CounterRNG(seed; alg=MyAlg(), offset=...)` + +Both widths should be implemented so `AK.rand!` supports all integer and floating-point output +types without fallback or error. + +#### Performance note + +`Philox` is the default because it is high-quality and very fast. `AK.rand!` has been measured at +roughly memory-bound throughput (~390 GB/s) on an NVIDIA GeForce RTX 5060. In the benchmarks used +for this repository, it slightly outperformed cuRAND for large `CuArray{Float32}` fills and was +substantially faster than native `CUDA.rand!` for `CuArray{Int32}` fills. Examples: ```julia import AcceleratedKernels as AK using oneAPI -using ROCArray +using AMDGPU # Reproducible rng = AK.CounterRNG(0x12345678; alg=AK.Philox()) @@ -94,36 +143,38 @@ v = oneArray{Float32}(undef, 1024) AK.rand!(rng, v) # Stream-consistent chunking +rng = AK.CounterRNG(0x12345678; alg=AK.Philox()) v1 = oneArray{Float32}(undef, 100) v2 = oneArray{Float32}(undef, 100) AK.rand!(rng, v1) AK.rand!(rng, v2) -# Convenience (fresh auto-seeded RNG on each call) +# Convenience: fresh auto-seeded RNG on each call y = oneArray{Float32}(undef, 1024) AK.rand!(y) # Allocation form -y_cpu_auto = AK.rand(1024) # defaults to CPU, Vector{Float64} -y_oneArray = AK.rand(oneAPIBackend(), Float32, 1024) # fresh RNG, allocate and fill oneArray -y_cpu_typed = AK.rand(rng, Float16, 1024) # CPU backend, explicit type, explicit RNG +y_cpu_auto = AK.rand(1024) # CPU, Vector{Float64} +y_one = AK.rand(oneAPIBackend(), Float32, 1024) # fresh RNG, allocate + fill oneArray +y_cpu_typed = AK.rand(rng, Float16, 1024) # CPU backend, explicit type, explicit RNG # Standard normal filling z = 
ROCArray{Float32}(undef, 1024) AK.randn!(rng, z) # Standard normal allocation form -z_cpu_auto = AK.randn(1024) # defaults to CPU, Vector{Float64} -z_ROCArray = AK.randn(oneAPIBackend(), 1024) # allocate and fill ROCArray{Float32} -z_cpu_typed = AK.randn(rng, Float16, 1024) # CPU backend, explicit type, explicit RNG +z_cpu_auto = AK.randn(1024) # CPU, Vector{Float64} +z_roc = AK.randn(ROCBackend(), 1024) # fresh RNG, allocate + fill ROCArray{Float32} +z_cpu_typed = AK.randn(rng, Float16, 1024) # CPU backend, explicit type, explicit RNG ``` ```@docs AcceleratedKernels.CounterRNG AcceleratedKernels.CounterRNGAlgorithm +AcceleratedKernels.rand_uint AcceleratedKernels.reset! AcceleratedKernels.rand! AcceleratedKernels.rand AcceleratedKernels.randn! AcceleratedKernels.randn -``` +``` \ No newline at end of file diff --git a/prototype/rand/test_rand.jl b/prototype/rand/test_rand.jl index 5a7be0f..0d916f2 100644 --- a/prototype/rand/test_rand.jl +++ b/prototype/rand/test_rand.jl @@ -7,16 +7,13 @@ import AcceleratedKernels as AK const N = 100_000_000 const GPU_BLOCK_SIZE = 256 -const RNG_SPLITMIX = AK.CounterRNG(0x12345678; alg=AK.SplitMix64()) + const RNG_PHILOX = AK.CounterRNG(0x12345678; alg=AK.Philox()) -const RNG_THREEFRY = AK.CounterRNG(0x12345678; alg=AK.Threefry()) -TestType = Float32 +TestType = Float32 x_cuda = CuArray{TestType}(undef, N) -x_splitmix = CuArray{TestType}(undef, N) x_philox = CuArray{TestType}(undef, N) -x_threefry = CuArray{TestType}(undef, N) x_cpu = Vector{TestType}(undef, N) @@ -40,19 +37,11 @@ function run_ak_rand_cpu!(rng, x) end -# Julia base rand() gives [0, 1) and so does EVERYTHING ELSE EVER! but CuRAND gives (0, 1] ... 
-is_unit_interval(v) = all(x -> 0.0f0 <= x <= 1.0f0, v) - -# warmup compile +# warmup run_cuda_rand!(x_cuda) run_ak_rand_gpu!(RNG_PHILOX, x_philox) -run_ak_rand_gpu!(RNG_THREEFRY, x_threefry) -run_ak_rand_cpu!(RNG_SPLITMIX, x_cpu) +run_ak_rand_cpu!(RNG_PHILOX, x_cpu) -@assert is_unit_interval(Array(x_cuda)) -@assert is_unit_interval(Array(x_philox)) -@assert is_unit_interval(Array(x_threefry)) -@assert is_unit_interval(x_cpu) println("N = ", N) println("CPU threads: ", Threads.nthreads()) @@ -63,9 +52,6 @@ display(@benchmark run_cuda_rand!($x_cuda)) println("\nAK.rand! Philox benchmark (GPU, CuArray{$TestType})") display(@benchmark run_ak_rand_gpu!($RNG_PHILOX, $x_philox)) -println("\nAK.rand! Threefry benchmark (GPU, CuArray{$TestType})") -display(@benchmark run_ak_rand_gpu!($RNG_THREEFRY, $x_threefry)) - -println("\nAK.rand! benchmark (CPU, Vector{$TestType}, SplitMix64)") -display(@benchmark run_ak_rand_cpu!($RNG_SPLITMIX, $x_cpu)) +println("\nAK.rand! benchmark (CPU, Vector{$TestType}, Philox)") +display(@benchmark run_ak_rand_cpu!($RNG_PHILOX, $x_cpu)) diff --git a/src/rand/rand.jl b/src/rand/rand.jl index 06d3619..b2b14bd 100644 --- a/src/rand/rand.jl +++ b/src/rand/rand.jl @@ -5,6 +5,15 @@ const ALLOWED_RAND_SCALARS = Union{ Bool } + +""" + CounterRNGAlgorithm + +Abstract supertype for algorithms used by [`CounterRNG`](@ref). + +To define a custom counter-based RNG algorithm, subtype `CounterRNGAlgorithm` and implement +[`rand_uint`](@ref) for both `UInt32` and `UInt64` outputs. +""" abstract type CounterRNGAlgorithm end diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl index 9e8ccf5..35d9084 100644 --- a/src/rand/utilities.jl +++ b/src/rand/utilities.jl @@ -72,6 +72,34 @@ end Every RNG algorithm implements rand_uint(seed, alg, counter, UInt32/UInt64). This is the fallback for unsupported RNG algorithms. 
=# +""" + rand_uint(seed::UInt64, alg::CounterRNGAlgorithm, counter::UInt64, ::Type{UIntType}) -> UIntType + where {UIntType <: Union{UInt32, UInt64}} + +Low-level extension point for counter-based RNG algorithms used by [`CounterRNG`](@ref). + +`rand_uint` must deterministically map `(seed, alg, counter)` to a raw unsigned integer of the +requested width. Custom algorithms should implement methods for both: + +- `rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt32})::UInt32` +- `rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt64})::UInt64` + +These methods are used internally by [`rand!`](@ref), [`rand`](@ref), [`randn!`](@ref), and +[`randn`](@ref) to generate integers, floats, and normal samples. + +# Requirements +- The mapping must be deterministic for fixed `seed`, `alg`, and `counter`. +- Implement both `UInt32` and `UInt64` widths. +- The method should return raw random bits; higher-level type conversion is handled by AK separately. + +# Notes +- `counter` is the logical stream position (typically the array index). +- For block-based algorithms such as Philox or Threefry, the `UInt32` and `UInt64` methods may + share an internal block computation. +- The fallback method throws an `ArgumentError` for algorithms that do not implement `rand_uint`. + +See also: [`CounterRNGAlgorithm`](@ref), [`CounterRNG`](@ref). +""" @inline function rand_uint( ::UInt64, alg::CounterRNGAlgorithm,