From 20eea2c9ab6d9f994d1a57cf167344570365f4fc Mon Sep 17 00:00:00 2001
From: fjbarter <fjbarter@outlook.com>
Date: Mon, 16 Mar 2026 01:06:06 +0000
Subject: [PATCH 01/18] on-device rand! with three options for stateless
 counter-based RNG

---
 prototype/rand/Project.toml |   6 ++
 prototype/rand/test_rand.jl |  74 +++++++++++++++++++
 src/AcceleratedKernels.jl   |   1 +
 src/rand/philox.jl          |  37 ++++++++++
 src/rand/rand.jl            | 137 ++++++++++++++++++++++++++++++++++++
 src/rand/splitmix64.jl      |  26 +++++++
 src/rand/threefry.jl        |  49 +++++++++++++
 src/rand/utilities.jl       |  36 ++++++++++
 test/rand.jl                | 132 ++++++++++++++++++++++++++++++++++
 test/runtests.jl            |   1 +
 10 files changed, 499 insertions(+)
 create mode 100644 prototype/rand/Project.toml
 create mode 100644 prototype/rand/test_rand.jl
 create mode 100644 src/rand/philox.jl
 create mode 100644 src/rand/rand.jl
 create mode 100644 src/rand/splitmix64.jl
 create mode 100644 src/rand/threefry.jl
 create mode 100644 src/rand/utilities.jl
 create mode 100644 test/rand.jl

diff --git a/prototype/rand/Project.toml b/prototype/rand/Project.toml
new file mode 100644
index 0000000..7e92c89
--- /dev/null
+++ b/prototype/rand/Project.toml
@@ -0,0 +1,6 @@
+[deps]
+AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+PProf = "e4faabce-9ead-11e9-39d9-4379958e3056"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
diff --git a/prototype/rand/test_rand.jl b/prototype/rand/test_rand.jl
new file mode 100644
index 0000000..329214d
--- /dev/null
+++ b/prototype/rand/test_rand.jl
@@ -0,0 +1,74 @@
+using BenchmarkTools
+using CUDA
+
+import AcceleratedKernels as AK
+
+
+const N = 100_000_000
+const GPU_BLOCK_SIZE = 256
+
+const RNG_SPLITMIX = AK.CounterRNG(0x12345678; alg=AK.SplitMix64())
+const RNG_PHILOX = AK.CounterRNG(0x12345678; alg=AK.Philox())
+const RNG_THREEFRY = AK.CounterRNG(0x12345678; alg=AK.Threefry())
+
+x_cuda = CuArray{Float32}(undef, N)
+x_splitmix = CuArray{Float32}(undef, N)
+x_philox = CuArray{Float32}(undef, N)
+x_threefry = CuArray{Float32}(undef, N)
+x_cpu = Vector{Float32}(undef, N)
+
+
+function run_cuda_rand!(x)
+    CUDA.rand!(x)
+    CUDA.synchronize()
+    return x
+end
+
+
+function run_ak_rand_gpu!(rng, x)
+    AK.rand!(rng, x; block_size=GPU_BLOCK_SIZE)
+    AK.synchronize(AK.get_backend(x))
+    return x
+end
+
+
+function run_ak_rand_cpu!(rng, x)
+    AK.rand!(rng, x)
+    return x
+end
+
+
+# Base.rand() yields [0, 1) but cuRAND yields (0, 1], so this check accepts the closed interval [0, 1]
+is_unit_interval(v) = all(x -> 0.0f0 <= x <= 1.0f0, v)
+
+# warmup compile
+run_cuda_rand!(x_cuda)
+run_ak_rand_gpu!(RNG_SPLITMIX, x_splitmix)
+run_ak_rand_gpu!(RNG_PHILOX, x_philox)
+run_ak_rand_gpu!(RNG_THREEFRY, x_threefry)
+run_ak_rand_cpu!(RNG_SPLITMIX, x_cpu)
+
+@assert is_unit_interval(Array(x_cuda))
+@assert is_unit_interval(Array(x_splitmix))
+@assert is_unit_interval(Array(x_philox))
+@assert is_unit_interval(Array(x_threefry))
+@assert is_unit_interval(x_cpu)
+
+println("N = ", N)
+println("CPU threads: ", Threads.nthreads())
+
+println("\nCUDA.rand! benchmark (CuArray{Float32}, in-place)")
+display(@benchmark run_cuda_rand!($x_cuda))
+
+println("\nAK.rand! SplitMix64 benchmark (GPU, CuArray{Float32})")
+display(@benchmark run_ak_rand_gpu!($RNG_SPLITMIX, $x_splitmix))
+
+println("\nAK.rand! Philox benchmark (GPU, CuArray{Float32})")
+display(@benchmark run_ak_rand_gpu!($RNG_PHILOX, $x_philox))
+
+println("\nAK.rand! Threefry benchmark (GPU, CuArray{Float32})")
+display(@benchmark run_ak_rand_gpu!($RNG_THREEFRY, $x_threefry))
+
+println("\nAK.rand! benchmark (CPU, Vector{Float32}, SplitMix64)")
+display(@benchmark run_ak_rand_cpu!($RNG_SPLITMIX, $x_cpu))
+
diff --git a/src/AcceleratedKernels.jl b/src/AcceleratedKernels.jl
index d662c2a..06a401a 100644
--- a/src/AcceleratedKernels.jl
+++ b/src/AcceleratedKernels.jl
@@ -31,6 +31,7 @@ include("map.jl")
 include("sort/sort.jl")
 include("reduce/reduce.jl")
 include("accumulate/accumulate.jl")
+include("rand/rand.jl")
 include("searchsorted.jl")
 include("predicates.jl")
 include("arithmetics.jl")
diff --git a/src/rand/philox.jl b/src/rand/philox.jl
new file mode 100644
index 0000000..2b32d95
--- /dev/null
+++ b/src/rand/philox.jl
@@ -0,0 +1,37 @@
+struct Philox <: CounterRNGAlgorithm end
+
+
+# Philox magic numbers
+const PHILOX_M0 = UInt32(0xD256D193)
+const PHILOX_W0 = UInt32(0x9E3779B9)
+const PHILOX_ROUNDS = 10
+
+
+# One Philox2x32 round: (x0, x1) -> (hi(M0 * x0) xor k0 xor x1, lo(M0 * x0))
+@inline function _philox2x32_round(x0::UInt32, x1::UInt32, k0::UInt32)
+    lo = PHILOX_M0 * x0
+    hi = _mulhi_u32(PHILOX_M0, x0)
+    y0 = xor(xor(hi, k0), x1)
+    y1 = lo
+    return y0, y1
+end
+
+
+"""
+    rand_uint32(rng::CounterRNG{<:Unsigned, Philox}, counter::UInt64) -> UInt32
+"""
+@inline function rand_uint32(rng::CounterRNG{<:Unsigned, Philox}, counter::UInt64)::UInt32
+    x0 = _u32_lo(counter)
+    x1 = _u32_hi(counter)
+
+    seed = UInt64(rng.seed)
+    k0 = _u32_lo(seed)
+    x1 = xor(x1, _u32_hi(seed))
+
+    @inbounds for _ in 1:PHILOX_ROUNDS
+        x0, x1 = _philox2x32_round(x0, x1, k0)
+        k0 += PHILOX_W0
+    end
+
+    return x0
+end
diff --git a/src/rand/rand.jl b/src/rand/rand.jl
new file mode 100644
index 0000000..07952e1
--- /dev/null
+++ b/src/rand/rand.jl
@@ -0,0 +1,137 @@
+"""
+    abstract type AbstractCounterRNG end
+    abstract type CounterRNGAlgorithm end
+
+RNG interface for counter-based random generation with AcceleratedKernels.
+"""
+abstract type AbstractCounterRNG end
+abstract type CounterRNGAlgorithm end
+
+
+
+"""
+    CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=SplitMix64())
+
+Stateless counter-based RNG configuration for [`rand!`](@ref).
+
+`CounterRNG` is immutable and does not hold mutable thread-local or global state. Each generated
+value is a pure function of:
+- `seed`
+- logical linear element index
+- algorithm (`alg`)
+
+The default algorithm is `Philox()`.
+"""
+struct CounterRNG{K <: Unsigned, A <: CounterRNGAlgorithm} <: AbstractCounterRNG
+    seed::K
+    alg::A
+end
+
+
+function CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox())
+    @argcheck seed >= 0
+    CounterRNG(UInt64(seed), alg)
+end
+
+
+
+
+# Shared helpers
+include("utilities.jl")
+
+# Algorithm-specific integer generators
+include("splitmix64.jl")
+include("philox.jl")
+include("threefry.jl")
+
+
+
+
+
+function _rand_fill_threads!(
+    rng::AbstractCounterRNG,
+    x::AbstractArray{Float32};
+    max_tasks::Int,
+    min_elems::Int,
+)
+    task_partition(length(x), max_tasks, min_elems) do irange
+        @inbounds for i in irange
+            counter = _counter_from_index(i)
+            x[i] = uint32_to_unit_float32(rand_uint32(rng, counter))
+        end
+    end
+    return x
+end
+
+
+@kernel inbounds=true cpu=false unsafe_indices=true function _rand_fill_kernel!(
+    rng,
+    x,
+)
+    i = @index(Global, Linear)
+    if i <= length(x)
+        counter = _counter_from_index(i)
+        x[i] = uint32_to_unit_float32(rand_uint32(rng, counter))
+    end
+end
+
+
+function _rand_fill_gpu!(
+    rng::AbstractCounterRNG,
+    x::AbstractArray{Float32},
+    backend::Backend;
+    block_size::Int,
+)
+    @argcheck block_size > 0
+    len = length(x)
+    len == 0 && return x
+
+    blocks = div(len, block_size, RoundUp)
+    kernel! = _rand_fill_kernel!(backend, block_size)
+    kernel!(rng, x, ndrange=(blocks * block_size,))
+    return x
+end
+
+
+"""
+    rand!(
+        rng::AbstractCounterRNG,
+        x::AbstractArray{Float32},
+        backend::Backend=get_backend(x);
+
+        # CPU settings
+        max_tasks::Int=Threads.nthreads(),
+        min_elems::Int=1,
+
+        # Implementation choice
+        prefer_threads::Bool=true,
+
+        # GPU settings
+        block_size::Int=256,
+    )
+
+Fill `x` in-place with pseudo-random `Float32` values in `[0, 1)` using a stateless counter-based
+RNG. For `x[i]`, the counter is exactly `UInt64(i - 1)` in linear indexing order.
+
+The float conversion is mantissa-based: uniform over the produced mantissa grid, not over all
+representable `Float32` values in `[0, 1)`.
+"""
+function rand!(
+    rng::AbstractCounterRNG,
+    x::AbstractArray{Float32},
+    backend::Backend=get_backend(x);
+
+    # CPU settings
+    max_tasks::Int=Threads.nthreads(),
+    min_elems::Int=1,
+    prefer_threads::Bool=true,
+
+    # GPU settings
+    block_size::Int=256,
+)
+    if use_gpu_algorithm(backend, prefer_threads)
+        _rand_fill_gpu!(rng, x, backend; block_size)
+    else
+        _rand_fill_threads!(rng, x; max_tasks, min_elems)
+    end
+end
diff --git a/src/rand/splitmix64.jl b/src/rand/splitmix64.jl
new file mode 100644
index 0000000..165f72f
--- /dev/null
+++ b/src/rand/splitmix64.jl
@@ -0,0 +1,26 @@
+struct SplitMix64 <: CounterRNGAlgorithm end
+
+# SplitMix64 magic numbers
+const SPLITMIX64_INCREMENT = UInt64(0x9e3779b97f4a7c15)
+const SPLITMIX64_MIX_A = UInt64(0xbf58476d1ce4e5b9)
+const SPLITMIX64_MIX_B = UInt64(0x94d049bb133111eb)
+
+
+@inline function _splitmix64_mix(x::UInt64)::UInt64
+    x = xor(x, x >> 30)
+    x *= SPLITMIX64_MIX_A
+    x = xor(x, x >> 27)
+    x *= SPLITMIX64_MIX_B
+    x = xor(x, x >> 31)
+    return x
+end
+
+
+"""
+    rand_uint32(rng::CounterRNG{<:Unsigned, SplitMix64}, counter::UInt64) -> UInt32
+"""
+@inline function rand_uint32(rng::CounterRNG{<:Unsigned, SplitMix64}, counter::UInt64)::UInt32
+    seed = UInt64(rng.seed)
+    mixed = _splitmix64_mix(counter + seed + SPLITMIX64_INCREMENT)
+    return UInt32(mixed >> 32)
+end
diff --git a/src/rand/threefry.jl b/src/rand/threefry.jl
new file mode 100644
index 0000000..0a65dc3
--- /dev/null
+++ b/src/rand/threefry.jl
@@ -0,0 +1,49 @@
+struct Threefry <: CounterRNGAlgorithm end
+
+# Threefry magic numbers
+const THREEFRY_PARITY = UInt32(0x1BD11BDA)
+const THREEFRY_ROTATIONS = (
+    UInt32(13), UInt32(15), UInt32(26), UInt32(6),
+    UInt32(17), UInt32(29), UInt32(16), UInt32(24),
+)
+const THREEFRY_ROUNDS = 20
+
+
+@inline function _threefry_key_word(k0::UInt32, k1::UInt32, k2::UInt32, idx::Int)::UInt32
+    idx == 0 && return k0
+    idx == 1 && return k1
+    return k2
+end
+
+
+"""
+    rand_uint32(rng::CounterRNG{<:Unsigned, Threefry}, counter::UInt64) -> UInt32
+"""
+@inline function rand_uint32(rng::CounterRNG{<:Unsigned, Threefry}, counter::UInt64)::UInt32
+    x0 = _u32_lo(counter)
+    x1 = _u32_hi(counter)
+
+    seed = UInt64(rng.seed)
+    k0 = _u32_lo(seed)
+    k1 = _u32_hi(seed)
+    k2 = xor(THREEFRY_PARITY, xor(k0, k1))
+
+    x0 += k0
+    x1 += k1
+
+    @inbounds for round in 0:(THREEFRY_ROUNDS - 1)
+        rot = THREEFRY_ROTATIONS[(round & 0x7) + 1]
+        x0 += x1
+        x1 = xor(_rotl32(x1, rot), x0)
+
+        if (round & 0x3) == 3
+            s = (round >>> 2) + 1
+            i0 = s % 3
+            i1 = (s + 1) % 3
+            x0 += _threefry_key_word(k0, k1, k2, i0)
+            x1 += _threefry_key_word(k0, k1, k2, i1) + UInt32(s)
+        end
+    end
+
+    return x0
+end
diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl
new file mode 100644
index 0000000..35b72f0
--- /dev/null
+++ b/src/rand/utilities.jl
@@ -0,0 +1,36 @@
+# lo: rightmost 32 bits, hi: leftmost 32 bits
+@inline _u32_lo(x::UInt64)::UInt32 = UInt32(x & UInt64(0xffffffff))
+@inline _u32_hi(x::UInt64)::UInt32 = UInt32(x >> 32)
+
+# leftmost 32 bits of a*b cast to UInt64s
+@inline _mulhi_u32(a::UInt32, b::UInt32)::UInt32 = UInt32((UInt64(a) * UInt64(b)) >> 32)
+
+
+@inline function _rotl32(x::UInt32, r::UInt32)::UInt32
+    return (x << r) | (x >> (UInt32(32) - r))
+end
+
+
+@inline _counter_from_index(i)::UInt64 = UInt64(i - one(i))
+
+
+@inline function rand_uint32(::AbstractCounterRNG, ::UInt64)::UInt32
+    # Unrecognised AbstractCounterRNG
+    throw(ArgumentError("No rand_uint32 implementation for this RNG type"))
+end
+
+
+
+"""
+    uint32_to_unit_float32(u::UInt32) -> Float32
+
+Convert a random `UInt32` to `Float32` in `[0, 1)` by mantissa construction.
+"""
+@inline function uint32_to_unit_float32(u::UInt32)::Float32
+    # Keep 23 random bits for the mantissa (drop 9 rightmost bits from the UInt32)
+    # and combine with the bit pattern of 1.0f0 (sign=0, exponent=127).
+    bits = UInt32(0x3f800000) | (u >> 9)
+
+    # Interpret as 1.mantissa, then subtract 1 for [0, 1)
+    return reinterpret(Float32, bits) - 1.0f0
+end
diff --git a/test/rand.jl b/test/rand.jl
new file mode 100644
index 0000000..7fefdff
--- /dev/null
+++ b/test/rand.jl
@@ -0,0 +1,132 @@
+function _is_unit_interval(v)
+    for x in v
+        if isnan(x) || x < 0.0f0 || x >= 1.0f0
+            return false
+        end
+    end
+    return true
+end
+
+function _rand_fill_reference!(rng, x::AbstractArray{Float32})
+    @inbounds for i in eachindex(x)
+        counter = UInt64(i - one(i))
+        x[i] = AK.uint32_to_unit_float32(AK.rand_uint32(rng, counter))
+    end
+    return x
+end
+
+@testset "rand" begin
+    @test AK.CounterRNG{AK.SplitMix64}(0x1) isa AK.CounterRNG{AK.SplitMix64, UInt64}
+    @test AK.CounterRNG{AK.Philox}(UInt32(0x1)) isa AK.CounterRNG{AK.Philox, UInt32}
+    @test AK.CounterRNG{AK.Threefry, UInt16}(123) isa AK.CounterRNG{AK.Threefry, UInt16}
+    @test_throws ArgumentError AK.CounterRNG{AK.SplitMix64, UInt8}(300)
+
+    rng_algs = (AK.SplitMix64(), AK.Philox(), AK.Threefry())
+
+    for alg in rng_algs
+        rng_alg = AK.CounterRNG(0x123456789abcdef; alg)
+        @test AK.rand_uint32(rng_alg, UInt64(0)) == AK.rand_uint32(rng_alg, UInt64(0))
+        @test AK.rand_uint32(rng_alg, UInt64(1)) != AK.rand_uint32(rng_alg, UInt64(0))
+
+        vals_alg = [AK.rand_uint32(rng_alg, UInt64(i)) for i in 0:1023]
+        @test length(unique(vals_alg)) == length(vals_alg)
+
+        x_alg = array_from_host(zeros(Float32, 2048))
+        AK.rand!(rng_alg, x_alg; prefer_threads, block_size=64)
+        @test _is_unit_interval(Array(x_alg))
+    end
+
+    rng = AK.CounterRNG(0x123456789abcdef)
+
+    @test AK.rand_uint32(rng, UInt64(0)) == AK.rand_uint32(rng, UInt64(0))
+    @test AK.rand_uint32(rng, UInt64(1)) != AK.rand_uint32(rng, UInt64(0))
+    @test AK.rand_uint32(rng, UInt64(17)) != AK.rand_uint32(rng, UInt64(18))
+
+    vals = [AK.rand_uint32(rng, UInt64(i)) for i in 0:2047]
+    @test length(unique(vals)) == length(vals)
+
+    for u in (
+        UInt32(0x00000000),
+        UInt32(0x00000001),
+        UInt32(0x7fffffff),
+        UInt32(0x80000000),
+        UInt32(0xffffffff),
+    )
+        x = AK.uint32_to_unit_float32(u)
+        @test !isnan(x)
+        @test 0.0f0 <= x < 1.0f0
+    end
+
+    lengths = (0, 1, 31, 32, 33, 1024, 1025)
+    for len in lengths
+        x = array_from_host(zeros(Float32, len))
+        AK.rand!(rng, x; prefer_threads, block_size=64)
+        xh = Array(x)
+
+        ref = zeros(Float32, len)
+        _rand_fill_reference!(rng, ref)
+
+        @test xh == ref
+        @test _is_unit_interval(xh)
+    end
+
+    x1 = array_from_host(zeros(Float32, 4096))
+    x2 = array_from_host(zeros(Float32, 4096))
+    AK.rand!(rng, x1; prefer_threads, block_size=64)
+    AK.rand!(rng, x2; prefer_threads, block_size=257)
+    @test Array(x1) == Array(x2)
+
+    rng2 = AK.CounterRNG(rng.seed + UInt64(1))
+    x3 = array_from_host(zeros(Float32, 4096))
+    AK.rand!(rng2, x3; prefer_threads, block_size=64)
+    @test Array(x3) != Array(x1)
+
+    xnd = array_from_host(zeros(Float32, 7, 11, 5))
+    AK.rand!(rng, xnd; prefer_threads, block_size=128)
+    xndh = Array(xnd)
+    refnd = zeros(Float32, 7, 11, 5)
+    _rand_fill_reference!(rng, refnd)
+    @test xndh == refnd
+
+    if IS_CPU_BACKEND
+        base = zeros(Float32, 64)
+        view_x = @view base[2:2:end]
+        AK.rand!(
+            rng,
+            view_x;
+            max_tasks=Threads.nthreads(),
+            min_elems=1,
+            prefer_threads=true,
+        )
+
+        ref_view = zeros(Float32, length(view_x))
+        _rand_fill_reference!(rng, ref_view)
+        @test collect(view_x) == ref_view
+    end
+
+    nstats = 200_000
+    xstats = array_from_host(zeros(Float32, nstats))
+    AK.rand!(rng, xstats; prefer_threads, block_size=256)
+    xh = Array(xstats)
+
+    @test _is_unit_interval(xh)
+
+    m = sum(xh) / nstats
+    v = sum((x - m)^2 for x in xh) / nstats
+    @test abs(m - 0.5) < 0.01
+    @test abs(v - (1 / 12)) < 0.01
+
+    nbins = 16
+    counts = zeros(Int, nbins)
+    for x in xh
+        ibin = Int(floor(x * nbins)) + 1
+        ibin = min(ibin, nbins)
+        counts[ibin] += 1
+    end
+    expected = nstats / nbins
+    max_rel_dev = maximum(abs(c - expected) / expected for c in counts)
+    @test max_rel_dev < 0.1
+
+    x64 = array_from_host(zeros(Float64, 16))
+    @test_throws MethodError AK.rand!(rng, x64; prefer_threads)
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 716fd8e..a2707b6 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -69,6 +69,7 @@ end
 include("partition.jl")
 include("looping.jl")
 include("map.jl")
+include("rand.jl")
 include("sort.jl")
 include("reduce.jl")
 include("accumulate.jl")

From 3c3e81a6ca8f48ccd2b8d93180b50af9691c1b74 Mon Sep 17 00:00:00 2001
From: fjbarter <fjbarter@outlook.com>
Date: Mon, 16 Mar 2026 16:51:00 +0000
Subject: [PATCH 02/18] add convenience for AK.rand!(X), and single UInt64 seed
 construction for fresh fills with simple calls. determinism obviously remains
 if an rng is passed

---
 src/rand/philox.jl     |  33 ++++++-
 src/rand/rand.jl       |  89 ++++++++++++++----
 src/rand/splitmix64.jl |  23 ++++-
 src/rand/threefry.jl   |  33 ++++++-
 src/rand/utilities.jl  |  63 ++++++++++++-
 test/rand.jl           | 204 +++++++++++++++++++++++++++++------------
 6 files changed, 357 insertions(+), 88 deletions(-)

diff --git a/src/rand/philox.jl b/src/rand/philox.jl
index 2b32d95..8137f64 100644
--- a/src/rand/philox.jl
+++ b/src/rand/philox.jl
@@ -18,9 +18,12 @@ end
 
 
 """
-    rand_uint32(rng::CounterRNG{<:Unsigned, Philox}, counter::UInt64) -> UInt32
+    _philox2x32_block(rng::CounterRNG{<:Philox}, counter::UInt64)
 """
-@inline function rand_uint32(rng::CounterRNG{<:Unsigned, Philox}, counter::UInt64)::UInt32
+@inline function _philox2x32_block(
+    rng::CounterRNG{<:Philox},
+    counter::UInt64,
+)::Tuple{UInt32, UInt32}
     x0 = _u32_lo(counter)
     x1 = _u32_hi(counter)
 
@@ -33,5 +36,31 @@ end
         k0 += PHILOX_W0
     end
 
+    return x0, x1
+end
+
+
+"""
+    rand_uint(rng::CounterRNG{<:Philox}, counter::UInt64, UInt32) -> UInt32
+"""
+@inline function rand_uint(
+    rng::CounterRNG{<:Philox},
+    counter::UInt64,
+    ::Type{UInt32},
+)::UInt32
+    x0, _ = _philox2x32_block(rng, counter)
     return x0
 end
+
+
+"""
+    rand_uint(rng::CounterRNG{<:Philox}, counter::UInt64, UInt64) -> UInt64
+"""
+@inline function rand_uint(
+    rng::CounterRNG{<:Philox},
+    counter::UInt64,
+    ::Type{UInt64},
+)::UInt64
+    x0, x1 = _philox2x32_block(rng, counter)
+    return _u64_from_u32(x0, x1)
+end
diff --git a/src/rand/rand.jl b/src/rand/rand.jl
index 07952e1..0e29452 100644
--- a/src/rand/rand.jl
+++ b/src/rand/rand.jl
@@ -10,7 +10,7 @@ abstract type CounterRNGAlgorithm end
 
 
 """
-    CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=SplitMix64())
+    CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox())
 
 Stateless counter-based RNG configuration for [`rand!`](@ref).
 
@@ -21,13 +21,20 @@ value is a pure function of:
 - algorithm (`alg`)
 
 The default algorithm is `Philox()`.
+
+`seed` may be any non-negative `Integer`. It is normalised to `UInt64` internally.
 """
-struct CounterRNG{K <: Unsigned, A <: CounterRNGAlgorithm} <: AbstractCounterRNG
-    seed::K
+struct CounterRNG{A <: CounterRNGAlgorithm} <: AbstractCounterRNG
+    seed::UInt64
     alg::A
 end
 
 
+function CounterRNG(seed::Unsigned; alg::CounterRNGAlgorithm=Philox())
+    CounterRNG(UInt64(seed), alg)
+end
+
+
 function CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox())
     @argcheck seed >= 0
     CounterRNG(UInt64(seed), alg)
@@ -35,6 +42,20 @@ end
 
 
 
+"""
+    CounterRNG(; alg::CounterRNGAlgorithm=SplitMix64())
+
+Create a stateless counter-based RNG with an automatically generated seed.
+
+The seed is sampled exactly once at construction using `rand(UInt64)`. Reusing this same
+`CounterRNG` instance is deterministic for fixed seed, algorithm, array shape, and eltype.
+"""
+function CounterRNG(; alg::CounterRNGAlgorithm=SplitMix64())
+    CounterRNG(Base.rand(UInt64); alg)
+end
+
+
+
 
 # Shared helpers
 include("utilities.jl")
@@ -50,14 +71,14 @@ include("threefry.jl")
 
 function _rand_fill_threads!(
     rng::AbstractCounterRNG,
-    x::AbstractArray{Float32};
+    x::AbstractArray{T};
     max_tasks::Int,
     min_elems::Int,
-)
+) where {T <: ALLOWED_RAND_SCALARS}
     task_partition(length(x), max_tasks, min_elems) do irange
         @inbounds for i in irange
             counter = _counter_from_index(i)
-            x[i] = uint32_to_unit_float32(rand_uint32(rng, counter))
+            x[i] = rand_scalar(rng, counter, T)
         end
     end
     return x
@@ -71,17 +92,17 @@ end
     i = @index(Global, Linear)
     if i <= length(x)
         counter = _counter_from_index(i)
-        x[i] = uint32_to_unit_float32(rand_uint32(rng, counter))
+        x[i] = rand_scalar(rng, counter, eltype(x))
     end
 end
 
 
 function _rand_fill_gpu!(
     rng::AbstractCounterRNG,
-    x::AbstractArray{Float32},
+    x::AbstractArray{T},
     backend::Backend;
     block_size::Int,
-)
+) where {T <: ALLOWED_RAND_SCALARS}
     @argcheck block_size > 0
     len = length(x)
     len == 0 && return x
@@ -96,7 +117,7 @@ end
 """
     rand!(
         rng::AbstractCounterRNG,
-        x::AbstractArray{Float32},
+        x::AbstractArray{T},
         backend::Backend=get_backend(x);
 
         # CPU settings
@@ -110,15 +131,23 @@ end
         block_size::Int=256,
     )
 
-Fill `x` in-place with pseudo-random `Float32` values in `[0, 1)` using a stateless counter-based
-RNG. For `x[i]`, the counter is exactly `UInt64(i - 1)` in linear indexing order.
+Fill `x` in-place with pseudo-random values using a stateless counter-based RNG. For `x[i]`, the
+counter is exactly `UInt64(i - 1)` in linear indexing order.
 
-The float conversion is mantissa-based: uniform over the produced mantissa grid, not over all
-representable `Float32` values in `[0, 1)`.
+Supported scalar element types are:
+- `UInt32`, `UInt64`
+- `Int32`, `Int64`
+- `Float32`, `Float64`
+
+Semantics:
+- Unsigned integers: raw random bit patterns of requested width.
+- Signed integers: corresponding unsigned patterns reinterpreted as signed.
+- Floats: mantissa-based conversion from `UInt32`/`UInt64` into `[0, 1)`, uniform over the
+  produced mantissa grid (not over all representable floats).
 """
 function rand!(
     rng::AbstractCounterRNG,
-    x::AbstractArray{Float32},
+    x::AbstractArray{T},
     backend::Backend=get_backend(x);
 
     # CPU settings
@@ -128,10 +157,34 @@ function rand!(
 
     # GPU settings
     block_size::Int=256,
-)
+) where T
+
+    @argcheck T <: ALLOWED_RAND_SCALARS "Unsupported eltype $T. Supported: $(ALLOWED_RAND_SCALARS)"
+
     if use_gpu_algorithm(backend, prefer_threads)
-        _rand_fill_gpu!(rng, x, backend; block_size)
+        return _rand_fill_gpu!(rng, x, backend; block_size)
     else
-        _rand_fill_threads!(rng, x; max_tasks, min_elems)
+        return _rand_fill_threads!(rng, x; max_tasks, min_elems)
     end
 end
+
+
+"""
+    rand!(
+        x::AbstractArray{T},
+        args...;
+        kwargs...,
+    )
+
+Convenience overload that creates a fresh `CounterRNG()` and fills `x`.
+
+Each call to `rand!(x, ...)` auto-seeds a new RNG once using `rand(UInt64)`, so repeated calls
+produce different outputs unless an explicit `CounterRNG` is provided.
+"""
+function rand!(
+    x::AbstractArray,
+    args...;
+    kwargs...,
+)
+    return rand!(CounterRNG(), x, args...; kwargs...)
+end
diff --git a/src/rand/splitmix64.jl b/src/rand/splitmix64.jl
index 165f72f..0b94f31 100644
--- a/src/rand/splitmix64.jl
+++ b/src/rand/splitmix64.jl
@@ -17,10 +17,25 @@ end
 
 
 """
-    rand_uint32(rng::CounterRNG{<:Unsigned, SplitMix64}, counter::UInt64) -> UInt32
+    rand_uint(rng::CounterRNG{<:SplitMix64}, counter::UInt64, UInt64) -> UInt64
 """
-@inline function rand_uint32(rng::CounterRNG{<:Unsigned, SplitMix64}, counter::UInt64)::UInt32
+@inline function rand_uint(
+    rng::CounterRNG{<:SplitMix64},
+    counter::UInt64,
+    ::Type{UInt64},
+)::UInt64
     seed = UInt64(rng.seed)
-    mixed = _splitmix64_mix(counter + seed + SPLITMIX64_INCREMENT)
-    return UInt32(mixed >> 32)
+    return _splitmix64_mix(counter + seed + SPLITMIX64_INCREMENT)
+end
+
+
+"""
+    rand_uint(rng::CounterRNG{<:SplitMix64}, counter::UInt64, UInt32) -> UInt32
+"""
+@inline function rand_uint(
+    rng::CounterRNG{<:SplitMix64},
+    counter::UInt64,
+    ::Type{UInt32},
+)::UInt32
+    return _u32_hi(rand_uint(rng, counter, UInt64))
 end
diff --git a/src/rand/threefry.jl b/src/rand/threefry.jl
index 0a65dc3..4b1e5af 100644
--- a/src/rand/threefry.jl
+++ b/src/rand/threefry.jl
@@ -17,9 +17,12 @@ end
 
 
 """
-    rand_uint32(rng::CounterRNG{<:Unsigned, Threefry}, counter::UInt64) -> UInt32
+    _threefry2x32_block(rng::CounterRNG{<:Threefry}, counter::UInt64)
 """
-@inline function rand_uint32(rng::CounterRNG{<:Unsigned, Threefry}, counter::UInt64)::UInt32
+@inline function _threefry2x32_block(
+    rng::CounterRNG{<:Threefry},
+    counter::UInt64,
+)::Tuple{UInt32, UInt32}
     x0 = _u32_lo(counter)
     x1 = _u32_hi(counter)
 
@@ -45,5 +48,31 @@ end
         end
     end
 
+    return x0, x1
+end
+
+
+"""
+    rand_uint(rng::CounterRNG{<:Threefry}, counter::UInt64, UInt32) -> UInt32
+"""
+@inline function rand_uint(
+    rng::CounterRNG{<:Threefry},
+    counter::UInt64,
+    ::Type{UInt32},
+)::UInt32
+    x0, _ = _threefry2x32_block(rng, counter)
     return x0
 end
+
+
+"""
+    rand_uint(rng::CounterRNG{<:Threefry}, counter::UInt64, UInt64) -> UInt64
+"""
+@inline function rand_uint(
+    rng::CounterRNG{<:Threefry},
+    counter::UInt64,
+    ::Type{UInt64},
+)::UInt64
+    x0, x1 = _threefry2x32_block(rng, counter)
+    return _u64_from_u32(x0, x1)
+end
diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl
index 35b72f0..1f27c95 100644
--- a/src/rand/utilities.jl
+++ b/src/rand/utilities.jl
@@ -1,6 +1,7 @@
 # lo: rightmost 32 bits, hi: leftmost 32 bits
 @inline _u32_lo(x::UInt64)::UInt32 = UInt32(x & UInt64(0xffffffff))
 @inline _u32_hi(x::UInt64)::UInt32 = UInt32(x >> 32)
+@inline _u64_from_u32(lo::UInt32, hi::UInt32)::UInt64 = (UInt64(hi) << 32) | UInt64(lo)
 
 # leftmost 32 bits of a*b cast to UInt64s
 @inline _mulhi_u32(a::UInt32, b::UInt32)::UInt32 = UInt32((UInt64(a) * UInt64(b)) >> 32)
@@ -14,9 +15,50 @@ end
 @inline _counter_from_index(i)::UInt64 = UInt64(i - one(i))
 
 
-@inline function rand_uint32(::AbstractCounterRNG, ::UInt64)::UInt32
-    # Unrecognised AbstractCounterRNG
-    throw(ArgumentError("No rand_uint32 implementation for this RNG type"))
+"""
+    ALLOWED_RAND_SCALARS
+
+Internal scalar eltypes currently supported by [`rand!`](@ref).
+"""
+const ALLOWED_RAND_SCALARS = Union{
+    UInt32, UInt64,
+    Int32, Int64,
+    Float32, Float64,
+}
+
+
+@inline function rand_uint(::AbstractCounterRNG, ::UInt64, ::Type{U})::U where {U <: Union{UInt32, UInt64}}
+    throw(ArgumentError("No rand_uint implementation for this RNG type"))
+end
+
+
+@inline raw_uint_type(::Type{UInt32}) = UInt32
+@inline raw_uint_type(::Type{Int32}) = UInt32
+@inline raw_uint_type(::Type{Float32}) = UInt32
+@inline raw_uint_type(::Type{UInt64}) = UInt64
+@inline raw_uint_type(::Type{Int64}) = UInt64
+@inline raw_uint_type(::Type{Float64}) = UInt64
+
+
+@inline from_uint(::Type{UInt32}, u::UInt32)::UInt32 = u
+@inline from_uint(::Type{UInt64}, u::UInt64)::UInt64 = u
+@inline from_uint(::Type{Int32}, u::UInt32)::Int32 = reinterpret(Int32, u)
+@inline from_uint(::Type{Int64}, u::UInt64)::Int64 = reinterpret(Int64, u)
+@inline from_uint(::Type{Float32}, u::UInt32)::Float32 = uint32_to_unit_float32(u)
+@inline from_uint(::Type{Float64}, u::UInt64)::Float64 = uint64_to_unit_float64(u)
+
+
+@inline function rand_scalar(rng::AbstractCounterRNG, counter::UInt64, ::Type{T})::T where {T <: ALLOWED_RAND_SCALARS}
+    U = raw_uint_type(T)
+    u = rand_uint(rng, counter, U)
+    return from_uint(T, u)
+end
+
+
+@inline function rand_scalar(::AbstractCounterRNG, ::UInt64, ::Type{T}) where {T}
+    throw(ArgumentError(
+        "Unsupported random scalar type $(T). Supported: UInt32, UInt64, Int32, Int64, Float32, Float64."
+    ))
 end
 
 
@@ -34,3 +76,18 @@ Convert a random `UInt32` to `Float32` in `[0, 1)` by mantissa construction.
     # Interpret as 1.mantissa, then subtract 1 for [0, 1)
     return reinterpret(Float32, bits) - 1.0f0
 end
+
+
+"""
+    uint64_to_unit_float64(u::UInt64) -> Float64
+
+Convert a random `UInt64` to `Float64` in `[0, 1)` by mantissa construction.
+"""
+@inline function uint64_to_unit_float64(u::UInt64)::Float64
+    # Keep 52 random bits for the mantissa (drop 12 rightmost bits from the UInt64)
+    # and combine with the bit pattern of 1.0 (sign=0, exponent=1023).
+    bits = UInt64(0x3ff0000000000000) | (u >> 12)
+
+    # Interpret as 1.mantissa, then subtract 1 for [0, 1)
+    return reinterpret(Float64, bits) - 1.0
+end
diff --git a/test/rand.jl b/test/rand.jl
index 7fefdff..339a2cb 100644
--- a/test/rand.jl
+++ b/test/rand.jl
@@ -1,49 +1,100 @@
 function _is_unit_interval(v)
     for x in v
-        if isnan(x) || x < 0.0f0 || x >= 1.0f0
+        if isnan(x) || x < zero(x) || x >= one(x)
             return false
         end
     end
     return true
 end
 
-function _rand_fill_reference!(rng, x::AbstractArray{Float32})
+function _rand_fill_reference!(rng, x::AbstractArray{T}) where {T <: AK.ALLOWED_RAND_SCALARS}
     @inbounds for i in eachindex(x)
         counter = UInt64(i - one(i))
-        x[i] = AK.uint32_to_unit_float32(AK.rand_uint32(rng, counter))
+        x[i] = AK.rand_scalar(rng, counter, T)
     end
     return x
 end
 
 @testset "rand" begin
-    @test AK.CounterRNG{AK.SplitMix64}(0x1) isa AK.CounterRNG{AK.SplitMix64, UInt64}
-    @test AK.CounterRNG{AK.Philox}(UInt32(0x1)) isa AK.CounterRNG{AK.Philox, UInt32}
-    @test AK.CounterRNG{AK.Threefry, UInt16}(123) isa AK.CounterRNG{AK.Threefry, UInt16}
-    @test_throws ArgumentError AK.CounterRNG{AK.SplitMix64, UInt8}(300)
+    @test AK.CounterRNG(0x1; alg=AK.SplitMix64()) isa AK.CounterRNG{AK.SplitMix64}
+    @test AK.CounterRNG(UInt32(0x1); alg=AK.Philox()) isa AK.CounterRNG{AK.Philox}
+    @test AK.CounterRNG(UInt16(123); alg=AK.Threefry()) isa AK.CounterRNG{AK.Threefry}
+    @test AK.CounterRNG(UInt32(300)).seed == UInt64(300)
+    @test_throws ArgumentError AK.CounterRNG(-1)
+
+    Random.seed!(0x1234)
+    expected_auto_seed = rand(UInt64)
+    Random.seed!(0x1234)
+    rng_auto = AK.CounterRNG()
+    @test rng_auto.seed == expected_auto_seed
+    @test rng_auto.alg isa AK.SplitMix64
+
+    xauto1 = array_from_host(zeros(Float32, 1024))
+    xauto2 = array_from_host(zeros(Float32, 1024))
+    AK.rand!(rng_auto, xauto1; prefer_threads, block_size=64)
+    AK.rand!(rng_auto, xauto2; prefer_threads, block_size=257)
+    @test Array(xauto1) == Array(xauto2)
+
+    Random.seed!(0xabcdef)
+    seed1 = rand(UInt64)
+    seed2 = rand(UInt64)
+    ref1 = array_from_host(zeros(Float32, 1024))
+    ref2 = array_from_host(zeros(Float32, 1024))
+    AK.rand!(AK.CounterRNG(seed1; alg=AK.SplitMix64()), ref1; prefer_threads, block_size=64)
+    AK.rand!(AK.CounterRNG(seed2; alg=AK.SplitMix64()), ref2; prefer_threads, block_size=64)
+
+    Random.seed!(0xabcdef)
+    xconv1 = array_from_host(zeros(Float32, 1024))
+    xconv2 = array_from_host(zeros(Float32, 1024))
+    AK.rand!(xconv1; prefer_threads, block_size=64)
+    AK.rand!(xconv2; prefer_threads, block_size=64)
+    @test Array(xconv1) == Array(ref1)
+    @test Array(xconv2) == Array(ref2)
 
     rng_algs = (AK.SplitMix64(), AK.Philox(), AK.Threefry())
+    scalar_types = (UInt32, UInt64, Int32, Int64, Float32, Float64)
 
     for alg in rng_algs
         rng_alg = AK.CounterRNG(0x123456789abcdef; alg)
-        @test AK.rand_uint32(rng_alg, UInt64(0)) == AK.rand_uint32(rng_alg, UInt64(0))
-        @test AK.rand_uint32(rng_alg, UInt64(1)) != AK.rand_uint32(rng_alg, UInt64(0))
+        for U in (UInt32, UInt64)
+            @test AK.rand_uint(rng_alg, UInt64(0), U) == AK.rand_uint(rng_alg, UInt64(0), U)
+            @test AK.rand_uint(rng_alg, UInt64(1), U) != AK.rand_uint(rng_alg, UInt64(0), U)
 
-        vals_alg = [AK.rand_uint32(rng_alg, UInt64(i)) for i in 0:1023]
-        @test length(unique(vals_alg)) == length(vals_alg)
+            vals_alg = [AK.rand_uint(rng_alg, UInt64(i), U) for i in 0:1023]
+            @test length(unique(vals_alg)) > 900
+        end
 
-        x_alg = array_from_host(zeros(Float32, 2048))
-        AK.rand!(rng_alg, x_alg; prefer_threads, block_size=64)
-        @test _is_unit_interval(Array(x_alg))
+        for T in scalar_types
+            x_alg = array_from_host(zeros(T, 2048))
+            AK.rand!(rng_alg, x_alg; prefer_threads, block_size=64)
+            x_alg_h = Array(x_alg)
+            ref_alg = zeros(T, 2048)
+            _rand_fill_reference!(rng_alg, ref_alg)
+            @test x_alg_h == ref_alg
+            if T <: AbstractFloat
+                @test _is_unit_interval(x_alg_h)
+            end
+        end
     end
 
     rng = AK.CounterRNG(0x123456789abcdef)
 
-    @test AK.rand_uint32(rng, UInt64(0)) == AK.rand_uint32(rng, UInt64(0))
-    @test AK.rand_uint32(rng, UInt64(1)) != AK.rand_uint32(rng, UInt64(0))
-    @test AK.rand_uint32(rng, UInt64(17)) != AK.rand_uint32(rng, UInt64(18))
+    @test AK.from_uint(UInt32, UInt32(0xdeadbeef)) == UInt32(0xdeadbeef)
+    @test AK.from_uint(UInt64, UInt64(0x0123456789abcdef)) == UInt64(0x0123456789abcdef)
+    @test AK.from_uint(Int32, UInt32(0xdeadbeef)) == reinterpret(Int32, UInt32(0xdeadbeef))
+    @test AK.from_uint(Int64, UInt64(0x0123456789abcdef)) == reinterpret(Int64, UInt64(0x0123456789abcdef))
+
+    @test AK.rand_uint(rng, UInt64(0), UInt32) == AK.rand_uint(rng, UInt64(0), UInt32)
+    @test AK.rand_uint(rng, UInt64(1), UInt32) != AK.rand_uint(rng, UInt64(0), UInt32)
+    @test AK.rand_uint(rng, UInt64(17), UInt32) != AK.rand_uint(rng, UInt64(18), UInt32)
+    @test AK.rand_uint(rng, UInt64(0), UInt64) == AK.rand_uint(rng, UInt64(0), UInt64)
+    @test AK.rand_uint(rng, UInt64(1), UInt64) != AK.rand_uint(rng, UInt64(0), UInt64)
+    @test AK.rand_uint(rng, UInt64(17), UInt64) != AK.rand_uint(rng, UInt64(18), UInt64)
 
-    vals = [AK.rand_uint32(rng, UInt64(i)) for i in 0:2047]
-    @test length(unique(vals)) == length(vals)
+    vals_u32 = [AK.rand_uint(rng, UInt64(i), UInt32) for i in 0:2047]
+    vals_u64 = [AK.rand_uint(rng, UInt64(i), UInt64) for i in 0:2047]
+    @test length(unique(vals_u32)) > 1800
+    @test length(unique(vals_u64)) > 2000
 
     for u in (
         UInt32(0x00000000),
@@ -57,51 +108,85 @@ end
         @test 0.0f0 <= x < 1.0f0
     end
 
-    lengths = (0, 1, 31, 32, 33, 1024, 1025)
-    for len in lengths
-        x = array_from_host(zeros(Float32, len))
-        AK.rand!(rng, x; prefer_threads, block_size=64)
-        xh = Array(x)
-
-        ref = zeros(Float32, len)
-        _rand_fill_reference!(rng, ref)
+    for u in (
+        UInt64(0x0000000000000000),
+        UInt64(0x0000000000000001),
+        UInt64(0x7fffffffffffffff),
+        UInt64(0x8000000000000000),
+        UInt64(0xffffffffffffffff),
+    )
+        x = AK.uint64_to_unit_float64(u)
+        @test !isnan(x)
+        @test 0.0 <= x < 1.0
+    end
 
-        @test xh == ref
-        @test _is_unit_interval(xh)
+    for T in scalar_types
+        s0 = AK.rand_scalar(rng, UInt64(0), T)
+        s1 = AK.rand_scalar(rng, UInt64(1), T)
+        @test s0 isa T
+        @test s1 isa T
+        @test s0 != s1
+        if T <: AbstractFloat
+            @test zero(T) <= s0 < one(T)
+            @test zero(T) <= s1 < one(T)
+        end
     end
 
-    x1 = array_from_host(zeros(Float32, 4096))
-    x2 = array_from_host(zeros(Float32, 4096))
-    AK.rand!(rng, x1; prefer_threads, block_size=64)
-    AK.rand!(rng, x2; prefer_threads, block_size=257)
-    @test Array(x1) == Array(x2)
+    @test_throws ArgumentError AK.rand_scalar(rng, UInt64(0), UInt16)
 
-    rng2 = AK.CounterRNG(rng.seed + UInt64(1))
-    x3 = array_from_host(zeros(Float32, 4096))
-    AK.rand!(rng2, x3; prefer_threads, block_size=64)
-    @test Array(x3) != Array(x1)
+    lengths = (0, 1, 31, 32, 33, 1024, 1025)
+    for T in scalar_types
+        for len in lengths
+            x = array_from_host(zeros(T, len))
+            AK.rand!(rng, x; prefer_threads, block_size=64)
+            xh = Array(x)
+
+            ref = zeros(T, len)
+            _rand_fill_reference!(rng, ref)
+
+            @test xh == ref
+            if T <: AbstractFloat
+                @test _is_unit_interval(xh)
+            end
+        end
+    end
 
-    xnd = array_from_host(zeros(Float32, 7, 11, 5))
-    AK.rand!(rng, xnd; prefer_threads, block_size=128)
-    xndh = Array(xnd)
-    refnd = zeros(Float32, 7, 11, 5)
-    _rand_fill_reference!(rng, refnd)
-    @test xndh == refnd
+    rng2 = AK.CounterRNG(rng.seed + UInt64(1))
+    for T in scalar_types
+        x1 = array_from_host(zeros(T, 4096))
+        x2 = array_from_host(zeros(T, 4096))
+        AK.rand!(rng, x1; prefer_threads, block_size=64)
+        AK.rand!(rng, x2; prefer_threads, block_size=257)
+        @test Array(x1) == Array(x2)
+
+        x3 = array_from_host(zeros(T, 4096))
+        AK.rand!(rng2, x3; prefer_threads, block_size=64)
+        @test Array(x3) != Array(x1)
+
+        xnd = array_from_host(zeros(T, 7, 11, 5))
+        AK.rand!(rng, xnd; prefer_threads, block_size=128)
+        xndh = Array(xnd)
+        refnd = zeros(T, 7, 11, 5)
+        _rand_fill_reference!(rng, refnd)
+        @test xndh == refnd
+    end
 
     if IS_CPU_BACKEND
-        base = zeros(Float32, 64)
-        view_x = @view base[2:2:end]
-        AK.rand!(
-            rng,
-            view_x;
-            max_tasks=Threads.nthreads(),
-            min_elems=1,
-            prefer_threads=true,
-        )
-
-        ref_view = zeros(Float32, length(view_x))
-        _rand_fill_reference!(rng, ref_view)
-        @test collect(view_x) == ref_view
+        for T in scalar_types
+            base = zeros(T, 64)
+            view_x = @view base[2:2:end]
+            AK.rand!(
+                rng,
+                view_x;
+                max_tasks=Threads.nthreads(),
+                min_elems=1,
+                prefer_threads=true,
+            )
+
+            ref_view = zeros(T, length(view_x))
+            _rand_fill_reference!(rng, ref_view)
+            @test collect(view_x) == ref_view
+        end
     end
 
     nstats = 200_000
@@ -127,6 +212,7 @@ end
     max_rel_dev = maximum(abs(c - expected) / expected for c in counts)
     @test max_rel_dev < 0.1
 
-    x64 = array_from_host(zeros(Float64, 16))
-    @test_throws MethodError AK.rand!(rng, x64; prefer_threads)
+    x16 = array_from_host(zeros(UInt16, 16))
+    @test_throws ArgumentError AK.rand!(x16; prefer_threads)
+    @test_throws ArgumentError AK.rand!(rng, x16; prefer_threads)
 end

From f590d5eabcb5a616776cb13913c3b72eba2e1428 Mon Sep 17 00:00:00 2001
From: fjbarter <fjbarter@outlook.com>
Date: Mon, 16 Mar 2026 17:42:43 +0000
Subject: [PATCH 03/18] just use foreachindex for the kernel now that it is
 simple enough

---
 src/rand/rand.jl      | 86 +++++++++----------------------------------
 src/rand/utilities.jl | 27 ++++++++++----
 2 files changed, 36 insertions(+), 77 deletions(-)

diff --git a/src/rand/rand.jl b/src/rand/rand.jl
index 0e29452..23ef801 100644
--- a/src/rand/rand.jl
+++ b/src/rand/rand.jl
@@ -23,6 +23,13 @@ value is a pure function of:
 The default algorithm is `Philox()`.
 
 `seed` may be any non-negative `Integer`. It is normalised to `UInt64` internally.
+
+Constructors:
+- `CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox())`
+  Uses an explicit non-negative seed.
+- `CounterRNG(; alg::CounterRNGAlgorithm=SplitMix64())`
+  Auto-seeds once using `rand(UInt64)`. Reusing the same `CounterRNG` instance is deterministic
+  for fixed seed, algorithm, array shape, and eltype.
 """
 struct CounterRNG{A <: CounterRNGAlgorithm} <: AbstractCounterRNG
     seed::UInt64
@@ -30,26 +37,12 @@ struct CounterRNG{A <: CounterRNGAlgorithm} <: AbstractCounterRNG
 end
 
 
-function CounterRNG(seed::Unsigned; alg::CounterRNGAlgorithm=Philox())
-    CounterRNG(UInt64(seed), alg)
-end
-
-
 function CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox())
-    @argcheck seed >= 0
+    @argcheck seed >= 0 "Seed must be a positive integer"
     CounterRNG(UInt64(seed), alg)
 end
 
 
-
-"""
-    CounterRNG(; alg::CounterRNGAlgorithm=SplitMix64())
-
-Create a stateless counter-based RNG with an automatically generated seed.
-
-The seed is sampled exactly once at construction using `rand(UInt64)`. Reusing this same
-`CounterRNG` instance is deterministic for fixed seed, algorithm, array shape, and eltype.
-"""
 function CounterRNG(; alg::CounterRNGAlgorithm=SplitMix64())
     CounterRNG(Base.rand(UInt64); alg)
 end
@@ -65,55 +58,6 @@ include("splitmix64.jl")
 include("philox.jl")
 include("threefry.jl")
 
-
-
-
-
-function _rand_fill_threads!(
-    rng::AbstractCounterRNG,
-    x::AbstractArray{T};
-    max_tasks::Int,
-    min_elems::Int,
-) where {T <: ALLOWED_RAND_SCALARS}
-    task_partition(length(x), max_tasks, min_elems) do irange
-        @inbounds for i in irange
-            counter = _counter_from_index(i)
-            x[i] = rand_scalar(rng, counter, T)
-        end
-    end
-    return x
-end
-
-
-@kernel inbounds=true cpu=false unsafe_indices=true function _rand_fill_kernel!(
-    rng,
-    x,
-)
-    i = @index(Global, Linear)
-    if i <= length(x)
-        counter = _counter_from_index(i)
-        x[i] = rand_scalar(rng, counter, eltype(x))
-    end
-end
-
-
-function _rand_fill_gpu!(
-    rng::AbstractCounterRNG,
-    x::AbstractArray{T},
-    backend::Backend;
-    block_size::Int,
-) where {T <: ALLOWED_RAND_SCALARS}
-    @argcheck block_size > 0
-    len = length(x)
-    len == 0 && return x
-
-    blocks = div(len, block_size, RoundUp)
-    kernel! = _rand_fill_kernel!(backend, block_size)
-    kernel!(rng, x, ndrange=(blocks * block_size,))
-    return x
-end
-
-
 """
     rand!(
         rng::AbstractCounterRNG,
@@ -160,12 +104,16 @@ function rand!(
 ) where T
 
     @argcheck T <: ALLOWED_RAND_SCALARS "Unsupported eltype $T. Supported: $(ALLOWED_RAND_SCALARS)"
-
-    if use_gpu_algorithm(backend, prefer_threads)
-        return _rand_fill_gpu!(rng, x, backend; block_size)
-    else
-        return _rand_fill_threads!(rng, x; max_tasks, min_elems)
+    foreachindex(
+        1:length(x), backend;
+        max_tasks,
+        min_elems,
+        prefer_threads,
+        block_size,
+    ) do i
+        @inbounds x[i] = rand_scalar(rng, _counter_from_index(i), T)
     end
+    return x
 end
 
 
diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl
index 1f27c95..c22a863 100644
--- a/src/rand/utilities.jl
+++ b/src/rand/utilities.jl
@@ -27,11 +27,6 @@ const ALLOWED_RAND_SCALARS = Union{
 }
 
 
-@inline function rand_uint(::AbstractCounterRNG, ::UInt64, ::Type{U})::U where {U <: Union{UInt32, UInt64}}
-    throw(ArgumentError("No rand_uint implementation for this RNG type"))
-end
-
-
 @inline raw_uint_type(::Type{UInt32}) = UInt32
 @inline raw_uint_type(::Type{Int32}) = UInt32
 @inline raw_uint_type(::Type{Float32}) = UInt32
@@ -48,9 +43,24 @@ end
 @inline from_uint(::Type{Float64}, u::UInt64)::Float64 = uint64_to_unit_float64(u)
 
 
-@inline function rand_scalar(rng::AbstractCounterRNG, counter::UInt64, ::Type{T})::T where {T <: ALLOWED_RAND_SCALARS}
-    U = raw_uint_type(T)
-    u = rand_uint(rng, counter, U)
+@inline function rand_uint(
+    rng::AbstractCounterRNG,
+    ::UInt64,
+    ::Type{UIntType}
+)::UIntType where {UIntType <: Union{UInt32, UInt64}}
+    throw(ArgumentError("No rand_uint implementation for RNG: $rng"))
+end
+
+
+@inline function rand_scalar(
+    rng::AbstractCounterRNG,
+    counter::UInt64,
+    ::Type{T}
+)::T where {T <: ALLOWED_RAND_SCALARS}
+
+    UIntType = raw_uint_type(T)
+    u = rand_uint(rng, counter, UIntType)
+
     return from_uint(T, u)
 end
 
@@ -63,6 +73,7 @@ end
 
 
 
+
 """
     uint32_to_unit_float32(u::UInt32) -> Float32
 

From 9e471d6c3144704e036bd98bf16705181426b85a Mon Sep 17 00:00:00 2001
From: fjbarter <fjbarter@outlook.com>
Date: Mon, 16 Mar 2026 18:34:24 +0000
Subject: [PATCH 04/18] finalise tests and docs

---
 docs/make.jl           |   1 +
 docs/src/api/rand.md   |  48 ++++++
 src/rand/philox.jl     |  15 +-
 src/rand/rand.jl       |  18 +-
 src/rand/splitmix64.jl |   8 +-
 src/rand/threefry.jl   |  14 +-
 src/rand/utilities.jl  |  36 ++--
 test/rand.jl           | 362 ++++++++++++++++++++---------------------
 8 files changed, 256 insertions(+), 246 deletions(-)
 create mode 100644 docs/src/api/rand.md

diff --git a/docs/make.jl b/docs/make.jl
index 58b4632..5e794a2 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -23,6 +23,7 @@ makedocs(;
             "Using Different Backends" => "api/using_backends.md",
             "General Loops" => "api/foreachindex.md",
             "Map" => "api/map.md",
+            "Random Number Generation" => "api/rand.md",
             "Sorting" => "api/sort.md",
             "Reduce" => "api/reduce.md",
             "MapReduce" => "api/mapreduce.md",
diff --git a/docs/src/api/rand.md b/docs/src/api/rand.md
new file mode 100644
index 0000000..2fa468f
--- /dev/null
+++ b/docs/src/api/rand.md
@@ -0,0 +1,48 @@
+### Random Number Generation
+
+Counter-based random generation for CPU and GPU backends with deterministic behavior for fixed
+`seed`, algorithm, array shape, and eltype.
+
+Use an explicit `CounterRNG(seed; alg=...)` when reproducibility matters. For convenience,
+`AK.rand!(x)` creates a fresh `CounterRNG()` on each call, seeded from a single
+`Base.rand(UInt64)` draw, so repeated calls usually produce different outputs.
+
+Supported output element types:
+- `UInt32`, `UInt64`
+- `Int32`, `Int64`
+- `Float32`, `Float64`
+
+The core generator produces an unsigned integer (`UInt32` or `UInt64`) matching the width of the
+requested element type. Those bits are then converted according to the element type:
+- Unsigned integers: returned as-is.
+- Signed integers: reinterpreted as the signed integer with the same bit pattern.
+- Floats: the high bits fill the mantissa, giving a uniform grid in `[0, 1)` ([read more](https://lomont.org/posts/2017/unit-random/)).
+
+Algorithms currently available:
+- `SplitMix64` ([read more](https://rosettacode.org/wiki/Pseudo-random_numbers/Splitmix64))
+- `Philox` ([read more](https://www.thesalmons.org/john/random123/papers/random123sc11.pdf))
+- `Threefry` ([read more](https://www.thesalmons.org/john/random123/papers/random123sc11.pdf))
+
+`Philox` is the default algorithm for `CounterRNG`, as it is more thoroughly statistically
+tested and performs on par with `CUDA.rand!` and `SplitMix64`: ~390 GB/s measured on an RTX 5060
+(advertised 448 GB/s), i.e. effectively memory-bound throughput.
+
+Examples:
+```julia
+import AcceleratedKernels as AK
+using oneAPI
+
+# Reproducible
+rng = AK.CounterRNG(0x12345678; alg=AK.Philox())
+x = oneArray{Float32}(undef, 1024)
+AK.rand!(rng, x)
+
+# Convenience (fresh auto-seeded RNG on each call)
+y = oneArray{Float32}(undef, 1024)
+AK.rand!(y)
+```
+
+```@docs
+AcceleratedKernels.CounterRNG
+AcceleratedKernels.rand!
+```
diff --git a/src/rand/philox.jl b/src/rand/philox.jl
index 8137f64..bdebc8e 100644
--- a/src/rand/philox.jl
+++ b/src/rand/philox.jl
@@ -7,7 +7,6 @@ const PHILOX_W0 = UInt32(0x9E3779B9)
 const PHILOX_ROUNDS = 10
 
 
-# Each round destroys x0 with multiplication, addition, and XORs
 @inline function _philox2x32_round(x0::UInt32, x1::UInt32, k0::UInt32)
     lo = PHILOX_M0 * x0
     hi = _mulhi_u32(PHILOX_M0, x0)
@@ -17,9 +16,7 @@ const PHILOX_ROUNDS = 10
 end
 
 
-"""
-    _philox2x32_block(rng::CounterRNG{<:Philox}, counter::UInt64)
-"""
+# Evaluate one Philox block at `counter`, returning two 32-bit lanes `(x0, x1)`
 @inline function _philox2x32_block(
     rng::CounterRNG{<:Philox},
     counter::UInt64,
@@ -40,9 +37,7 @@ end
 end
 
 
-"""
-    rand_uint(rng::CounterRNG{<:Philox}, counter::UInt64, UInt32) -> UInt32
-"""
+# Return lane 0 from the single Philox block at `counter`
 @inline function rand_uint(
     rng::CounterRNG{<:Philox},
     counter::UInt64,
@@ -53,14 +48,12 @@ end
 end
 
 
-"""
-    rand_uint(rng::CounterRNG{<:Philox}, counter::UInt64, UInt64) -> UInt64
-"""
+# Build UInt64 from the two lanes `(x0, x1)` of the same Philox block at `counter`
 @inline function rand_uint(
     rng::CounterRNG{<:Philox},
     counter::UInt64,
     ::Type{UInt64},
 )::UInt64
     x0, x1 = _philox2x32_block(rng, counter)
-    return _u64_from_u32(x0, x1)
+    return _u64_from_u32s(x0, x1)
 end
diff --git a/src/rand/rand.jl b/src/rand/rand.jl
index 23ef801..e62ba96 100644
--- a/src/rand/rand.jl
+++ b/src/rand/rand.jl
@@ -38,12 +38,12 @@ end
 
 
 function CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox())
-    @argcheck seed >= 0 "Seed must be a positive integer"
+    @argcheck seed >= 0 "Seed must be a non-negative integer"
     CounterRNG(UInt64(seed), alg)
 end
 
 
-function CounterRNG(; alg::CounterRNGAlgorithm=SplitMix64())
+function CounterRNG(; alg::CounterRNGAlgorithm=Philox())
     CounterRNG(Base.rand(UInt64); alg)
 end
 
@@ -115,20 +115,6 @@ function rand!(
     end
     return x
 end
-
-
-"""
-    rand!(
-        x::AbstractArray{T},
-        args...;
-        kwargs...,
-    )
-
-Convenience overload that creates a fresh `CounterRNG()` and fills `x`.
-
-Each call to `rand!(x, ...)` auto-seeds a new RNG once using `rand(UInt64)`, so repeated calls
-produce different outputs unless an explicit `CounterRNG` is provided.
-"""
 function rand!(
     x::AbstractArray,
     args...;
diff --git a/src/rand/splitmix64.jl b/src/rand/splitmix64.jl
index 0b94f31..d169b73 100644
--- a/src/rand/splitmix64.jl
+++ b/src/rand/splitmix64.jl
@@ -16,9 +16,7 @@ const SPLITMIX64_MIX_B = UInt64(0x94d049bb133111eb)
 end
 
 
-"""
-    rand_uint(rng::CounterRNG{<:SplitMix64}, counter::UInt64, UInt64) -> UInt64
-"""
+# Natural SplitMix64 output path: compute 64 random bits directly from one counter
 @inline function rand_uint(
     rng::CounterRNG{<:SplitMix64},
     counter::UInt64,
@@ -29,9 +27,7 @@ end
 end
 
 
-"""
-    rand_uint(rng::CounterRNG{<:SplitMix64}, counter::UInt64, UInt32) -> UInt32
-"""
+# UInt32 path is derived from the high 32 bits of the UInt64 SplitMix output
 @inline function rand_uint(
     rng::CounterRNG{<:SplitMix64},
     counter::UInt64,
diff --git a/src/rand/threefry.jl b/src/rand/threefry.jl
index 4b1e5af..7326f40 100644
--- a/src/rand/threefry.jl
+++ b/src/rand/threefry.jl
@@ -16,9 +16,7 @@ const THREEFRY_ROUNDS = 20
 end
 
 
-"""
-    _threefry2x32_block(rng::CounterRNG{<:Threefry}, counter::UInt64)
-"""
+# Evaluate one Threefry block at `counter`, returning two 32-bit lanes `(x0, x1)`
 @inline function _threefry2x32_block(
     rng::CounterRNG{<:Threefry},
     counter::UInt64,
@@ -52,9 +50,7 @@ end
 end
 
 
-"""
-    rand_uint(rng::CounterRNG{<:Threefry}, counter::UInt64, UInt32) -> UInt32
-"""
+# Return lane 0 from the single Threefry block at `counter`
 @inline function rand_uint(
     rng::CounterRNG{<:Threefry},
     counter::UInt64,
@@ -65,14 +61,12 @@ end
 end
 
 
-"""
-    rand_uint(rng::CounterRNG{<:Threefry}, counter::UInt64, UInt64) -> UInt64
-"""
+# Build UInt64 from the two lanes `(x0, x1)` of the same Threefry block at `counter`
 @inline function rand_uint(
     rng::CounterRNG{<:Threefry},
     counter::UInt64,
     ::Type{UInt64},
 )::UInt64
     x0, x1 = _threefry2x32_block(rng, counter)
-    return _u64_from_u32(x0, x1)
+    return _u64_from_u32s(x0, x1)
 end
diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl
index c22a863..2623c9d 100644
--- a/src/rand/utilities.jl
+++ b/src/rand/utilities.jl
@@ -1,25 +1,19 @@
 # lo: rightmost 32 bits, hi: leftmost 32 bits
 @inline _u32_lo(x::UInt64)::UInt32 = UInt32(x & UInt64(0xffffffff))
 @inline _u32_hi(x::UInt64)::UInt32 = UInt32(x >> 32)
-@inline _u64_from_u32(lo::UInt32, hi::UInt32)::UInt64 = (UInt64(hi) << 32) | UInt64(lo)
+@inline _u64_from_u32s(lo::UInt32, hi::UInt32)::UInt64 = (UInt64(hi) << 32) | UInt64(lo)
 
 # leftmost 32 bits of a*b cast to UInt64s
 @inline _mulhi_u32(a::UInt32, b::UInt32)::UInt32 = UInt32((UInt64(a) * UInt64(b)) >> 32)
 
-
-@inline function _rotl32(x::UInt32, r::UInt32)::UInt32
-    return (x << r) | (x >> (UInt32(32) - r))
-end
+# 32-bit rotate left by r positions
+@inline _rotl32(x::UInt32, r::UInt32)::UInt32 = (x << r) | (x >> (UInt32(32) - r))
 
 
 @inline _counter_from_index(i)::UInt64 = UInt64(i - one(i))
 
 
-"""
-    ALLOWED_RAND_SCALARS
-
-Internal scalar eltypes currently supported by [`rand!`](@ref).
-"""
+# Internal scalar eltypes currently supported by rand!.
 const ALLOWED_RAND_SCALARS = Union{
     UInt32, UInt64,
     Int32, Int64,
@@ -43,6 +37,10 @@ const ALLOWED_RAND_SCALARS = Union{
 @inline from_uint(::Type{Float64}, u::UInt64)::Float64 = uint64_to_unit_float64(u)
 
 
+#=
+Every RNG algorithm implements rand_uint(rng, counter, UInt32/UInt64).
+This fallback provides a clear failure for unsupported RNG types.
+=#
 @inline function rand_uint(
     rng::AbstractCounterRNG,
     ::UInt64,
@@ -52,6 +50,12 @@ const ALLOWED_RAND_SCALARS = Union{
 end
 
 
+#=
+Shared scalar generation:
+1) map requested scalar type to corresponding raw UInt width
+2) fill the UInt with random bits
+3) convert bits into requested scalar representation
+=#
 @inline function rand_scalar(
     rng::AbstractCounterRNG,
     counter::UInt64,
@@ -74,11 +78,7 @@ end
 
 
 
-"""
-    uint32_to_unit_float32(u::UInt32) -> Float32
-
-Convert a random `UInt32` to `Float32` in `[0, 1)` by mantissa construction.
-"""
+# Convert random UInt32 bits to Float32 in [0, 1) by mantissa construction.
 @inline function uint32_to_unit_float32(u::UInt32)::Float32
     # Keep 23 random bits for the mantissa (drop 9 rightmost bits from the UInt32)
     # and combine with the bit pattern of 1.0f0 (sign=0, exponent=127).
@@ -89,11 +89,7 @@ Convert a random `UInt32` to `Float32` in `[0, 1)` by mantissa construction.
 end
 
 
-"""
-    uint64_to_unit_float64(u::UInt64) -> Float64
-
-Convert a random `UInt64` to `Float64` in `[0, 1)` by mantissa construction.
-"""
+# Convert random UInt64 bits to Float64 in [0, 1) by mantissa construction.
 @inline function uint64_to_unit_float64(u::UInt64)::Float64
     # Keep 52 random bits for the mantissa (drop 12 rightmost bits from the UInt64)
     # and combine with the bit pattern of 1.0 (sign=0, exponent=1023).
diff --git a/test/rand.jl b/test/rand.jl
index 339a2cb..e3f0bb6 100644
--- a/test/rand.jl
+++ b/test/rand.jl
@@ -1,218 +1,214 @@
-function _is_unit_interval(v)
-    for x in v
-        if isnan(x) || x < zero(x) || x >= one(x)
-            return false
-        end
-    end
-    return true
-end
+const RAND_ALGS = (AK.SplitMix64(), AK.Philox(), AK.Threefry())
+const RAND_SCALAR_TYPES = (UInt32, UInt64, Int32, Int64, Float32, Float64)
+
+
+_is_unit_interval(v) = all(x -> !isnan(x) && zero(x) <= x < one(x), v)
+
 
 function _rand_fill_reference!(rng, x::AbstractArray{T}) where {T <: AK.ALLOWED_RAND_SCALARS}
     @inbounds for i in eachindex(x)
-        counter = UInt64(i - one(i))
-        x[i] = AK.rand_scalar(rng, counter, T)
+        x[i] = AK.rand_scalar(rng, UInt64(i - one(i)), T)
     end
     return x
 end
 
-@testset "rand" begin
-    @test AK.CounterRNG(0x1; alg=AK.SplitMix64()) isa AK.CounterRNG{AK.SplitMix64}
-    @test AK.CounterRNG(UInt32(0x1); alg=AK.Philox()) isa AK.CounterRNG{AK.Philox}
-    @test AK.CounterRNG(UInt16(123); alg=AK.Threefry()) isa AK.CounterRNG{AK.Threefry}
-    @test AK.CounterRNG(UInt32(300)).seed == UInt64(300)
-    @test_throws ArgumentError AK.CounterRNG(-1)
-
-    Random.seed!(0x1234)
-    expected_auto_seed = rand(UInt64)
-    Random.seed!(0x1234)
-    rng_auto = AK.CounterRNG()
-    @test rng_auto.seed == expected_auto_seed
-    @test rng_auto.alg isa AK.SplitMix64
-
-    xauto1 = array_from_host(zeros(Float32, 1024))
-    xauto2 = array_from_host(zeros(Float32, 1024))
-    AK.rand!(rng_auto, xauto1; prefer_threads, block_size=64)
-    AK.rand!(rng_auto, xauto2; prefer_threads, block_size=257)
-    @test Array(xauto1) == Array(xauto2)
-
-    Random.seed!(0xabcdef)
-    seed1 = rand(UInt64)
-    seed2 = rand(UInt64)
-    ref1 = array_from_host(zeros(Float32, 1024))
-    ref2 = array_from_host(zeros(Float32, 1024))
-    AK.rand!(AK.CounterRNG(seed1; alg=AK.SplitMix64()), ref1; prefer_threads, block_size=64)
-    AK.rand!(AK.CounterRNG(seed2; alg=AK.SplitMix64()), ref2; prefer_threads, block_size=64)
-
-    Random.seed!(0xabcdef)
-    xconv1 = array_from_host(zeros(Float32, 1024))
-    xconv2 = array_from_host(zeros(Float32, 1024))
-    AK.rand!(xconv1; prefer_threads, block_size=64)
-    AK.rand!(xconv2; prefer_threads, block_size=64)
-    @test Array(xconv1) == Array(ref1)
-    @test Array(xconv2) == Array(ref2)
-
-    rng_algs = (AK.SplitMix64(), AK.Philox(), AK.Threefry())
-    scalar_types = (UInt32, UInt64, Int32, Int64, Float32, Float64)
-
-    for alg in rng_algs
-        rng_alg = AK.CounterRNG(0x123456789abcdef; alg)
-        for U in (UInt32, UInt64)
-            @test AK.rand_uint(rng_alg, UInt64(0), U) == AK.rand_uint(rng_alg, UInt64(0), U)
-            @test AK.rand_uint(rng_alg, UInt64(1), U) != AK.rand_uint(rng_alg, UInt64(0), U)
-
-            vals_alg = [AK.rand_uint(rng_alg, UInt64(i), U) for i in 0:1023]
-            @test length(unique(vals_alg)) > 900
-        end
 
-        for T in scalar_types
-            x_alg = array_from_host(zeros(T, 2048))
-            AK.rand!(rng_alg, x_alg; prefer_threads, block_size=64)
-            x_alg_h = Array(x_alg)
-            ref_alg = zeros(T, 2048)
-            _rand_fill_reference!(rng_alg, ref_alg)
-            @test x_alg_h == ref_alg
-            if T <: AbstractFloat
-                @test _is_unit_interval(x_alg_h)
-            end
-        end
-    end
+function _assert_rand_matches_reference!(rng, x; kwargs...)
+    AK.rand!(rng, x; kwargs...)
+    ref = zeros(eltype(x), size(x))
+    _rand_fill_reference!(rng, ref)
+    @test Array(x) == ref
+    return x
+end
 
-    rng = AK.CounterRNG(0x123456789abcdef)
-
-    @test AK.from_uint(UInt32, UInt32(0xdeadbeef)) == UInt32(0xdeadbeef)
-    @test AK.from_uint(UInt64, UInt64(0x0123456789abcdef)) == UInt64(0x0123456789abcdef)
-    @test AK.from_uint(Int32, UInt32(0xdeadbeef)) == reinterpret(Int32, UInt32(0xdeadbeef))
-    @test AK.from_uint(Int64, UInt64(0x0123456789abcdef)) == reinterpret(Int64, UInt64(0x0123456789abcdef))
-
-    @test AK.rand_uint(rng, UInt64(0), UInt32) == AK.rand_uint(rng, UInt64(0), UInt32)
-    @test AK.rand_uint(rng, UInt64(1), UInt32) != AK.rand_uint(rng, UInt64(0), UInt32)
-    @test AK.rand_uint(rng, UInt64(17), UInt32) != AK.rand_uint(rng, UInt64(18), UInt32)
-    @test AK.rand_uint(rng, UInt64(0), UInt64) == AK.rand_uint(rng, UInt64(0), UInt64)
-    @test AK.rand_uint(rng, UInt64(1), UInt64) != AK.rand_uint(rng, UInt64(0), UInt64)
-    @test AK.rand_uint(rng, UInt64(17), UInt64) != AK.rand_uint(rng, UInt64(18), UInt64)
-
-    vals_u32 = [AK.rand_uint(rng, UInt64(i), UInt32) for i in 0:2047]
-    vals_u64 = [AK.rand_uint(rng, UInt64(i), UInt64) for i in 0:2047]
-    @test length(unique(vals_u32)) > 1800
-    @test length(unique(vals_u64)) > 2000
-
-    for u in (
-        UInt32(0x00000000),
-        UInt32(0x00000001),
-        UInt32(0x7fffffff),
-        UInt32(0x80000000),
-        UInt32(0xffffffff),
-    )
-        x = AK.uint32_to_unit_float32(u)
-        @test !isnan(x)
-        @test 0.0f0 <= x < 1.0f0
+
+@testset "rand" begin
+    @testset "constructors" begin
+        @test AK.CounterRNG(0x1; alg=AK.SplitMix64()) isa AK.CounterRNG{AK.SplitMix64}
+        @test AK.CounterRNG(UInt32(0x1); alg=AK.Philox()) isa AK.CounterRNG{AK.Philox}
+        @test AK.CounterRNG(UInt16(123); alg=AK.Threefry()) isa AK.CounterRNG{AK.Threefry}
+        @test AK.CounterRNG(UInt32(300)).seed == UInt64(300)
+        @test_throws ArgumentError AK.CounterRNG(-1)
+
+        Random.seed!(0x1234)
+        expected_seed = rand(UInt64)
+        Random.seed!(0x1234)
+        rng_auto = AK.CounterRNG()
+        @test rng_auto.seed == expected_seed
+        @test rng_auto.alg isa AK.Philox
+
+        x1 = array_from_host(zeros(Float32, 1024))
+        x2 = array_from_host(zeros(Float32, 1024))
+        AK.rand!(rng_auto, x1; prefer_threads, block_size=64)
+        AK.rand!(rng_auto, x2; prefer_threads, block_size=257)
+        @test Array(x1) == Array(x2)
     end
 
-    for u in (
-        UInt64(0x0000000000000000),
-        UInt64(0x0000000000000001),
-        UInt64(0x7fffffffffffffff),
-        UInt64(0x8000000000000000),
-        UInt64(0xffffffffffffffff),
-    )
-        x = AK.uint64_to_unit_float64(u)
-        @test !isnan(x)
-        @test 0.0 <= x < 1.0
+
+    @testset "bit helpers" begin
+        hi = UInt32(0b10101010101010101010101010101010)
+        lo = UInt32(0b01010101010101010101010101010101)
+        word = UInt64(hi) << 32 | UInt64(lo)
+
+        @test AK._u32_hi(word) == hi
+        @test AK._u32_lo(word) == lo
+        @test AK._u64_from_u32s(lo, hi) == word
+        @test AK._mulhi_u32(0xffffffff % UInt32, 0xffffffff % UInt32) == 0xfffffffe % UInt32
+        @test AK._rotl32(0b10000000000000000000000000000001 % UInt32, UInt32(1)) == 0b11 % UInt32
+        @test AK._counter_from_index(1) == UInt64(0)
+        @test AK._counter_from_index(17) == UInt64(16)
+
+        @test AK.raw_uint_type(UInt32) === UInt32
+        @test AK.raw_uint_type(Int32) === UInt32
+        @test AK.raw_uint_type(Float32) === UInt32
+        @test AK.raw_uint_type(UInt64) === UInt64
+        @test AK.raw_uint_type(Int64) === UInt64
+        @test AK.raw_uint_type(Float64) === UInt64
+
+        @test AK.from_uint(UInt32, 0b1010 % UInt32) == 0b1010 % UInt32
+        @test AK.from_uint(UInt64, 0b1010 % UInt64) == 0b1010 % UInt64
+        @test AK.from_uint(Int32, 0b11111111111111111111111111111111 % UInt32) == Int32(-1)
+        @test AK.from_uint(
+            Int64, 0b1111111111111111111111111111111111111111111111111111111111111111 % UInt64
+        ) == Int64(-1)
+
+        @test AK.uint32_to_unit_float32(UInt32(0)) == 0.0f0
+        @test AK.uint64_to_unit_float64(UInt64(0)) == 0.0
+        @test 0.0f0 <= AK.uint32_to_unit_float32(typemax(UInt32)) < 1.0f0
+        @test 0.0 <= AK.uint64_to_unit_float64(typemax(UInt64)) < 1.0
     end
 
-    for T in scalar_types
-        s0 = AK.rand_scalar(rng, UInt64(0), T)
-        s1 = AK.rand_scalar(rng, UInt64(1), T)
-        @test s0 isa T
-        @test s1 isa T
-        @test s0 != s1
-        if T <: AbstractFloat
-            @test zero(T) <= s0 < one(T)
-            @test zero(T) <= s1 < one(T)
+
+    @testset "rand_uint" begin
+        for alg in RAND_ALGS
+            rng = AK.CounterRNG(0x123456789abcdef; alg)
+            for U in (UInt32, UInt64)
+                @test AK.rand_uint(rng, UInt64(0), U) == AK.rand_uint(rng, UInt64(0), U)
+                @test AK.rand_uint(rng, UInt64(1), U) != AK.rand_uint(rng, UInt64(0), U)
+
+                vals = [AK.rand_uint(rng, UInt64(i), U) for i in 0:511]
+                @test length(unique(vals)) > 460
+            end
+        end
+
+        rng_splitmix = AK.CounterRNG(0x31415926; alg=AK.SplitMix64())
+        for c in (UInt64(0), UInt64(1), UInt64(17), UInt64(1023))
+            @test AK.rand_uint(rng_splitmix, c, UInt32) == AK._u32_hi(
+                AK.rand_uint(rng_splitmix, c, UInt64)
+            )
         end
-    end
 
-    @test_throws ArgumentError AK.rand_scalar(rng, UInt64(0), UInt16)
+        for alg in (AK.Philox(), AK.Threefry())
+            rng = AK.CounterRNG(0xabcdef1234567890; alg)
+            for c in (UInt64(0), UInt64(1), UInt64(17), UInt64(1023))
+                @test AK._u32_lo(AK.rand_uint(rng, c, UInt64)) == AK.rand_uint(rng, c, UInt32)
+            end
+        end
+    end
 
-    lengths = (0, 1, 31, 32, 33, 1024, 1025)
-    for T in scalar_types
-        for len in lengths
-            x = array_from_host(zeros(T, len))
-            AK.rand!(rng, x; prefer_threads, block_size=64)
-            xh = Array(x)
 
-            ref = zeros(T, len)
-            _rand_fill_reference!(rng, ref)
+    @testset "rand_scalar" begin
+        rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox())
 
-            @test xh == ref
+        for T in RAND_SCALAR_TYPES
+            s0 = AK.rand_scalar(rng, UInt64(0), T)
+            s1 = AK.rand_scalar(rng, UInt64(1), T)
+            @test s0 isa T
+            @test s1 isa T
+            @test s0 != s1
             if T <: AbstractFloat
-                @test _is_unit_interval(xh)
+                @test zero(T) <= s0 < one(T)
+                @test zero(T) <= s1 < one(T)
             end
         end
+
+        c = UInt64(42)
+        @test AK.rand_scalar(rng, c, Int32) == reinterpret(Int32, AK.rand_uint(rng, c, UInt32))
+        @test AK.rand_scalar(rng, c, Int64) == reinterpret(Int64, AK.rand_uint(rng, c, UInt64))
+        @test AK.rand_scalar(rng, c, Float32) == AK.uint32_to_unit_float32(
+            AK.rand_uint(rng, c, UInt32)
+        )
+        @test AK.rand_scalar(rng, c, Float64) == AK.uint64_to_unit_float64(
+            AK.rand_uint(rng, c, UInt64)
+        )
+        @test_throws ArgumentError AK.rand_scalar(rng, UInt64(0), UInt16)
     end
 
-    rng2 = AK.CounterRNG(rng.seed + UInt64(1))
-    for T in scalar_types
-        x1 = array_from_host(zeros(T, 4096))
-        x2 = array_from_host(zeros(T, 4096))
-        AK.rand!(rng, x1; prefer_threads, block_size=64)
-        AK.rand!(rng, x2; prefer_threads, block_size=257)
-        @test Array(x1) == Array(x2)
 
-        x3 = array_from_host(zeros(T, 4096))
-        AK.rand!(rng2, x3; prefer_threads, block_size=64)
-        @test Array(x3) != Array(x1)
+    @testset "rand! explicit rng" begin
+        lengths = (0, 1, 31, 32, 33, 257, 1024)
+        rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox())
 
-        xnd = array_from_host(zeros(T, 7, 11, 5))
-        AK.rand!(rng, xnd; prefer_threads, block_size=128)
-        xndh = Array(xnd)
-        refnd = zeros(T, 7, 11, 5)
-        _rand_fill_reference!(rng, refnd)
-        @test xndh == refnd
-    end
+        for T in RAND_SCALAR_TYPES
+            for len in lengths
+                x = array_from_host(zeros(T, len))
+                _assert_rand_matches_reference!(rng, x; prefer_threads, block_size=64)
+                if T <: AbstractFloat
+                    @test _is_unit_interval(Array(x))
+                end
+            end
+        end
 
-    if IS_CPU_BACKEND
-        for T in scalar_types
-            base = zeros(T, 64)
-            view_x = @view base[2:2:end]
-            AK.rand!(
-                rng,
-                view_x;
-                max_tasks=Threads.nthreads(),
-                min_elems=1,
-                prefer_threads=true,
-            )
+        for T in RAND_SCALAR_TYPES
+            x1 = array_from_host(zeros(T, 2048))
+            x2 = array_from_host(zeros(T, 2048))
+            AK.rand!(rng, x1; prefer_threads, block_size=64)
+            AK.rand!(rng, x2; prefer_threads, block_size=257)
+            @test Array(x1) == Array(x2)
+        end
+
+        rng2 = AK.CounterRNG(rng.seed + UInt64(1); alg=rng.alg)
+        for T in RAND_SCALAR_TYPES
+            x1 = array_from_host(zeros(T, 2048))
+            x2 = array_from_host(zeros(T, 2048))
+            AK.rand!(rng, x1; prefer_threads, block_size=64)
+            AK.rand!(rng2, x2; prefer_threads, block_size=64)
+            @test Array(x1) != Array(x2)
+        end
 
-            ref_view = zeros(T, length(view_x))
-            _rand_fill_reference!(rng, ref_view)
-            @test collect(view_x) == ref_view
+        for T in (Float32, UInt64)
+            xnd = array_from_host(zeros(T, 7, 11, 5))
+            _assert_rand_matches_reference!(rng, xnd; prefer_threads, block_size=128)
+        end
+
+        if IS_CPU_BACKEND
+            for T in RAND_SCALAR_TYPES
+                base = zeros(T, 64)
+                view_x = @view base[2:2:end]
+                AK.rand!(
+                    rng, view_x;
+                    max_tasks=Threads.nthreads(),
+                    min_elems=1,
+                    prefer_threads=true
+                )
+                ref_view = zeros(T, length(view_x))
+                _rand_fill_reference!(rng, ref_view)
+                @test collect(view_x) == ref_view
+            end
         end
     end
 
-    nstats = 200_000
-    xstats = array_from_host(zeros(Float32, nstats))
-    AK.rand!(rng, xstats; prefer_threads, block_size=256)
-    xh = Array(xstats)
 
-    @test _is_unit_interval(xh)
+    @testset "rand! convenience" begin
+        Random.seed!(0xabcdef)
+        seed1 = rand(UInt64)
+        seed2 = rand(UInt64)
 
-    m = sum(xh) / nstats
-    v = sum((x - m)^2 for x in xh) / nstats
-    @test abs(m - 0.5) < 0.01
-    @test abs(v - (1 / 12)) < 0.01
+        ref1 = array_from_host(zeros(Float32, 1024))
+        ref2 = array_from_host(zeros(Float32, 1024))
+        AK.rand!(AK.CounterRNG(seed1; alg=AK.Philox()), ref1; prefer_threads, block_size=64)
+        AK.rand!(AK.CounterRNG(seed2; alg=AK.Philox()), ref2; prefer_threads, block_size=64)
 
-    nbins = 16
-    counts = zeros(Int, nbins)
-    for x in xh
-        ibin = Int(floor(x * nbins)) + 1
-        ibin = min(ibin, nbins)
-        counts[ibin] += 1
-    end
-    expected = nstats / nbins
-    max_rel_dev = maximum(abs(c - expected) / expected for c in counts)
-    @test max_rel_dev < 0.1
+        Random.seed!(0xabcdef)
+        x1 = array_from_host(zeros(Float32, 1024))
+        x2 = array_from_host(zeros(Float32, 1024))
+        AK.rand!(x1; prefer_threads, block_size=64)
+        AK.rand!(x2; prefer_threads, block_size=64)
+        @test Array(x1) == Array(ref1)
+        @test Array(x2) == Array(ref2)
 
-    x16 = array_from_host(zeros(UInt16, 16))
-    @test_throws ArgumentError AK.rand!(x16; prefer_threads)
-    @test_throws ArgumentError AK.rand!(rng, x16; prefer_threads)
+        x_bad = array_from_host(zeros(UInt16, 16))
+        @test_throws ArgumentError AK.rand!(x_bad; prefer_threads)
+        @test_throws ArgumentError AK.rand!(AK.CounterRNG(0x1), x_bad; prefer_threads)
+    end
 end

From 65caf10021c371500fc785e927da4ef00ee645f8 Mon Sep 17 00:00:00 2001
From: fjbarter <fjbarter@outlook.com>
Date: Tue, 17 Mar 2026 00:16:49 +0000
Subject: [PATCH 05/18] ensure determinism in convenience tests by avoiding race
 causing intermittent test failure. fence Float64 tests to CPU-only to avoid
 breaking CI on Metal and oneAPI

---
 Project.toml              |  4 +++-
 src/AcceleratedKernels.jl |  1 +
 src/rand/rand.jl          | 16 ++++++-------
 test/rand.jl              | 48 +++++++++++++++++++++++----------------
 4 files changed, 41 insertions(+), 28 deletions(-)

diff --git a/Project.toml b/Project.toml
index 2fccea8..60ddf3a 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,13 +1,14 @@
 name = "AcceleratedKernels"
 uuid = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
-authors = ["Andrei-Leonard Nicusan <leonard@evophase.co.uk> and contributors"]
 version = "0.4.3"
+authors = ["Andrei-Leonard Nicusan <leonard@evophase.co.uk> and contributors"]
 
 [deps]
 ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f"
 
 [weakdeps]
@@ -21,6 +22,7 @@ ArgCheck = "2"
 GPUArraysCore = "0.2.0"
 KernelAbstractions = "0.9.34"
 Markdown = "1"
+Random = "1.11.0"
 UnsafeAtomics = "0.3.0"
 julia = "1.10"
 oneAPI = "1, 2"
diff --git a/src/AcceleratedKernels.jl b/src/AcceleratedKernels.jl
index 06a401a..7262cbe 100644
--- a/src/AcceleratedKernels.jl
+++ b/src/AcceleratedKernels.jl
@@ -15,6 +15,7 @@ using ArgCheck: @argcheck
 using GPUArraysCore: AnyGPUArray, @allowscalar
 using KernelAbstractions
 using KernelAbstractions: @context
+using Random
 import UnsafeAtomics
 
 
diff --git a/src/rand/rand.jl b/src/rand/rand.jl
index e62ba96..20a2fe8 100644
--- a/src/rand/rand.jl
+++ b/src/rand/rand.jl
@@ -27,8 +27,8 @@ The default algorithm is `Philox()`.
 Constructors:
 - `CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox())`
   Uses an explicit non-negative seed.
-- `CounterRNG(; alg::CounterRNGAlgorithm=SplitMix64())`
-  Auto-seeds once using `rand(UInt64)`. Reusing the same `CounterRNG` instance is deterministic
+- `CounterRNG(; alg::CounterRNGAlgorithm=Philox())`
+  Auto-seeds once using `Random.rand(Random.default_rng(), UInt64)`. Reusing the same `CounterRNG` instance is deterministic
   for fixed seed, algorithm, array shape, and eltype.
 """
 struct CounterRNG{A <: CounterRNGAlgorithm} <: AbstractCounterRNG
@@ -44,7 +44,7 @@ end
 
 
 function CounterRNG(; alg::CounterRNGAlgorithm=Philox())
-    CounterRNG(Base.rand(UInt64); alg)
+    CounterRNG(Random.rand(Random.default_rng(), UInt64); alg)
 end
 
 
@@ -58,6 +58,9 @@ include("splitmix64.jl")
 include("philox.jl")
 include("threefry.jl")
 
+
+
+
 """
     rand!(
         rng::AbstractCounterRNG,
@@ -83,11 +86,6 @@ Supported scalar element types are:
 - `Int32`, `Int64`
 - `Float32`, `Float64`
 
-Semantics:
-- Unsigned integers: raw random bit patterns of requested width.
-- Signed integers: corresponding unsigned patterns reinterpreted as signed.
-- Floats: mantissa-based conversion from `UInt32`/`UInt64` into `[0, 1)`, uniform over the
-  produced mantissa grid (not over all representable floats).
 """
 function rand!(
     rng::AbstractCounterRNG,
@@ -115,6 +113,8 @@ function rand!(
     end
     return x
 end
+
+
 function rand!(
     x::AbstractArray,
     args...;
diff --git a/test/rand.jl b/test/rand.jl
index e3f0bb6..1084637 100644
--- a/test/rand.jl
+++ b/test/rand.jl
@@ -1,5 +1,9 @@
 const RAND_ALGS = (AK.SplitMix64(), AK.Philox(), AK.Threefry())
-const RAND_SCALAR_TYPES = (UInt32, UInt64, Int32, Int64, Float32, Float64)
+const RAND_SCALAR_TYPES_ALL = (UInt32, UInt64, Int32, Int64, Float32, Float64)
+const RAND_SCALAR_TYPES_BACKEND = IS_CPU_BACKEND ?
+                                  RAND_SCALAR_TYPES_ALL :
+                                  (UInt32, UInt64, Int32, Int64, Float32)
+const RUN_FLOAT64_RAND_TESTS = IS_CPU_BACKEND
 
 
 _is_unit_interval(v) = all(x -> !isnan(x) && zero(x) <= x < one(x), v)
@@ -31,7 +35,7 @@ end
         @test_throws ArgumentError AK.CounterRNG(-1)
 
         Random.seed!(0x1234)
-        expected_seed = rand(UInt64)
+        expected_seed = Random.rand(Random.default_rng(), UInt64)
         Random.seed!(0x1234)
         rng_auto = AK.CounterRNG()
         @test rng_auto.seed == expected_seed
@@ -63,7 +67,9 @@ end
         @test AK.raw_uint_type(Float32) === UInt32
         @test AK.raw_uint_type(UInt64) === UInt64
         @test AK.raw_uint_type(Int64) === UInt64
-        @test AK.raw_uint_type(Float64) === UInt64
+        if RUN_FLOAT64_RAND_TESTS
+            @test AK.raw_uint_type(Float64) === UInt64
+        end
 
         @test AK.from_uint(UInt32, 0b1010 % UInt32) == 0b1010 % UInt32
         @test AK.from_uint(UInt64, 0b1010 % UInt64) == 0b1010 % UInt64
@@ -73,9 +79,11 @@ end
         ) == Int64(-1)
 
         @test AK.uint32_to_unit_float32(UInt32(0)) == 0.0f0
-        @test AK.uint64_to_unit_float64(UInt64(0)) == 0.0
         @test 0.0f0 <= AK.uint32_to_unit_float32(typemax(UInt32)) < 1.0f0
-        @test 0.0 <= AK.uint64_to_unit_float64(typemax(UInt64)) < 1.0
+        if RUN_FLOAT64_RAND_TESTS
+            @test AK.uint64_to_unit_float64(UInt64(0)) == 0.0
+            @test 0.0 <= AK.uint64_to_unit_float64(typemax(UInt64)) < 1.0
+        end
     end
 
 
@@ -110,7 +118,7 @@ end
     @testset "rand_scalar" begin
         rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox())
 
-        for T in RAND_SCALAR_TYPES
+        for T in RAND_SCALAR_TYPES_BACKEND
             s0 = AK.rand_scalar(rng, UInt64(0), T)
             s1 = AK.rand_scalar(rng, UInt64(1), T)
             @test s0 isa T
@@ -128,9 +136,11 @@ end
         @test AK.rand_scalar(rng, c, Float32) == AK.uint32_to_unit_float32(
             AK.rand_uint(rng, c, UInt32)
         )
-        @test AK.rand_scalar(rng, c, Float64) == AK.uint64_to_unit_float64(
-            AK.rand_uint(rng, c, UInt64)
-        )
+        if RUN_FLOAT64_RAND_TESTS
+            @test AK.rand_scalar(rng, c, Float64) == AK.uint64_to_unit_float64(
+                AK.rand_uint(rng, c, UInt64)
+            )
+        end
         @test_throws ArgumentError AK.rand_scalar(rng, UInt64(0), UInt16)
     end
 
@@ -139,7 +149,7 @@ end
         lengths = (0, 1, 31, 32, 33, 257, 1024)
         rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox())
 
-        for T in RAND_SCALAR_TYPES
+        for T in RAND_SCALAR_TYPES_BACKEND
             for len in lengths
                 x = array_from_host(zeros(T, len))
                 _assert_rand_matches_reference!(rng, x; prefer_threads, block_size=64)
@@ -149,7 +159,7 @@ end
             end
         end
 
-        for T in RAND_SCALAR_TYPES
+        for T in RAND_SCALAR_TYPES_BACKEND
             x1 = array_from_host(zeros(T, 2048))
             x2 = array_from_host(zeros(T, 2048))
             AK.rand!(rng, x1; prefer_threads, block_size=64)
@@ -158,7 +168,7 @@ end
         end
 
         rng2 = AK.CounterRNG(rng.seed + UInt64(1); alg=rng.alg)
-        for T in RAND_SCALAR_TYPES
+        for T in RAND_SCALAR_TYPES_BACKEND
             x1 = array_from_host(zeros(T, 2048))
             x2 = array_from_host(zeros(T, 2048))
             AK.rand!(rng, x1; prefer_threads, block_size=64)
@@ -172,7 +182,7 @@ end
         end
 
         if IS_CPU_BACKEND
-            for T in RAND_SCALAR_TYPES
+            for T in RAND_SCALAR_TYPES_BACKEND
                 base = zeros(T, 64)
                 view_x = @view base[2:2:end]
                 AK.rand!(
@@ -190,18 +200,18 @@ end
 
 
     @testset "rand! convenience" begin
-        Random.seed!(0xabcdef)
-        seed1 = rand(UInt64)
-        seed2 = rand(UInt64)
-
         ref1 = array_from_host(zeros(Float32, 1024))
         ref2 = array_from_host(zeros(Float32, 1024))
+        x1 = array_from_host(zeros(Float32, 1024))
+        x2 = array_from_host(zeros(Float32, 1024))
+
+        Random.seed!(0xabcdef)
+        seed1 = Random.rand(Random.default_rng(), UInt64)
         AK.rand!(AK.CounterRNG(seed1; alg=AK.Philox()), ref1; prefer_threads, block_size=64)
+        seed2 = Random.rand(Random.default_rng(), UInt64)
         AK.rand!(AK.CounterRNG(seed2; alg=AK.Philox()), ref2; prefer_threads, block_size=64)
 
         Random.seed!(0xabcdef)
-        x1 = array_from_host(zeros(Float32, 1024))
-        x2 = array_from_host(zeros(Float32, 1024))
         AK.rand!(x1; prefer_threads, block_size=64)
         AK.rand!(x2; prefer_threads, block_size=64)
         @test Array(x1) == Array(ref1)

From 3d1e0c05b2e6c5f6ddc2008ba3de5e1c6de37453 Mon Sep 17 00:00:00 2001
From: fjbarter <fjbarter@outlook.com>
Date: Tue, 17 Mar 2026 00:25:28 +0000
Subject: [PATCH 06/18] fix Random dep for use with julia v1.10

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 60ddf3a..4a3a6b3 100644
--- a/Project.toml
+++ b/Project.toml
@@ -22,7 +22,7 @@ ArgCheck = "2"
 GPUArraysCore = "0.2.0"
 KernelAbstractions = "0.9.34"
 Markdown = "1"
-Random = "1.11.0"
+Random = "1"
 UnsafeAtomics = "0.3.0"
 julia = "1.10"
 oneAPI = "1, 2"

From 48cd247b86a55b5fc4c896c151f6bb3f128e47d2 Mon Sep 17 00:00:00 2001
From: fjbarter <fjbarter@outlook.com>
Date: Tue, 17 Mar 2026 21:28:32 +0000
Subject: [PATCH 07/18] add Bool scalar support with isodd() on a UInt32

---
 docs/src/api/rand.md  |  6 ++++--
 src/rand/rand.jl      |  8 ++++++++
 src/rand/utilities.jl |  5 ++++-
 test/rand.jl          | 17 +++++++++++++----
 4 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/docs/src/api/rand.md b/docs/src/api/rand.md
index 2fa468f..d9ffc71 100644
--- a/docs/src/api/rand.md
+++ b/docs/src/api/rand.md
@@ -4,19 +4,21 @@ Counter-based random generation for CPU and GPU backends with deterministic beha
 `seed`, algorithm, array shape, and eltype.
 
 Use an explicit `CounterRNG(seed; alg=...)` when reproducibility matters. For convenience,
-`AK.rand!(x)` creates a fresh `CounterRNG()` on each call using one auto-seeded `Base.rand(UInt64)`
-draw, so repeated calls usually produce different outputs.
+`AK.rand!(x)` creates a fresh `CounterRNG()` on each call using one auto-seeded
+`Base.rand(Random.default_rng(), UInt64)` draw, so repeated calls produce different outputs unless Random.seed!() is used.
 
 Supported output element types:
 - `UInt32`, `UInt64`
 - `Int32`, `Int64`
 - `Float32`, `Float64`
+- `Bool`
 
 The core of the random number generation produces a `UInt` of the requested scalar width.
 That `UInt` is then either:
 - Unsigned integers: returned as-is
 - Signed integers: reinterpreted as a signed integer bit pattern.
 - Floats: mantissa construction into a uniform grid in `[0, 1)` ([read more](https://lomont.org/posts/2017/unit-random/)).
+- Bool: `true` if the `UInt` draw is odd (`isodd(u)`), otherwise `false`.
 
 Algorithms currently available:
 - `SplitMix64` ([read more](https://rosettacode.org/wiki/Pseudo-random_numbers/Splitmix64))
diff --git a/src/rand/rand.jl b/src/rand/rand.jl
index 20a2fe8..fdd7076 100644
--- a/src/rand/rand.jl
+++ b/src/rand/rand.jl
@@ -85,6 +85,14 @@ Supported scalar element types are:
 - `UInt32`, `UInt64`
 - `Int32`, `Int64`
 - `Float32`, `Float64`
+- `Bool`
+
+Semantics:
+- Unsigned integers: raw random bit patterns of requested width.
+- Signed integers: corresponding unsigned patterns reinterpreted as signed.
+- Floats: mantissa-based conversion from `UInt32`/`UInt64` into `[0, 1)`, uniform over the
+  produced mantissa grid (not over all representable floats).
+- Bool: `true` if the raw `UInt` draw is odd (`isodd(u)`), otherwise `false`.
 
 """
 function rand!(
diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl
index 2623c9d..c9f2cc5 100644
--- a/src/rand/utilities.jl
+++ b/src/rand/utilities.jl
@@ -18,6 +18,7 @@ const ALLOWED_RAND_SCALARS = Union{
     UInt32, UInt64,
     Int32, Int64,
     Float32, Float64,
+    Bool
 }
 
 
@@ -27,6 +28,7 @@ const ALLOWED_RAND_SCALARS = Union{
 @inline raw_uint_type(::Type{UInt64}) = UInt64
 @inline raw_uint_type(::Type{Int64}) = UInt64
 @inline raw_uint_type(::Type{Float64}) = UInt64
+@inline raw_uint_type(::Type{Bool}) = UInt32
 
 
 @inline from_uint(::Type{UInt32}, u::UInt32)::UInt32 = u
@@ -35,6 +37,7 @@ const ALLOWED_RAND_SCALARS = Union{
 @inline from_uint(::Type{Int64}, u::UInt64)::Int64 = reinterpret(Int64, u)
 @inline from_uint(::Type{Float32}, u::UInt32)::Float32 = uint32_to_unit_float32(u)
 @inline from_uint(::Type{Float64}, u::UInt64)::Float64 = uint64_to_unit_float64(u)
+@inline from_uint(::Type{Bool}, u::UInt32)::Bool = isodd(u)
 
 
 #=
@@ -71,7 +74,7 @@ end
 
 @inline function rand_scalar(::AbstractCounterRNG, ::UInt64, ::Type{T}) where {T}
     throw(ArgumentError(
-        "Unsupported random scalar type $(T). Supported: UInt32, UInt64, Int32, Int64, Float32, Float64."
+        "Unsupported random scalar type $(T). Supported: $(ALLOWED_RAND_SCALARS)"
     ))
 end
 
diff --git a/test/rand.jl b/test/rand.jl
index 1084637..a9d0c46 100644
--- a/test/rand.jl
+++ b/test/rand.jl
@@ -1,8 +1,8 @@
 const RAND_ALGS = (AK.SplitMix64(), AK.Philox(), AK.Threefry())
-const RAND_SCALAR_TYPES_ALL = (UInt32, UInt64, Int32, Int64, Float32, Float64)
+const RAND_SCALAR_TYPES_ALL = (UInt32, UInt64, Int32, Int64, Float32, Float64, Bool)
 const RAND_SCALAR_TYPES_BACKEND = IS_CPU_BACKEND ?
                                   RAND_SCALAR_TYPES_ALL :
-                                  (UInt32, UInt64, Int32, Int64, Float32)
+                                  (UInt32, UInt64, Int32, Int64, Float32, Bool)
 const RUN_FLOAT64_RAND_TESTS = IS_CPU_BACKEND
 
 
@@ -67,6 +67,7 @@ end
         @test AK.raw_uint_type(Float32) === UInt32
         @test AK.raw_uint_type(UInt64) === UInt64
         @test AK.raw_uint_type(Int64) === UInt64
+        @test AK.raw_uint_type(Bool) === UInt32
         if RUN_FLOAT64_RAND_TESTS
             @test AK.raw_uint_type(Float64) === UInt64
         end
@@ -77,6 +78,8 @@ end
         @test AK.from_uint(
             Int64, 0b1111111111111111111111111111111111111111111111111111111111111111 % UInt64
         ) == Int64(-1)
+        @test AK.from_uint(Bool, UInt32(0)) == false
+        @test AK.from_uint(Bool, UInt32(1)) == true
 
         @test AK.uint32_to_unit_float32(UInt32(0)) == 0.0f0
         @test 0.0f0 <= AK.uint32_to_unit_float32(typemax(UInt32)) < 1.0f0
@@ -123,7 +126,9 @@ end
             s1 = AK.rand_scalar(rng, UInt64(1), T)
             @test s0 isa T
             @test s1 isa T
-            @test s0 != s1
+            if T !== Bool
+                @test s0 != s1
+            end
             if T <: AbstractFloat
                 @test zero(T) <= s0 < one(T)
                 @test zero(T) <= s1 < one(T)
@@ -136,11 +141,15 @@ end
         @test AK.rand_scalar(rng, c, Float32) == AK.uint32_to_unit_float32(
             AK.rand_uint(rng, c, UInt32)
         )
+        @test AK.rand_scalar(rng, c, Bool) == isodd(AK.rand_uint(rng, c, UInt32))
         if RUN_FLOAT64_RAND_TESTS
             @test AK.rand_scalar(rng, c, Float64) == AK.uint64_to_unit_float64(
                 AK.rand_uint(rng, c, UInt64)
             )
         end
+        bools = [AK.rand_scalar(rng, UInt64(i), Bool) for i in 0:511]
+        @test any(identity, bools)
+        @test any(!, bools)
         @test_throws ArgumentError AK.rand_scalar(rng, UInt64(0), UInt16)
     end
 
@@ -176,7 +185,7 @@ end
             @test Array(x1) != Array(x2)
         end
 
-        for T in (Float32, UInt64)
+        for T in (Float32, UInt64, Bool)
             xnd = array_from_host(zeros(T, 7, 11, 5))
             _assert_rand_matches_reference!(rng, xnd; prefer_threads, block_size=128)
         end

From 68e03c3b98f5a703a422a03e0ab2e2e88f1ec53c Mon Sep 17 00:00:00 2001
From: fjbarter <fjbarter@outlook.com>
Date: Tue, 17 Mar 2026 22:04:13 +0000
Subject: [PATCH 08/18] add support for Float16, UInt8, UInt16, Int8, Int16
 (was bored)

---
 Project.toml          |  2 +-
 docs/src/api/rand.md  | 16 ++++++++--------
 src/rand/rand.jl      |  6 +++---
 src/rand/utilities.jl | 27 ++++++++++++++++++++++++---
 test/rand.jl          | 40 ++++++++++++++++++++++++++++++++++------
 5 files changed, 70 insertions(+), 21 deletions(-)

diff --git a/Project.toml b/Project.toml
index 4a3a6b3..b0c601d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "AcceleratedKernels"
 uuid = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
-version = "0.4.3"
 authors = ["Andrei-Leonard Nicusan <leonard@evophase.co.uk> and contributors"]
+version = "0.4.3"
 
 [deps]
 ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197"
diff --git a/docs/src/api/rand.md b/docs/src/api/rand.md
index d9ffc71..93ed339 100644
--- a/docs/src/api/rand.md
+++ b/docs/src/api/rand.md
@@ -5,18 +5,18 @@ Counter-based random generation for CPU and GPU backends with deterministic beha
 
 Use an explicit `CounterRNG(seed; alg=...)` when reproducibility matters. For convenience,
 `AK.rand!(x)` creates a fresh `CounterRNG()` on each call using one auto-seeded
-`Base.rand(Random.default_rng(), UInt64)` draw, so repeated calls produce different outputs unless Random.seed!() is used.
+`Random.rand(Random.default_rng(), UInt64)` draw, so repeated calls produce different outputs unless Random.seed!() is used.
 
-Supported output element types:
-- `UInt32`, `UInt64`
-- `Int32`, `Int64`
-- `Float32`, `Float64`
+Supported element types:
+- `UInt8`, `UInt16`, `UInt32`, `UInt64`
+- `Int8`, `Int16`, `Int32`, `Int64`
+- `Float16`, `Float32`, `Float64`
 - `Bool`
 
-The core of the random number generation produces a `UInt` of the requested scalar width.
+The core of the random number generation produces either a `UInt32` or `UInt64` depending on the width of the requested element type.
 That `UInt` is then either:
-- Unsigned integers: returned as-is
-- Signed integers: reinterpreted as a signed integer bit pattern.
+- Unsigned integers: returned as-is or truncated if necessary.
+- Signed integers: reinterpreted as a signed integer bit pattern and truncated if necessary.
 - Floats: mantissa construction into a uniform grid in `[0, 1)` ([read more](https://lomont.org/posts/2017/unit-random/)).
 - Bool: `true` if the `UInt` draw is odd (`isodd(u)`), otherwise `false`.
 
diff --git a/src/rand/rand.jl b/src/rand/rand.jl
index fdd7076..dc48f03 100644
--- a/src/rand/rand.jl
+++ b/src/rand/rand.jl
@@ -82,9 +82,9 @@ Fill `x` in-place with pseudo-random values using a stateless counter-based RNG.
 counter is exactly `UInt64(i - 1)` in linear indexing order.
 
 Supported scalar element types are:
-- `UInt32`, `UInt64`
-- `Int32`, `Int64`
-- `Float32`, `Float64`
+- `UInt8`, `UInt16`, `UInt32`, `UInt64`
+- `Int8`, `Int16`, `Int32`, `Int64`
+- `Float16`, `Float32`, `Float64`
 - `Bool`
 
 Semantics:
diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl
index c9f2cc5..3d5dab7 100644
--- a/src/rand/utilities.jl
+++ b/src/rand/utilities.jl
@@ -15,15 +15,20 @@
 
 # Internal scalar eltypes currently supported by rand!.
 const ALLOWED_RAND_SCALARS = Union{
-    UInt32, UInt64,
-    Int32, Int64,
-    Float32, Float64,
+    UInt8, UInt16, UInt32, UInt64,
+    Int8, Int16, Int32, Int64,
+    Float16, Float32, Float64,
     Bool
 }
 
 
+@inline raw_uint_type(::Type{UInt8}) = UInt32
+@inline raw_uint_type(::Type{UInt16}) = UInt32
 @inline raw_uint_type(::Type{UInt32}) = UInt32
+@inline raw_uint_type(::Type{Int8}) = UInt32
+@inline raw_uint_type(::Type{Int16}) = UInt32
 @inline raw_uint_type(::Type{Int32}) = UInt32
+@inline raw_uint_type(::Type{Float16}) = UInt32
 @inline raw_uint_type(::Type{Float32}) = UInt32
 @inline raw_uint_type(::Type{UInt64}) = UInt64
 @inline raw_uint_type(::Type{Int64}) = UInt64
@@ -31,10 +36,15 @@ const ALLOWED_RAND_SCALARS = Union{
 @inline raw_uint_type(::Type{Bool}) = UInt32
 
 
+@inline from_uint(::Type{UInt8}, u::UInt32)::UInt8 = trunc(UInt8, u >> 24)
+@inline from_uint(::Type{UInt16}, u::UInt32)::UInt16 = trunc(UInt16, u >> 16)
 @inline from_uint(::Type{UInt32}, u::UInt32)::UInt32 = u
 @inline from_uint(::Type{UInt64}, u::UInt64)::UInt64 = u
+@inline from_uint(::Type{Int8}, u::UInt32)::Int8 = reinterpret(Int8, trunc(UInt8, u >> 24))
+@inline from_uint(::Type{Int16}, u::UInt32)::Int16 = reinterpret(Int16, trunc(UInt16, u >> 16))
 @inline from_uint(::Type{Int32}, u::UInt32)::Int32 = reinterpret(Int32, u)
 @inline from_uint(::Type{Int64}, u::UInt64)::Int64 = reinterpret(Int64, u)
+@inline from_uint(::Type{Float16}, u::UInt32)::Float16 = uint32_to_unit_float16(u)
 @inline from_uint(::Type{Float32}, u::UInt32)::Float32 = uint32_to_unit_float32(u)
 @inline from_uint(::Type{Float64}, u::UInt64)::Float64 = uint64_to_unit_float64(u)
 @inline from_uint(::Type{Bool}, u::UInt32)::Bool = isodd(u)
@@ -81,6 +91,17 @@ end
 
 
 
+# Convert random UInt32 bits to Float16 in [0, 1) by mantissa construction.
+@inline function uint32_to_unit_float16(u::UInt32)::Float16
+    # Keep 10 random bits for the mantissa (drop 22 rightmost bits from the UInt32)
+    # and combine with the bit pattern of Float16(1.0) (sign=0, exponent=15).
+    bits = UInt16(0x3c00) | UInt16(u >> 22)
+
+    # Interpret as 1.mantissa, then subtract 1 for [0, 1)
+    return reinterpret(Float16, bits) - Float16(1)
+end
+
+
 # Convert random UInt32 bits to Float32 in [0, 1) by mantissa construction.
 @inline function uint32_to_unit_float32(u::UInt32)::Float32
     # Keep 23 random bits for the mantissa (drop 9 rightmost bits from the UInt32)
diff --git a/test/rand.jl b/test/rand.jl
index a9d0c46..9e03e59 100644
--- a/test/rand.jl
+++ b/test/rand.jl
@@ -1,8 +1,13 @@
 const RAND_ALGS = (AK.SplitMix64(), AK.Philox(), AK.Threefry())
-const RAND_SCALAR_TYPES_ALL = (UInt32, UInt64, Int32, Int64, Float32, Float64, Bool)
+const RAND_SCALAR_TYPES_ALL = (
+    UInt8, UInt16, UInt32, UInt64,
+    Int8, Int16, Int32, Int64,
+    Float16, Float32, Float64,
+    Bool,
+)
 const RAND_SCALAR_TYPES_BACKEND = IS_CPU_BACKEND ?
                                   RAND_SCALAR_TYPES_ALL :
-                                  (UInt32, UInt64, Int32, Int64, Float32, Bool)
+                                  (UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float16, Float32, Bool)
 const RUN_FLOAT64_RAND_TESTS = IS_CPU_BACKEND
 
 
@@ -62,8 +67,13 @@ end
         @test AK._counter_from_index(1) == UInt64(0)
         @test AK._counter_from_index(17) == UInt64(16)
 
+        @test AK.raw_uint_type(UInt8) === UInt32
+        @test AK.raw_uint_type(UInt16) === UInt32
         @test AK.raw_uint_type(UInt32) === UInt32
+        @test AK.raw_uint_type(Int8) === UInt32
+        @test AK.raw_uint_type(Int16) === UInt32
         @test AK.raw_uint_type(Int32) === UInt32
+        @test AK.raw_uint_type(Float16) === UInt32
         @test AK.raw_uint_type(Float32) === UInt32
         @test AK.raw_uint_type(UInt64) === UInt64
         @test AK.raw_uint_type(Int64) === UInt64
@@ -72,15 +82,22 @@ end
             @test AK.raw_uint_type(Float64) === UInt64
         end
 
+        @test AK.from_uint(UInt8, UInt32(0xabcdef01)) == UInt8(0xab)
+        @test AK.from_uint(UInt16, UInt32(0xabcdef01)) == UInt16(0xabcd)
         @test AK.from_uint(UInt32, 0b1010 % UInt32) == 0b1010 % UInt32
         @test AK.from_uint(UInt64, 0b1010 % UInt64) == 0b1010 % UInt64
+        @test AK.from_uint(Int8, UInt32(0xff000000)) == Int8(-1)
+        @test AK.from_uint(Int16, UInt32(0xffff0000)) == Int16(-1)
         @test AK.from_uint(Int32, 0b11111111111111111111111111111111 % UInt32) == Int32(-1)
         @test AK.from_uint(
             Int64, 0b1111111111111111111111111111111111111111111111111111111111111111 % UInt64
         ) == Int64(-1)
+        @test AK.from_uint(Float16, UInt32(0)) == Float16(0)
         @test AK.from_uint(Bool, UInt32(0)) == false
         @test AK.from_uint(Bool, UInt32(1)) == true
 
+        @test AK.uint32_to_unit_float16(UInt32(0)) == Float16(0)
+        @test Float16(0) <= AK.uint32_to_unit_float16(typemax(UInt32)) < Float16(1)
         @test AK.uint32_to_unit_float32(UInt32(0)) == 0.0f0
         @test 0.0f0 <= AK.uint32_to_unit_float32(typemax(UInt32)) < 1.0f0
         if RUN_FLOAT64_RAND_TESTS
@@ -126,7 +143,7 @@ end
             s1 = AK.rand_scalar(rng, UInt64(1), T)
             @test s0 isa T
             @test s1 isa T
-            if T !== Bool
+            if !(T in (Bool, Float16, UInt8, UInt16, Int8, Int16))
                 @test s0 != s1
             end
             if T <: AbstractFloat
@@ -136,8 +153,19 @@ end
         end
 
         c = UInt64(42)
+        @test AK.rand_scalar(rng, c, UInt8) == trunc(UInt8, AK.rand_uint(rng, c, UInt32) >> 24)
+        @test AK.rand_scalar(rng, c, UInt16) == trunc(UInt16, AK.rand_uint(rng, c, UInt32) >> 16)
+        @test AK.rand_scalar(
+            rng, c, Int8
+        ) == reinterpret(Int8, trunc(UInt8, AK.rand_uint(rng, c, UInt32) >> 24))
+        @test AK.rand_scalar(
+            rng, c, Int16
+        ) == reinterpret(Int16, trunc(UInt16, AK.rand_uint(rng, c, UInt32) >> 16))
         @test AK.rand_scalar(rng, c, Int32) == reinterpret(Int32, AK.rand_uint(rng, c, UInt32))
         @test AK.rand_scalar(rng, c, Int64) == reinterpret(Int64, AK.rand_uint(rng, c, UInt64))
+        @test AK.rand_scalar(rng, c, Float16) == AK.uint32_to_unit_float16(
+            AK.rand_uint(rng, c, UInt32)
+        )
         @test AK.rand_scalar(rng, c, Float32) == AK.uint32_to_unit_float32(
             AK.rand_uint(rng, c, UInt32)
         )
@@ -150,7 +178,7 @@ end
         bools = [AK.rand_scalar(rng, UInt64(i), Bool) for i in 0:511]
         @test any(identity, bools)
         @test any(!, bools)
-        @test_throws ArgumentError AK.rand_scalar(rng, UInt64(0), UInt16)
+        @test_throws ArgumentError AK.rand_scalar(rng, UInt64(0), UInt128)
     end
 
 
@@ -185,7 +213,7 @@ end
             @test Array(x1) != Array(x2)
         end
 
-        for T in (Float32, UInt64, Bool)
+        for T in (Float16, Float32, UInt64, Bool)
             xnd = array_from_host(zeros(T, 7, 11, 5))
             _assert_rand_matches_reference!(rng, xnd; prefer_threads, block_size=128)
         end
@@ -226,7 +254,7 @@ end
         @test Array(x1) == Array(ref1)
         @test Array(x2) == Array(ref2)
 
-        x_bad = array_from_host(zeros(UInt16, 16))
+        x_bad = zeros(UInt128, 16)
         @test_throws ArgumentError AK.rand!(x_bad; prefer_threads)
         @test_throws ArgumentError AK.rand!(AK.CounterRNG(0x1), x_bad; prefer_threads)
     end

From 8677d84d4b15592a31cf5a3e73587fe7c77b454b Mon Sep 17 00:00:00 2001
From: fjbarter <fjbarter@outlook.com>
Date: Wed, 18 Mar 2026 16:29:00 +0000
Subject: [PATCH 09/18] Add more faithful seed-key mapping for Philox, and
 improve thoroughness of docs

---
 docs/src/api/rand.md                    | 20 ++++++++++++++++++--
 src/rand/philox.jl                      |  4 +---
 src/rand/rand.jl                        |  2 +-
 src/rand/{splitmix64.jl => splitmix.jl} |  6 ++++++
 4 files changed, 26 insertions(+), 6 deletions(-)
 rename src/rand/{splitmix64.jl => splitmix.jl} (82%)

diff --git a/docs/src/api/rand.md b/docs/src/api/rand.md
index 93ed339..997e407 100644
--- a/docs/src/api/rand.md
+++ b/docs/src/api/rand.md
@@ -25,8 +25,24 @@ Algorithms currently available:
 - `Philox` ([read more](https://www.thesalmons.org/john/random123/papers/random123sc11.pdf))
 - `Threefry` ([read more](https://www.thesalmons.org/john/random123/papers/random123sc11.pdf))
 
-`Philox` is the default algorithm for `CounterRNG()`, as it is more thoroughly
-statistically tested and measured on par with `CUDA.rand!` and `SplitMix64` at ~390 GB/s on an RTX
+Statistical-testing note:
+- Published/reference versions of `SplitMix64`, `Philox`, and `Threefry` are reported to pass
+  TestU01 BigCrush.
+- These results refer to specific constructions and test setups; wrapper choices (such as
+  seed/key mapping conventions) can change bitwise output streams.
+- These generators are not intended to be cryptographically secure.
+
+Philox keying note:
+- AK uses `Philox2x32` internally (one 32-bit Philox key word).
+- Users can pass any non-negative `Integer` seed; AK normalises to `UInt64` then derives the
+  32-bit Philox key via a SplitMix-based mapping.
+- This is a deliberate wrapper choice for ease of use (simple `seed` API with deterministic
+  streams), not a change to the Philox round function itself.
+- Therefore, AK Philox streams are deterministic and high-quality, but not guaranteed to be
+  bit-for-bit identical to a raw Random123 Philox stream unless the same seed-to-key mapping and
+  counter convention are used.
+
+`Philox` is the default algorithm for `CounterRNG()`, as it is very thorough and very fast; it has been measured on par with `CUDA.rand!` and `SplitMix64` at ~390 GB/s on an Nvidia GeForce RTX
 5060 (advertised 448 GB/s), i.e. effectively memory-bound throughput.
 
 Examples:
diff --git a/src/rand/philox.jl b/src/rand/philox.jl
index bdebc8e..37149f1 100644
--- a/src/rand/philox.jl
+++ b/src/rand/philox.jl
@@ -24,9 +24,7 @@ end
     x0 = _u32_lo(counter)
     x1 = _u32_hi(counter)
 
-    seed = UInt64(rng.seed)
-    k0 = _u32_lo(seed)
-    x1 = xor(x1, _u32_hi(seed))
+    k0 = splitmix32_from_u64(UInt64(rng.seed))
 
     @inbounds for _ in 1:PHILOX_ROUNDS
         x0, x1 = _philox2x32_round(x0, x1, k0)
diff --git a/src/rand/rand.jl b/src/rand/rand.jl
index dc48f03..d1fcb42 100644
--- a/src/rand/rand.jl
+++ b/src/rand/rand.jl
@@ -54,7 +54,7 @@ end
 include("utilities.jl")
 
 # Algorithm-specific integer generators
-include("splitmix64.jl")
+include("splitmix.jl")
 include("philox.jl")
 include("threefry.jl")
 
diff --git a/src/rand/splitmix64.jl b/src/rand/splitmix.jl
similarity index 82%
rename from src/rand/splitmix64.jl
rename to src/rand/splitmix.jl
index d169b73..8d7992b 100644
--- a/src/rand/splitmix64.jl
+++ b/src/rand/splitmix.jl
@@ -16,6 +16,12 @@ const SPLITMIX64_MIX_B = UInt64(0x94d049bb133111eb)
 end
 
 
+# Derive a 32-bit seed word from a 64-bit seed using SplitMix64 mixing.
+@inline function splitmix32_from_u64(seed::UInt64)::UInt32
+    return _u32_hi(_splitmix64_mix(seed + SPLITMIX64_INCREMENT))
+end
+
+
 # Natural SplitMix64 output path: compute 64 random bits directly from one counter
 @inline function rand_uint(
     rng::CounterRNG{<:SplitMix64},

From d687e2a84ab7816422273aea86b2a59b626177d8 Mon Sep 17 00:00:00 2001
From: fjbarter <fjbarter@outlook.com>
Date: Mon, 23 Mar 2026 01:40:44 +0000
Subject: [PATCH 10/18] streaming rng by now including an offset, incremented
 by length(x) when AK.rand!(rng, x) is called. This gives stream-like behaviour
 without any on-device state but still faster than CURAND. Add RNGTest in
 prototyping for BigCrush and SmallCrush (Philox, SM64, and Threefry all now
 confirmed to pass in the AK implementation)

---
 docs/src/api/rand.md                |  52 ++++++--
 prototype/RNGTest/Project.toml      |   3 +
 prototype/RNGTest/README.md         |  33 +++++
 prototype/RNGTest/run_bigcrush.jl   |  26 ++++
 prototype/RNGTest/run_smallcrush.jl |  23 ++++
 prototype/RNGTest/stream.jl         |  80 ++++++++++++
 prototype/rand/Project.toml         |   1 +
 src/rand/philox.jl                  |  14 +-
 src/rand/rand.jl                    |  95 ++++++++++----
 src/rand/splitmix.jl                |   9 +-
 src/rand/threefry.jl                |  13 +-
 src/rand/utilities.jl               |  24 ++--
 test/rand.jl                        | 195 ++++++++++++++++++++++------
 13 files changed, 459 insertions(+), 109 deletions(-)
 create mode 100644 prototype/RNGTest/Project.toml
 create mode 100644 prototype/RNGTest/README.md
 create mode 100644 prototype/RNGTest/run_bigcrush.jl
 create mode 100644 prototype/RNGTest/run_smallcrush.jl
 create mode 100644 prototype/RNGTest/stream.jl

diff --git a/docs/src/api/rand.md b/docs/src/api/rand.md
index 997e407..5c5596e 100644
--- a/docs/src/api/rand.md
+++ b/docs/src/api/rand.md
@@ -1,11 +1,35 @@
 ### Random Number Generation
 
-Counter-based random generation for CPU and GPU backends with deterministic behavior for fixed
-`seed`, algorithm, array shape, and eltype.
+Counter-based random generation for CPU and GPU backends with deterministic stream behavior for
+fixed `seed`, algorithm, and call sequence.
 
-Use an explicit `CounterRNG(seed; alg=...)` when reproducibility matters. For convenience,
-`AK.rand!(x)` creates a fresh `CounterRNG()` on each call using one auto-seeded
-`Random.rand(Random.default_rng(), UInt64)` draw, so repeated calls produce different outputs unless Random.seed!() is used.
+`CounterRNG` carries an internal `offset` (starting at `0`) that advances by `length(v)` on each
+`AK.rand!(rng, v)` call. This means chunked fills are stream-consistent:
+- filling `100` then `100` elements yields the same `200` values as one `200`-element fill.
+- calls that share the same `CounterRNG` instance concurrently are not thread-safe.
+- call `AK.reset!(rng)` to rewind a mutable offset-bearing RNG back to offset `0x0`.
+
+`AK.rand!` also accepts custom `AbstractCounterRNG` implementations:
+- if they have a mutable `offset` field, streaming advancement is applied
+- if they have no `offset` field, each call behaves statelessly from counter `0`
+- if they have an immutable `offset` field, that offset is used as a fixed start and is not advanced
+
+Use an explicit `CounterRNG` when reproducibility is required. For
+convenience,
+`AK.rand!(v)` creates a fresh `CounterRNG()` on each call using one auto-seeded
+`Base.rand(UInt64)` draw, so repeated calls produce different outputs unless Random.seed!() is used.
+
+`AK.reset!(rng)` rewinds offset to `0x0` for mutable RNGs that have an `offset` field.
+
+Custom RNGs:
+- Define an algorithm type `MyAlg <: AK.CounterRNGAlgorithm`.
+- Define an RNG type `MyRNG <: AK.AbstractCounterRNG{MyAlg}` with fields `seed` and `alg`.
+- Add a mutable `offset::UInt64` field if you want stream advancement across calls; omit it for stateless calls from counter `0`.
+- Implement typed `rand_uint` methods:
+  - `AK.rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt32})::UInt32`
+  - `AK.rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt64})::UInt64`
+
+Both widths should be implemented so `AK.rand!` supports all integer/float output types without falling back or error.
 
 Supported element types:
 - `UInt8`, `UInt16`, `UInt32`, `UInt64`
@@ -26,10 +50,7 @@ Algorithms currently available:
 - `Threefry` ([read more](https://www.thesalmons.org/john/random123/papers/random123sc11.pdf))
 
 Statistical-testing note:
-- Published/reference versions of `SplitMix64`, `Philox`, and `Threefry` are reported to pass
-  TestU01 BigCrush.
-- These results refer to specific constructions and test setups; wrapper choices (such as
-  seed/key mapping conventions) can change bitwise output streams.
+- In this repository, `SplitMix64`, `Philox`, and `Threefry` have passed TestU01 BigCrush
 - These generators are not intended to be cryptographically secure.
 
 Philox keying note:
@@ -42,7 +63,7 @@ Philox keying note:
   bit-for-bit identical to a raw Random123 Philox stream unless the same seed-to-key mapping and
   counter convention are used.
 
-`Philox` is the default algorithm for `CounterRNG()`, as it is very thorough and very fast; it has been measured on par with `CUDA.rand!` and `SplitMix64` at ~390 GB/s on an Nvidia GeForce RTX
+`Philox` is the default algorithm for `CounterRNG()` because it is thorough and very fast; it has been measured on par with `CUDA.rand!` and `SplitMix64` at ~390 GB/s on an Nvidia GeForce RTX
 5060 (advertised 448 GB/s), i.e. effectively memory-bound throughput.
 
 Examples:
@@ -52,8 +73,14 @@ using oneAPI
 
 # Reproducible
 rng = AK.CounterRNG(0x12345678; alg=AK.Philox())
-x = oneArray{Float32}(undef, 1024)
-AK.rand!(rng, x)
+v = oneArray{Float32}(undef, 1024)
+AK.rand!(rng, v)
+
+# Stream-consistent chunking
+v1 = oneArray{Float32}(undef, 100)
+v2 = oneArray{Float32}(undef, 100)
+AK.rand!(rng, v1)
+AK.rand!(rng, v2)
 
 # Convenience (fresh auto-seeded RNG on each call)
 y = oneArray{Float32}(undef, 1024)
@@ -62,5 +89,6 @@ AK.rand!(y)
 
 ```@docs
 AcceleratedKernels.CounterRNG
+AcceleratedKernels.reset!
 AcceleratedKernels.rand!
 ```
diff --git a/prototype/RNGTest/Project.toml b/prototype/RNGTest/Project.toml
new file mode 100644
index 0000000..7536db2
--- /dev/null
+++ b/prototype/RNGTest/Project.toml
@@ -0,0 +1,3 @@
+[deps]
+AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
+RNGTest = "97cc5700-e6cb-5ca1-8fb2-7f6b45264ecd"
diff --git a/prototype/RNGTest/README.md b/prototype/RNGTest/README.md
new file mode 100644
index 0000000..2bc6a7b
--- /dev/null
+++ b/prototype/RNGTest/README.md
@@ -0,0 +1,33 @@
+# AK + RNGTest SmallCrush and BigCrush Prototype
+
+This folder provides a chunked random stream generator based on `AcceleratedKernels.jl` that can be fed into `RNGTest.jl`.
+
+The stream is deterministic and effectively unbounded:
+- each refill generates `chunk` random `UInt64` values with `AK.rand!`
+- each refill advances one persistent `CounterRNG` stream offset
+- this is a practical chunked stream for RNGTest callback mode
+
+`RNGTest.jl` (in this local checkout) expects a callback returning `Float64` in `[0,1]`, so `UInt64` words are mapped to `Float64` via top-53-bit scaling.
+
+Current status in this harness: `SplitMix64`, `Philox`, and `Threefry` all pass BigCrush using `run_bigcrush.jl`.
+
+## Run SmallCrush
+
+From this directory:
+
+```powershell
+julia --project=. run_smallcrush.jl
+```
+
+## Run BigCrush
+
+```powershell
+julia --project=. run_bigcrush.jl
+```
+
+Notes:
+- Configure `ALG`, `SEED`, and `CHUNK` at the top of
+  `run_smallcrush.jl` / `run_bigcrush.jl`.
+- The stream refills directly into host scratch using `AK.rand!` on CPU.
+- `chunk` controls refill amortization and memory usage.
+- `chunk=100000000` means ~800 MB host scratch (`UInt64`).
diff --git a/prototype/RNGTest/run_bigcrush.jl b/prototype/RNGTest/run_bigcrush.jl
new file mode 100644
index 0000000..a1298a0
--- /dev/null
+++ b/prototype/RNGTest/run_bigcrush.jl
@@ -0,0 +1,26 @@
+using RNGTest
+
+include("stream.jl")
+
+
+const ALG = :philox
+const SEED = 0x1234
+const CHUNK = 10_000_000
+const HOST_SCRATCH = Vector{UInt64}(undef, CHUNK)
+
+
+stream = AKUInt64Stream(
+    HOST_SCRATCH;
+    seed=SEED,
+    alg=ALG,
+    start_counter=UInt64(0),
+)
+gen = make_rngtest_generator!(stream)
+genname = "AK_Vector_$(ALG)_seed$(SEED)"
+
+println("Beginning BigCrush. This may take hours...")
+
+RNGTest.bigcrushTestU01(gen, genname)
+
+println("refills: ", stream.refill_count)
+println("numbers consumed (approx): ", (stream.refill_count - 1) * stream.chunk + (stream.idx - 1))
diff --git a/prototype/RNGTest/run_smallcrush.jl b/prototype/RNGTest/run_smallcrush.jl
new file mode 100644
index 0000000..db1251c
--- /dev/null
+++ b/prototype/RNGTest/run_smallcrush.jl
@@ -0,0 +1,23 @@
+using RNGTest
+
+include("stream.jl")
+
+
+const ALG = :philox
+const SEED = 0x1234
+const CHUNK = 100_000_000
+const HOST_SCRATCH = Vector{UInt64}(undef, CHUNK)
+
+
+stream = AKUInt64Stream(
+    HOST_SCRATCH;
+    seed=SEED,
+    alg=ALG,
+    start_counter=UInt64(0),
+)
+gen = make_rngtest_generator!(stream)
+genname = "AK_Vector_$(ALG)_seed$(SEED)"
+
+println("Beginning SmallCrush...")
+
+RNGTest.smallcrushTestU01(gen, genname)
diff --git a/prototype/RNGTest/stream.jl b/prototype/RNGTest/stream.jl
new file mode 100644
index 0000000..ba65ce9
--- /dev/null
+++ b/prototype/RNGTest/stream.jl
@@ -0,0 +1,80 @@
+import AcceleratedKernels as AK
+
+
+function make_rng(seed::Integer, alg::Symbol; offset::Integer=0)
+    if alg === :philox
+        return AK.CounterRNG(seed; alg=AK.Philox(), offset=offset)
+    elseif alg === :threefry
+        return AK.CounterRNG(seed; alg=AK.Threefry(), offset=offset)
+    elseif alg === :splitmix64
+        return AK.CounterRNG(seed; alg=AK.SplitMix64(), offset=offset)
+    end
+    throw(ArgumentError("alg must be :philox, :threefry, or :splitmix64; got $alg"))
+end
+
+
+mutable struct AKUInt64Stream{R <: AK.AbstractCounterRNG}
+    rng::R
+    chunk::Int
+    idx::Int
+    host_scratch::Vector{UInt64}
+    refill_count::Int
+end
+
+
+function AKUInt64Stream(
+    host_scratch::Vector{UInt64};
+    seed::Integer=0x1234,
+    alg::Symbol=:philox,
+    start_counter::UInt64=0x0000000000000000,
+)
+    chunk = length(host_scratch)
+    chunk > 0 || throw(ArgumentError("host_scratch must be non-empty"))
+    rng = make_rng(seed, alg; offset=start_counter)
+
+    return AKUInt64Stream(
+        rng,
+        chunk,
+        chunk + 1,
+        host_scratch,
+        0,
+    )
+end
+
+
+@inline _u01_from_u64(u::UInt64)::Float64 = Float64(u >>> 11) * 0x1.0p-53
+
+
+function _fill_chunk!(s::AKUInt64Stream)
+    AK.rand!(s.rng, s.host_scratch)
+    return nothing
+end
+
+
+function refill!(s::AKUInt64Stream)
+    _fill_chunk!(s)
+    s.idx = 1
+    s.refill_count += 1
+    return s
+end
+
+
+function next_u64!(s::AKUInt64Stream)::UInt64
+    if s.idx > s.chunk
+        refill!(s)
+    end
+    @inbounds u = s.host_scratch[s.idx]
+    s.idx += 1
+    return u
+end
+
+
+@inline next_float64!(s::AKUInt64Stream)::Float64 = _u01_from_u64(next_u64!(s))
+
+
+function make_rngtest_generator!(s::AKUInt64Stream)
+    if s.idx > s.chunk
+        refill!(s)
+    end
+    return () -> next_float64!(s)
+end
diff --git a/prototype/rand/Project.toml b/prototype/rand/Project.toml
index 7e92c89..7757b4d 100644
--- a/prototype/rand/Project.toml
+++ b/prototype/rand/Project.toml
@@ -2,5 +2,6 @@
 AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f"
 PProf = "e4faabce-9ead-11e9-39d9-4379958e3056"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
diff --git a/src/rand/philox.jl b/src/rand/philox.jl
index 37149f1..61e5f97 100644
--- a/src/rand/philox.jl
+++ b/src/rand/philox.jl
@@ -18,13 +18,13 @@ end
 
 # Evaluate one Philox block at `counter`, returning two 32-bit lanes `(x0, x1)`
 @inline function _philox2x32_block(
-    rng::CounterRNG{<:Philox},
+    seed::UInt64,
     counter::UInt64,
 )::Tuple{UInt32, UInt32}
     x0 = _u32_lo(counter)
     x1 = _u32_hi(counter)
 
-    k0 = splitmix32_from_u64(UInt64(rng.seed))
+    k0 = splitmix32_from_u64(seed)
 
     @inbounds for _ in 1:PHILOX_ROUNDS
         x0, x1 = _philox2x32_round(x0, x1, k0)
@@ -37,21 +37,23 @@ end
 
 # Return lane 0 from the single Philox block at `counter`
 @inline function rand_uint(
-    rng::CounterRNG{<:Philox},
+    seed::UInt64,
+    alg::Philox,
     counter::UInt64,
     ::Type{UInt32},
 )::UInt32
-    x0, _ = _philox2x32_block(rng, counter)
+    x0, _ = _philox2x32_block(seed, counter)
     return x0
 end
 
 
 # Build UInt64 from the two lanes `(x0, x1)` of the same Philox block at `counter`
 @inline function rand_uint(
-    rng::CounterRNG{<:Philox},
+    seed::UInt64,
+    alg::Philox,
     counter::UInt64,
     ::Type{UInt64},
 )::UInt64
-    x0, x1 = _philox2x32_block(rng, counter)
+    x0, x1 = _philox2x32_block(seed, counter)
     return _u64_from_u32s(x0, x1)
 end
diff --git a/src/rand/rand.jl b/src/rand/rand.jl
index d1fcb42..229a899 100644
--- a/src/rand/rand.jl
+++ b/src/rand/rand.jl
@@ -1,50 +1,73 @@
 """
-    abstract type AbstractCounterRNG end
     abstract type CounterRNGAlgorithm end
+    abstract type AbstractCounterRNG{A <: CounterRNGAlgorithm} end
 
 RNG interface for counter-based random generation with AcceleratedKernels.
 """
 
-abstract type AbstractCounterRNG end
 abstract type CounterRNGAlgorithm end
+abstract type AbstractCounterRNG{A <: CounterRNGAlgorithm} end
 
 
 """
     CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox())
 
-Stateless counter-based RNG configuration for [`rand!`](@ref).
+Counter-based RNG for [`rand!`](@ref).
 
-`CounterRNG` is immutable and does not hold mutable thread-local or global state. Each generated
-value is a pure function of:
+`CounterRNG` stores:
 - `seed`
-- logical linear element index
 - algorithm (`alg`)
+- stream `offset`
 
 The default algorithm is `Philox()`.
 
 `seed` may be any non-negative `Integer`. It is normalised to `UInt64` internally.
+`offset` is initialised to `0` by default and advances by `length(v)` after each [`rand!`](@ref)
+call.
 
 Constructors:
-- `CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox())`
-  Uses an explicit non-negative seed.
-- `CounterRNG(; alg::CounterRNGAlgorithm=Philox())`
-  Auto-seeds once using `Random.rand(Random.default_rng(), UInt64)`. Reusing the same `CounterRNG` instance is deterministic
-  for fixed seed, algorithm, array shape, and eltype.
+- `CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox(), offset::Integer=0)`
+  Uses an explicit non-negative seed and offset.
+- `CounterRNG(; alg::CounterRNGAlgorithm=Philox(), offset::Integer=0)`
+  Auto-seeds once using `Base.rand(UInt64)`, with default `offset == 0`.
 """
-struct CounterRNG{A <: CounterRNGAlgorithm} <: AbstractCounterRNG
+mutable struct CounterRNG{A <: CounterRNGAlgorithm} <: AbstractCounterRNG{A}
     seed::UInt64
     alg::A
+    offset::UInt64
 end
 
 
-function CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox())
+function CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox(), offset::Integer=0)
     @argcheck seed >= 0 "Seed must be a non-negative integer"
-    CounterRNG(UInt64(seed), alg)
+    @argcheck offset >= 0 "Offset must be a non-negative integer"
+    CounterRNG(UInt64(seed), alg, UInt64(offset))
 end
 
 
-function CounterRNG(; alg::CounterRNGAlgorithm=Philox())
-    CounterRNG(Random.rand(Random.default_rng(), UInt64); alg)
+function CounterRNG(; alg::CounterRNGAlgorithm=Philox(), offset::Integer=0)
+    CounterRNG(Base.rand(UInt64); alg, offset)
+end
+
+
+CounterRNG(seed::Integer, alg::CounterRNGAlgorithm) = CounterRNG(seed; alg)
+
+
+"""
+    reset!(rng::AbstractCounterRNG)
+
+Reset `rng.offset` to `0x0` for RNGs that support mutable stream offsets.
+
+This requires `rng` to:
+- have an `offset` field
+- be mutable
+"""
+@inline function reset!(rng::AbstractCounterRNG)
+    @argcheck hasfield(typeof(rng), :offset) "reset! requires an `offset` field"
+    @argcheck ismutabletype(typeof(rng)) "reset! requires a mutable RNG type"
+
+    rng.offset = UInt64(0)
+    return rng
 end
 
 
@@ -64,8 +87,8 @@ include("threefry.jl")
 """
     rand!(
         rng::AbstractCounterRNG,
-        x::AbstractArray{T},
-        backend::Backend=get_backend(x);
+        v::AbstractArray{T},
+        backend::Backend=get_backend(v);
 
         # CPU settings
         max_tasks::Int=Threads.nthreads(),
@@ -78,8 +101,13 @@ include("threefry.jl")
         block_size::Int=256,
     )
 
-Fill `x` in-place with pseudo-random values using a stateless counter-based RNG. For `x[i]`, the
-counter is exactly `UInt64(i - 1)` in linear indexing order.
+Fill `v` in-place with pseudo-random values using a counter-based RNG stream. For `v[i]`, the
+counter is `start_offset + UInt64(i - 1)` in linear indexing order, where `start_offset` is:
+- `rng.offset` if `rng` has an `offset` field
+- `0` otherwise
+
+After filling `v`, `rng.offset` advances by `length(v)` only when `rng` has a mutable `offset`
+field.
 
 Supported scalar element types are:
 - `UInt8`, `UInt16`, `UInt32`, `UInt64`
@@ -97,8 +125,8 @@ Semantics:
 """
 function rand!(
     rng::AbstractCounterRNG,
-    x::AbstractArray{T},
-    backend::Backend=get_backend(x);
+    v::AbstractArray{T},
+    backend::Backend=get_backend(v);
 
     # CPU settings
     max_tasks::Int=Threads.nthreads(),
@@ -110,23 +138,36 @@ function rand!(
 ) where T
 
     @argcheck T <: ALLOWED_RAND_SCALARS "Unsupported eltype $T. Supported: $(ALLOWED_RAND_SCALARS)"
+
+    initial_offset = hasfield(typeof(rng), :offset) ? UInt64(getproperty(rng, :offset)) : UInt64(0)
+
+    # local isbits captures from potentially mutable rng object
+    seed, alg = rng.seed, rng.alg
+    
     foreachindex(
-        1:length(x), backend;
+        v, backend;
         max_tasks,
         min_elems,
         prefer_threads,
         block_size,
     ) do i
-        @inbounds x[i] = rand_scalar(rng, _counter_from_index(i), T)
+        @inbounds v[i] = rand_scalar(seed, alg, initial_offset + _counter_from_index(i), T)
+    end
+
+    if hasfield(typeof(rng), :offset) && ismutabletype(typeof(rng))
+        # XXX: maybe should be atomic add? would only be needed if AK.rand! were called
+        #      concurrently on the same rng... ??
+        rng.offset = initial_offset + UInt64(length(v))
     end
-    return x
+    
+    v
 end
 
 
 function rand!(
-    x::AbstractArray,
+    v::AbstractArray,
     args...;
     kwargs...,
 )
-    return rand!(CounterRNG(), x, args...; kwargs...)
+    return rand!(CounterRNG(), v, args...; kwargs...)
 end
diff --git a/src/rand/splitmix.jl b/src/rand/splitmix.jl
index 8d7992b..cc474be 100644
--- a/src/rand/splitmix.jl
+++ b/src/rand/splitmix.jl
@@ -24,20 +24,21 @@ end
 
 # Natural SplitMix64 output path: compute 64 random bits directly from one counter
 @inline function rand_uint(
-    rng::CounterRNG{<:SplitMix64},
+    seed::UInt64,
+    alg::SplitMix64,
     counter::UInt64,
     ::Type{UInt64},
 )::UInt64
-    seed = UInt64(rng.seed)
     return _splitmix64_mix(counter + seed + SPLITMIX64_INCREMENT)
 end
 
 
 # UInt32 path is derived from the high 32 bits of the UInt64 SplitMix output
 @inline function rand_uint(
-    rng::CounterRNG{<:SplitMix64},
+    seed::UInt64,
+    alg::SplitMix64,
     counter::UInt64,
     ::Type{UInt32},
 )::UInt32
-    return _u32_hi(rand_uint(rng, counter, UInt64))
+    return _u32_hi(rand_uint(seed, alg, counter, UInt64))
 end
diff --git a/src/rand/threefry.jl b/src/rand/threefry.jl
index 7326f40..2006886 100644
--- a/src/rand/threefry.jl
+++ b/src/rand/threefry.jl
@@ -18,13 +18,12 @@ end
 
 # Evaluate one Threefry block at `counter`, returning two 32-bit lanes `(x0, x1)`
 @inline function _threefry2x32_block(
-    rng::CounterRNG{<:Threefry},
+    seed::UInt64,
     counter::UInt64,
 )::Tuple{UInt32, UInt32}
     x0 = _u32_lo(counter)
     x1 = _u32_hi(counter)
 
-    seed = UInt64(rng.seed)
     k0 = _u32_lo(seed)
     k1 = _u32_hi(seed)
     k2 = xor(THREEFRY_PARITY, xor(k0, k1))
@@ -52,21 +51,23 @@ end
 
 # Return lane 0 from the single Threefry block at `counter`
 @inline function rand_uint(
-    rng::CounterRNG{<:Threefry},
+    seed::UInt64,
+    alg::Threefry,
     counter::UInt64,
     ::Type{UInt32},
 )::UInt32
-    x0, _ = _threefry2x32_block(rng, counter)
+    x0, _ = _threefry2x32_block(seed, counter)
     return x0
 end
 
 
 # Build UInt64 from the two lanes `(x0, x1)` of the same Threefry block at `counter`
 @inline function rand_uint(
-    rng::CounterRNG{<:Threefry},
+    seed::UInt64,
+    alg::Threefry,
     counter::UInt64,
     ::Type{UInt64},
 )::UInt64
-    x0, x1 = _threefry2x32_block(rng, counter)
+    x0, x1 = _threefry2x32_block(seed, counter)
     return _u64_from_u32s(x0, x1)
 end
diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl
index 3d5dab7..bc3da2f 100644
--- a/src/rand/utilities.jl
+++ b/src/rand/utilities.jl
@@ -51,15 +51,16 @@ const ALLOWED_RAND_SCALARS = Union{
 
 
 #=
-Every RNG algorithm implements rand_uint(rng, counter, UInt32/UInt64).
-This fallback provides a clear failure for unsupported RNG types.
+Every RNG algorithm implements rand_uint(seed, alg, counter, UInt32/UInt64).
+This is the fallback for unsupported RNG algorithms.
 =#
 @inline function rand_uint(
-    rng::AbstractCounterRNG,
+    ::UInt64,
+    alg::CounterRNGAlgorithm,
     ::UInt64,
     ::Type{UIntType}
 )::UIntType where {UIntType <: Union{UInt32, UInt64}}
-    throw(ArgumentError("No rand_uint implementation for RNG: $rng"))
+    throw(ArgumentError("No rand_uint implementation for RNG algorithm: $(typeof(alg))"))
 end
 
 
@@ -70,27 +71,26 @@ Shared scalar generation:
 3) convert bits into requested scalar representation
 =#
 @inline function rand_scalar(
-    rng::AbstractCounterRNG,
+    seed::UInt64,
+    alg::CounterRNGAlgorithm,
     counter::UInt64,
     ::Type{T}
 )::T where {T <: ALLOWED_RAND_SCALARS}
 
     UIntType = raw_uint_type(T)
-    u = rand_uint(rng, counter, UIntType)
+    u = rand_uint(seed, alg, counter, UIntType)
 
     return from_uint(T, u)
 end
 
 
-@inline function rand_scalar(::AbstractCounterRNG, ::UInt64, ::Type{T}) where {T}
+@inline function rand_scalar(::UInt64, ::CounterRNGAlgorithm, ::UInt64, ::Type{T}) where {T}
     throw(ArgumentError(
         "Unsupported random scalar type $(T). Supported: $(ALLOWED_RAND_SCALARS)"
     ))
 end
 
 
-
-
 # Convert random UInt32 bits to Float16 in [0, 1) by mantissa construction.
 @inline function uint32_to_unit_float16(u::UInt32)::Float16
     # Keep 10 random bits for the mantissa (drop 22 rightmost bits from the UInt32)
@@ -98,7 +98,7 @@ end
     bits = UInt16(0x3c00) | UInt16(u >> 22)
 
     # Interpret as 1.mantissa, then subtract 1 for [0, 1)
-    return reinterpret(Float16, bits) - Float16(1)
+    reinterpret(Float16, bits) - Float16(1)
 end
 
 
@@ -109,7 +109,7 @@ end
     bits = UInt32(0x3f800000) | (u >> 9)
 
     # Interpret as 1.mantissa, then subtract 1 for [0, 1)
-    return reinterpret(Float32, bits) - 1.0f0
+    reinterpret(Float32, bits) - 1.0f0
 end
 
 
@@ -120,5 +120,5 @@ end
     bits = UInt64(0x3ff0000000000000) | (u >> 12)
 
     # Interpret as 1.mantissa, then subtract 1 for [0, 1)
-    return reinterpret(Float64, bits) - 1.0
+    reinterpret(Float64, bits) - 1.0
 end
diff --git a/test/rand.jl b/test/rand.jl
index 9e03e59..b7ef167 100644
--- a/test/rand.jl
+++ b/test/rand.jl
@@ -7,25 +7,31 @@ const RAND_SCALAR_TYPES_ALL = (
 )
 const RAND_SCALAR_TYPES_BACKEND = IS_CPU_BACKEND ?
                                   RAND_SCALAR_TYPES_ALL :
-                                  (UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float16, Float32, Bool)
+                                  (UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Bool)
+const RUN_FLOAT16_RAND_TESTS = IS_CPU_BACKEND
 const RUN_FLOAT64_RAND_TESTS = IS_CPU_BACKEND
 
 
 _is_unit_interval(v) = all(x -> !isnan(x) && zero(x) <= x < one(x), v)
 
 
-function _rand_fill_reference!(rng, x::AbstractArray{T}) where {T <: AK.ALLOWED_RAND_SCALARS}
+function _rand_fill_reference!(
+    rng,
+    x::AbstractArray{T};
+    counter_offset::UInt64=UInt64(0),
+) where {T <: AK.ALLOWED_RAND_SCALARS}
     @inbounds for i in eachindex(x)
-        x[i] = AK.rand_scalar(rng, UInt64(i - one(i)), T)
+        x[i] = AK.rand_scalar(rng.seed, rng.alg, counter_offset + UInt64(i - one(i)), T)
     end
     return x
 end
 
 
 function _assert_rand_matches_reference!(rng, x; kwargs...)
+    counter_offset = rng.offset
     AK.rand!(rng, x; kwargs...)
     ref = zeros(eltype(x), size(x))
-    _rand_fill_reference!(rng, ref)
+    _rand_fill_reference!(rng, ref; counter_offset)
     @test Array(x) == ref
     return x
 end
@@ -37,7 +43,11 @@ end
         @test AK.CounterRNG(UInt32(0x1); alg=AK.Philox()) isa AK.CounterRNG{AK.Philox}
         @test AK.CounterRNG(UInt16(123); alg=AK.Threefry()) isa AK.CounterRNG{AK.Threefry}
         @test AK.CounterRNG(UInt32(300)).seed == UInt64(300)
+        @test AK.CounterRNG(UInt32(300)).offset == UInt64(0)
+        @test AK.CounterRNG(0x1; offset=17).offset == UInt64(17)
+        @test AK.CounterRNG(0x1, AK.Philox()).offset == UInt64(0)
         @test_throws ArgumentError AK.CounterRNG(-1)
+        @test_throws ArgumentError AK.CounterRNG(1; offset=-1)
 
         Random.seed!(0x1234)
         expected_seed = Random.rand(Random.default_rng(), UInt64)
@@ -45,11 +55,80 @@ end
         rng_auto = AK.CounterRNG()
         @test rng_auto.seed == expected_seed
         @test rng_auto.alg isa AK.Philox
+        @test rng_auto.offset == UInt64(0)
+
+        rng_auto_off = AK.CounterRNG(; offset=42)
+        @test rng_auto_off.offset == UInt64(42)
 
         x1 = array_from_host(zeros(Float32, 1024))
         x2 = array_from_host(zeros(Float32, 1024))
         AK.rand!(rng_auto, x1; prefer_threads, block_size=64)
         AK.rand!(rng_auto, x2; prefer_threads, block_size=257)
+        @test rng_auto.offset == UInt64(2048)
+        @test Array(x1) != Array(x2)
+    end
+
+
+    @testset "abstract rng offset behavior" begin
+        mutable struct MutableNoOffsetRNG <: AK.AbstractCounterRNG{AK.Philox}
+            seed::UInt64
+            alg::AK.Philox
+        end
+
+        mutable struct MutableWithOffsetRNG <: AK.AbstractCounterRNG{AK.Philox}
+            seed::UInt64
+            alg::AK.Philox
+            offset::UInt64
+        end
+
+        struct ImmutableWithOffsetRNG <: AK.AbstractCounterRNG{AK.Philox}
+            seed::UInt64
+            alg::AK.Philox
+            offset::UInt64
+        end
+
+        rng_no_offset = MutableNoOffsetRNG(UInt64(0x1234), AK.Philox())
+        x1 = array_from_host(zeros(Float32, 256))
+        x2 = array_from_host(zeros(Float32, 256))
+        AK.rand!(rng_no_offset, x1; prefer_threads, block_size=64)
+        AK.rand!(rng_no_offset, x2; prefer_threads, block_size=64)
+        @test Array(x1) == Array(x2)
+
+        rng_stream = MutableWithOffsetRNG(UInt64(0x1234), AK.Philox(), UInt64(0))
+        s1 = array_from_host(zeros(Float32, 100))
+        s2 = array_from_host(zeros(Float32, 100))
+        s12 = array_from_host(zeros(Float32, 200))
+        AK.rand!(rng_stream, s1; prefer_threads, block_size=64)
+        AK.rand!(rng_stream, s2; prefer_threads, block_size=64)
+        AK.rand!(AK.CounterRNG(UInt64(0x1234); alg=AK.Philox()), s12; prefer_threads, block_size=64)
+        @test vcat(Array(s1), Array(s2)) == Array(s12)
+        @test rng_stream.offset == UInt64(200)
+
+        rng_imm = ImmutableWithOffsetRNG(UInt64(0x1234), AK.Philox(), UInt64(17))
+        y1 = array_from_host(zeros(Float32, 64))
+        y2 = array_from_host(zeros(Float32, 64))
+        AK.rand!(rng_imm, y1; prefer_threads, block_size=64)
+        AK.rand!(rng_imm, y2; prefer_threads, block_size=64)
+        @test Array(y1) == Array(y2)
+
+        @test AK.reset!(rng_stream) === rng_stream
+        @test rng_stream.offset == UInt64(0)
+        @test_throws ArgumentError AK.reset!(rng_no_offset)
+        @test_throws ArgumentError AK.reset!(rng_imm)
+    end
+
+
+    @testset "reset!" begin
+        rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox())
+        x1 = array_from_host(zeros(Float32, 512))
+        x2 = array_from_host(zeros(Float32, 512))
+
+        AK.rand!(rng, x1; prefer_threads, block_size=64)
+        @test rng.offset == UInt64(512)
+        @test AK.reset!(rng) === rng
+        @test rng.offset == UInt64(0)
+        AK.rand!(rng, x2; prefer_threads, block_size=64)
+
         @test Array(x1) == Array(x2)
     end
 
@@ -73,7 +152,9 @@ end
         @test AK.raw_uint_type(Int8) === UInt32
         @test AK.raw_uint_type(Int16) === UInt32
         @test AK.raw_uint_type(Int32) === UInt32
-        @test AK.raw_uint_type(Float16) === UInt32
+        if RUN_FLOAT16_RAND_TESTS
+            @test AK.raw_uint_type(Float16) === UInt32
+        end
         @test AK.raw_uint_type(Float32) === UInt32
         @test AK.raw_uint_type(UInt64) === UInt64
         @test AK.raw_uint_type(Int64) === UInt64
@@ -92,12 +173,16 @@ end
         @test AK.from_uint(
             Int64, 0b1111111111111111111111111111111111111111111111111111111111111111 % UInt64
         ) == Int64(-1)
-        @test AK.from_uint(Float16, UInt32(0)) == Float16(0)
+        if RUN_FLOAT16_RAND_TESTS
+            @test AK.from_uint(Float16, UInt32(0)) == Float16(0)
+        end
         @test AK.from_uint(Bool, UInt32(0)) == false
         @test AK.from_uint(Bool, UInt32(1)) == true
 
-        @test AK.uint32_to_unit_float16(UInt32(0)) == Float16(0)
-        @test Float16(0) <= AK.uint32_to_unit_float16(typemax(UInt32)) < Float16(1)
+        if RUN_FLOAT16_RAND_TESTS
+            @test AK.uint32_to_unit_float16(UInt32(0)) == Float16(0)
+            @test Float16(0) <= AK.uint32_to_unit_float16(typemax(UInt32)) < Float16(1)
+        end
         @test AK.uint32_to_unit_float32(UInt32(0)) == 0.0f0
         @test 0.0f0 <= AK.uint32_to_unit_float32(typemax(UInt32)) < 1.0f0
         if RUN_FLOAT64_RAND_TESTS
@@ -111,25 +196,26 @@ end
         for alg in RAND_ALGS
             rng = AK.CounterRNG(0x123456789abcdef; alg)
             for U in (UInt32, UInt64)
-                @test AK.rand_uint(rng, UInt64(0), U) == AK.rand_uint(rng, UInt64(0), U)
-                @test AK.rand_uint(rng, UInt64(1), U) != AK.rand_uint(rng, UInt64(0), U)
+                @test AK.rand_uint(rng.seed, rng.alg, UInt64(0), U) == AK.rand_uint(rng.seed, rng.alg, UInt64(0), U)
+                @test AK.rand_uint(rng.seed, rng.alg, UInt64(1), U) != AK.rand_uint(rng.seed, rng.alg, UInt64(0), U)
 
-                vals = [AK.rand_uint(rng, UInt64(i), U) for i in 0:511]
+                vals = [AK.rand_uint(rng.seed, rng.alg, UInt64(i), U) for i in 0:511]
                 @test length(unique(vals)) > 460
             end
         end
 
         rng_splitmix = AK.CounterRNG(0x31415926; alg=AK.SplitMix64())
         for c in (UInt64(0), UInt64(1), UInt64(17), UInt64(1023))
-            @test AK.rand_uint(rng_splitmix, c, UInt32) == AK._u32_hi(
-                AK.rand_uint(rng_splitmix, c, UInt64)
+            @test AK.rand_uint(rng_splitmix.seed, rng_splitmix.alg, c, UInt32) == AK._u32_hi(
+                AK.rand_uint(rng_splitmix.seed, rng_splitmix.alg, c, UInt64)
             )
         end
 
         for alg in (AK.Philox(), AK.Threefry())
             rng = AK.CounterRNG(0xabcdef1234567890; alg)
             for c in (UInt64(0), UInt64(1), UInt64(17), UInt64(1023))
-                @test AK._u32_lo(AK.rand_uint(rng, c, UInt64)) == AK.rand_uint(rng, c, UInt32)
+                @test AK._u32_lo(AK.rand_uint(rng.seed, rng.alg, c, UInt64)) ==
+                      AK.rand_uint(rng.seed, rng.alg, c, UInt32)
             end
         end
     end
@@ -139,10 +225,11 @@ end
         rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox())
 
         for T in RAND_SCALAR_TYPES_BACKEND
-            s0 = AK.rand_scalar(rng, UInt64(0), T)
-            s1 = AK.rand_scalar(rng, UInt64(1), T)
+            s0 = AK.rand_scalar(rng.seed, rng.alg, UInt64(0), T)
+            s1 = AK.rand_scalar(rng.seed, rng.alg, UInt64(1), T)
             @test s0 isa T
             @test s1 isa T
+            @test s0 == AK.rand_scalar(rng.seed, rng.alg, UInt64(0), T)
             if !(T in (Bool, Float16, UInt8, UInt16, Int8, Int16))
                 @test s0 != s1
             end
@@ -153,32 +240,37 @@ end
         end
 
         c = UInt64(42)
-        @test AK.rand_scalar(rng, c, UInt8) == trunc(UInt8, AK.rand_uint(rng, c, UInt32) >> 24)
-        @test AK.rand_scalar(rng, c, UInt16) == trunc(UInt16, AK.rand_uint(rng, c, UInt32) >> 16)
-        @test AK.rand_scalar(
-            rng, c, Int8
-        ) == reinterpret(Int8, trunc(UInt8, AK.rand_uint(rng, c, UInt32) >> 24))
-        @test AK.rand_scalar(
-            rng, c, Int16
-        ) == reinterpret(Int16, trunc(UInt16, AK.rand_uint(rng, c, UInt32) >> 16))
-        @test AK.rand_scalar(rng, c, Int32) == reinterpret(Int32, AK.rand_uint(rng, c, UInt32))
-        @test AK.rand_scalar(rng, c, Int64) == reinterpret(Int64, AK.rand_uint(rng, c, UInt64))
-        @test AK.rand_scalar(rng, c, Float16) == AK.uint32_to_unit_float16(
-            AK.rand_uint(rng, c, UInt32)
-        )
-        @test AK.rand_scalar(rng, c, Float32) == AK.uint32_to_unit_float32(
-            AK.rand_uint(rng, c, UInt32)
+        @test AK.rand_scalar(rng.seed, rng.alg, c, UInt8) ==
+              trunc(UInt8, AK.rand_uint(rng.seed, rng.alg, c, UInt32) >> 24)
+        @test AK.rand_scalar(rng.seed, rng.alg, c, UInt16) ==
+              trunc(UInt16, AK.rand_uint(rng.seed, rng.alg, c, UInt32) >> 16)
+        @test AK.rand_scalar(rng.seed, rng.alg, c, Int8) ==
+              reinterpret(Int8, trunc(UInt8, AK.rand_uint(rng.seed, rng.alg, c, UInt32) >> 24))
+        @test AK.rand_scalar(rng.seed, rng.alg, c, Int16) ==
+              reinterpret(Int16, trunc(UInt16, AK.rand_uint(rng.seed, rng.alg, c, UInt32) >> 16))
+        @test AK.rand_scalar(rng.seed, rng.alg, c, Int32) ==
+              reinterpret(Int32, AK.rand_uint(rng.seed, rng.alg, c, UInt32))
+        @test AK.rand_scalar(rng.seed, rng.alg, c, Int64) ==
+              reinterpret(Int64, AK.rand_uint(rng.seed, rng.alg, c, UInt64))
+        if RUN_FLOAT16_RAND_TESTS
+            @test AK.rand_scalar(rng.seed, rng.alg, c, Float16) == AK.uint32_to_unit_float16(
+                AK.rand_uint(rng.seed, rng.alg, c, UInt32)
+            )
+        end
+        @test AK.rand_scalar(rng.seed, rng.alg, c, Float32) == AK.uint32_to_unit_float32(
+            AK.rand_uint(rng.seed, rng.alg, c, UInt32)
         )
-        @test AK.rand_scalar(rng, c, Bool) == isodd(AK.rand_uint(rng, c, UInt32))
+        @test AK.rand_scalar(rng.seed, rng.alg, c, Bool) ==
+              isodd(AK.rand_uint(rng.seed, rng.alg, c, UInt32))
         if RUN_FLOAT64_RAND_TESTS
-            @test AK.rand_scalar(rng, c, Float64) == AK.uint64_to_unit_float64(
-                AK.rand_uint(rng, c, UInt64)
+            @test AK.rand_scalar(rng.seed, rng.alg, c, Float64) == AK.uint64_to_unit_float64(
+                AK.rand_uint(rng.seed, rng.alg, c, UInt64)
             )
         end
-        bools = [AK.rand_scalar(rng, UInt64(i), Bool) for i in 0:511]
+        bools = [AK.rand_scalar(rng.seed, rng.alg, UInt64(i), Bool) for i in 0:511]
         @test any(identity, bools)
         @test any(!, bools)
-        @test_throws ArgumentError AK.rand_scalar(rng, UInt64(0), UInt128)
+        @test_throws ArgumentError AK.rand_scalar(rng.seed, rng.alg, UInt64(0), UInt128)
     end
 
 
@@ -199,21 +291,37 @@ end
         for T in RAND_SCALAR_TYPES_BACKEND
             x1 = array_from_host(zeros(T, 2048))
             x2 = array_from_host(zeros(T, 2048))
-            AK.rand!(rng, x1; prefer_threads, block_size=64)
-            AK.rand!(rng, x2; prefer_threads, block_size=257)
+            rng1 = AK.CounterRNG(rng.seed; alg=rng.alg)
+            rng2 = AK.CounterRNG(rng.seed; alg=rng.alg)
+            AK.rand!(rng1, x1; prefer_threads, block_size=64)
+            AK.rand!(rng2, x2; prefer_threads, block_size=257)
             @test Array(x1) == Array(x2)
         end
 
-        rng2 = AK.CounterRNG(rng.seed + UInt64(1); alg=rng.alg)
         for T in RAND_SCALAR_TYPES_BACKEND
+            rng1 = AK.CounterRNG(rng.seed; alg=rng.alg)
+            rng2 = AK.CounterRNG(rng.seed + UInt64(1); alg=rng.alg)
             x1 = array_from_host(zeros(T, 2048))
             x2 = array_from_host(zeros(T, 2048))
-            AK.rand!(rng, x1; prefer_threads, block_size=64)
+            AK.rand!(rng1, x1; prefer_threads, block_size=64)
             AK.rand!(rng2, x2; prefer_threads, block_size=64)
             @test Array(x1) != Array(x2)
         end
 
-        for T in (Float16, Float32, UInt64, Bool)
+        begin
+            rng_stream = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox())
+            rng_once = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox())
+            x1 = array_from_host(zeros(Float32, 100))
+            x2 = array_from_host(zeros(Float32, 100))
+            x12 = array_from_host(zeros(Float32, 200))
+            AK.rand!(rng_stream, x1; prefer_threads, block_size=64)
+            AK.rand!(rng_stream, x2; prefer_threads, block_size=64)
+            AK.rand!(rng_once, x12; prefer_threads, block_size=64)
+            @test vcat(Array(x1), Array(x2)) == Array(x12)
+            @test rng_stream.offset == UInt64(200)
+        end
+
+        for T in (RUN_FLOAT16_RAND_TESTS ? (Float16, Float32, UInt64, Bool) : (Float32, UInt64, Bool))
             xnd = array_from_host(zeros(T, 7, 11, 5))
             _assert_rand_matches_reference!(rng, xnd; prefer_threads, block_size=128)
         end
@@ -229,7 +337,10 @@ end
                     prefer_threads=true
                 )
                 ref_view = zeros(T, length(view_x))
-                _rand_fill_reference!(rng, ref_view)
+                _rand_fill_reference!(
+                    rng, ref_view;
+                    counter_offset=rng.offset - UInt64(length(view_x)),
+                )
                 @test collect(view_x) == ref_view
             end
         end

From 71a62d47f57653429b03a5621cf710511a0be49d Mon Sep 17 00:00:00 2001
From: fjbarter <fjbarter@outlook.com>
Date: Tue, 24 Mar 2026 20:02:41 +0000
Subject: [PATCH 11/18] style update + remove AbstractCounterRNG type in favour
 of purely allowing CounterRNG, which now must have an offset. If needed,
 AK.reset!(::CounterRNG) can be called to reset a streaming CounterRNG every
 time AK.rand! is called

---
 docs/src/api/rand.md        |  5 +--
 prototype/RNGTest/stream.jl |  2 +-
 prototype/rand/test_rand.jl |  8 ++---
 src/rand/philox.jl          |  2 +-
 src/rand/rand.jl            | 38 +++++++---------------
 src/rand/threefry.jl        |  1 +
 src/rand/utilities.jl       | 65 ++++++++++++++++++++-----------------
 test/rand.jl                |  6 ++--
 test/runtests.jl            |  8 ++++-
 9 files changed, 66 insertions(+), 69 deletions(-)

diff --git a/docs/src/api/rand.md b/docs/src/api/rand.md
index 5c5596e..03ed7f3 100644
--- a/docs/src/api/rand.md
+++ b/docs/src/api/rand.md
@@ -9,7 +9,7 @@ fixed `seed`, algorithm, and call sequence.
 - calls that share the same `CounterRNG` instance concurrently are not thread-safe.
 - call `AK.reset!(rng)` to rewind a mutable offset-bearing RNG back to offset `0x0`.
 
-`AK.rand!` also accepts custom `AbstractCounterRNG` implementations:
+`AK.rand!` also accepts custom `CounterRNG` implementations:
 - if they have a mutable `offset` field, streaming advancement is applied
 - if they have no `offset` field, each call behaves statelessly from counter `0`
 - if they have an immutable `offset` field, that offset is used as a fixed start and is not advanced
@@ -23,7 +23,7 @@ convenience,
 
 Custom RNGs:
 - Define an algorithm type `MyAlg <: AK.CounterRNGAlgorithm`.
-- Define an RNG type `MyRNG <: AK.AbstractCounterRNG{MyAlg}` with fields `seed` and `alg`.
+- Define a `CounterRNG` with fields `seed` and `alg`.
 - Add a mutable `offset::UInt64` field if you want stream advancement across calls; omit it for stateless calls from counter `0`.
 - Implement typed `rand_uint` methods:
   - `AK.rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt32})::UInt32`
@@ -89,6 +89,7 @@ AK.rand!(y)
 
 ```@docs
 AcceleratedKernels.CounterRNG
+AcceleratedKernels.CounterRNGAlgorithm
 AcceleratedKernels.reset!
 AcceleratedKernels.rand!
 ```
diff --git a/prototype/RNGTest/stream.jl b/prototype/RNGTest/stream.jl
index ba65ce9..a383645 100644
--- a/prototype/RNGTest/stream.jl
+++ b/prototype/RNGTest/stream.jl
@@ -13,7 +13,7 @@ function make_rng(seed::Integer, alg::Symbol; offset::Integer=0)
 end
 
 
-mutable struct AKUInt64Stream{R <: AK.AbstractCounterRNG}
+mutable struct AKUInt64Stream{R}
     rng::R
     chunk::Int
     idx::Int
diff --git a/prototype/rand/test_rand.jl b/prototype/rand/test_rand.jl
index 329214d..ec3ce65 100644
--- a/prototype/rand/test_rand.jl
+++ b/prototype/rand/test_rand.jl
@@ -43,13 +43,13 @@ is_unit_interval(v) = all(x -> 0.0f0 <= x <= 1.0f0, v)
 
 # warmup compile
 run_cuda_rand!(x_cuda)
-run_ak_rand_gpu!(RNG_SPLITMIX, x_splitmix)
+# run_ak_rand_gpu!(RNG_SPLITMIX, x_splitmix)
 run_ak_rand_gpu!(RNG_PHILOX, x_philox)
 run_ak_rand_gpu!(RNG_THREEFRY, x_threefry)
 run_ak_rand_cpu!(RNG_SPLITMIX, x_cpu)
 
 @assert is_unit_interval(Array(x_cuda))
-@assert is_unit_interval(Array(x_splitmix))
+# @assert is_unit_interval(Array(x_splitmix))
 @assert is_unit_interval(Array(x_philox))
 @assert is_unit_interval(Array(x_threefry))
 @assert is_unit_interval(x_cpu)
@@ -60,8 +60,8 @@ println("CPU threads: ", Threads.nthreads())
 println("\nCUDA.rand! benchmark (CuArray{Float32}, in-place)")
 display(@benchmark run_cuda_rand!($x_cuda))
 
-println("\nAK.rand! SplitMix64 benchmark (GPU, CuArray{Float32})")
-display(@benchmark run_ak_rand_gpu!($RNG_SPLITMIX, $x_splitmix))
+# println("\nAK.rand! SplitMix64 benchmark (GPU, CuArray{Float32})")
+# display(@benchmark run_ak_rand_gpu!($RNG_SPLITMIX, $x_splitmix))
 
 println("\nAK.rand! Philox benchmark (GPU, CuArray{Float32})")
 display(@benchmark run_ak_rand_gpu!($RNG_PHILOX, $x_philox))
diff --git a/src/rand/philox.jl b/src/rand/philox.jl
index 61e5f97..0de9ff4 100644
--- a/src/rand/philox.jl
+++ b/src/rand/philox.jl
@@ -4,7 +4,7 @@ struct Philox <: CounterRNGAlgorithm end
 # Philox magic numbers
 const PHILOX_M0 = UInt32(0xD256D193)
 const PHILOX_W0 = UInt32(0x9E3779B9)
-const PHILOX_ROUNDS = 10
+const PHILOX_ROUNDS = 7
 
 
 @inline function _philox2x32_round(x0::UInt32, x1::UInt32, k0::UInt32)
diff --git a/src/rand/rand.jl b/src/rand/rand.jl
index 229a899..b9db756 100644
--- a/src/rand/rand.jl
+++ b/src/rand/rand.jl
@@ -1,12 +1,4 @@
-"""
-    abstract type CounterRNGAlgorithm end
-    abstract type AbstractCounterRNG{A <: CounterRNGAlgorithm} end
-
-RNG interface for counter-based random generation with AcceleratedKernels.
-"""
-
 abstract type CounterRNGAlgorithm end
-abstract type AbstractCounterRNG{A <: CounterRNGAlgorithm} end
 
 
 """
@@ -31,11 +23,12 @@ Constructors:
 - `CounterRNG(; alg::CounterRNGAlgorithm=Philox(), offset::Integer=0)`
   Auto-seeds once using `Base.rand(UInt64)`, with default `offset == 0`.
 """
-mutable struct CounterRNG{A <: CounterRNGAlgorithm} <: AbstractCounterRNG{A}
-    seed::UInt64
-    alg::A
+mutable struct CounterRNG{A <: CounterRNGAlgorithm}
+    const seed::UInt64
+    const alg::A
     offset::UInt64
 end
+#TODO: need to figure out a nice way to allow custom counter RNGs
 
 
 function CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox(), offset::Integer=0)
@@ -54,7 +47,7 @@ CounterRNG(seed::Integer, alg::CounterRNGAlgorithm) = CounterRNG(seed; alg)
 
 
 """
-    reset!(rng::AbstractCounterRNG)
+    reset!(rng::CounterRNG)
 
 Reset `rng.offset` to `0x0` for RNGs that support mutable stream offsets.
 
@@ -62,7 +55,7 @@ This requires `rng` to:
 - have an `offset` field
 - be mutable
 """
-@inline function reset!(rng::AbstractCounterRNG)
+@inline function reset!(rng::CounterRNG)
     @argcheck hasfield(typeof(rng), :offset) "reset! requires an `offset` field"
     @argcheck ismutabletype(typeof(rng)) "reset! requires a mutable RNG type"
 
@@ -86,7 +79,7 @@ include("threefry.jl")
 
 """
     rand!(
-        rng::AbstractCounterRNG,
+        rng::CounterRNG,
         v::AbstractArray{T},
         backend::Backend=get_backend(v);
 
@@ -124,7 +117,7 @@ Semantics:
 
 """
 function rand!(
-    rng::AbstractCounterRNG,
+    rng::CounterRNG,
     v::AbstractArray{T},
     backend::Backend=get_backend(v);
 
@@ -139,26 +132,17 @@ function rand!(
 
     @argcheck T <: ALLOWED_RAND_SCALARS "Unsupported eltype $T. Supported: $(ALLOWED_RAND_SCALARS)"
 
-    initial_offset = hasfield(typeof(rng), :offset) ? UInt64(getproperty(rng, :offset)) : UInt64(0)
-
-    # local isbits captures from potentially mutable rng object
+    # Local isbits captures from potentially mutable rng object
     seed, alg = rng.seed, rng.alg
     
     foreachindex(
         v, backend;
-        max_tasks,
-        min_elems,
-        prefer_threads,
-        block_size,
+        max_tasks, min_elems, prefer_threads, block_size,
     ) do i
         @inbounds v[i] = rand_scalar(seed, alg, initial_offset + _counter_from_index(i), T)
     end
 
-    if hasfield(typeof(rng), :offset) && ismutabletype(typeof(rng))
-        # XXX: maybe should be atomic add? would only be needed if AK.rand! were called
-        #      concurrently on the same rng... ??
-        rng.offset = initial_offset + UInt64(length(v))
-    end
+    rng.offset += UInt64(length(v))
     
     v
 end
diff --git a/src/rand/threefry.jl b/src/rand/threefry.jl
index 2006886..c9052b1 100644
--- a/src/rand/threefry.jl
+++ b/src/rand/threefry.jl
@@ -21,6 +21,7 @@ end
     seed::UInt64,
     counter::UInt64,
 )::Tuple{UInt32, UInt32}
+
     x0 = _u32_lo(counter)
     x1 = _u32_hi(counter)
 
diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl
index bc3da2f..f2159c5 100644
--- a/src/rand/utilities.jl
+++ b/src/rand/utilities.jl
@@ -1,15 +1,17 @@
 # lo: rightmost 32 bits, hi: leftmost 32 bits
 @inline _u32_lo(x::UInt64)::UInt32 = UInt32(x & UInt64(0xffffffff))
 @inline _u32_hi(x::UInt64)::UInt32 = UInt32(x >> 32)
+
+# Construct UInt64 by bit concatenation of two UInt32s
 @inline _u64_from_u32s(lo::UInt32, hi::UInt32)::UInt64 = (UInt64(hi) << 32) | UInt64(lo)
 
-# leftmost 32 bits of a*b cast to UInt64s
+# Leftmost 32 bits of a*b cast to UInt64s
 @inline _mulhi_u32(a::UInt32, b::UInt32)::UInt32 = UInt32((UInt64(a) * UInt64(b)) >> 32)
 
 # 32-bit rotate left by r positions
 @inline _rotl32(x::UInt32, r::UInt32)::UInt32 = (x << r) | (x >> (UInt32(32) - r))
 
-
+# Get counter used for CounterRNG from element index
 @inline _counter_from_index(i)::UInt64 = UInt64(i - one(i))
 
 
@@ -22,32 +24,32 @@ const ALLOWED_RAND_SCALARS = Union{
 }
 
 
-@inline raw_uint_type(::Type{UInt8}) = UInt32
-@inline raw_uint_type(::Type{UInt16}) = UInt32
-@inline raw_uint_type(::Type{UInt32}) = UInt32
-@inline raw_uint_type(::Type{Int8}) = UInt32
-@inline raw_uint_type(::Type{Int16}) = UInt32
-@inline raw_uint_type(::Type{Int32}) = UInt32
-@inline raw_uint_type(::Type{Float16}) = UInt32
-@inline raw_uint_type(::Type{Float32}) = UInt32
-@inline raw_uint_type(::Type{UInt64}) = UInt64
-@inline raw_uint_type(::Type{Int64}) = UInt64
-@inline raw_uint_type(::Type{Float64}) = UInt64
-@inline raw_uint_type(::Type{Bool}) = UInt32
-
-
-@inline from_uint(::Type{UInt8}, u::UInt32)::UInt8 = trunc(UInt8, u >> 24)
-@inline from_uint(::Type{UInt16}, u::UInt32)::UInt16 = trunc(UInt16, u >> 16)
-@inline from_uint(::Type{UInt32}, u::UInt32)::UInt32 = u
-@inline from_uint(::Type{UInt64}, u::UInt64)::UInt64 = u
-@inline from_uint(::Type{Int8}, u::UInt32)::Int8 = reinterpret(Int8, trunc(UInt8, u >> 24))
-@inline from_uint(::Type{Int16}, u::UInt32)::Int16 = reinterpret(Int16, trunc(UInt16, u >> 16))
-@inline from_uint(::Type{Int32}, u::UInt32)::Int32 = reinterpret(Int32, u)
-@inline from_uint(::Type{Int64}, u::UInt64)::Int64 = reinterpret(Int64, u)
-@inline from_uint(::Type{Float16}, u::UInt32)::Float16 = uint32_to_unit_float16(u)
-@inline from_uint(::Type{Float32}, u::UInt32)::Float32 = uint32_to_unit_float32(u)
-@inline from_uint(::Type{Float64}, u::UInt64)::Float64 = uint64_to_unit_float64(u)
-@inline from_uint(::Type{Bool}, u::UInt32)::Bool = isodd(u)
+@inline _rand_scalar_uint_type(::Type{UInt8}) = UInt32
+@inline _rand_scalar_uint_type(::Type{UInt16}) = UInt32
+@inline _rand_scalar_uint_type(::Type{UInt32}) = UInt32
+@inline _rand_scalar_uint_type(::Type{Int8}) = UInt32
+@inline _rand_scalar_uint_type(::Type{Int16}) = UInt32
+@inline _rand_scalar_uint_type(::Type{Int32}) = UInt32
+@inline _rand_scalar_uint_type(::Type{Float16}) = UInt32
+@inline _rand_scalar_uint_type(::Type{Float32}) = UInt32
+@inline _rand_scalar_uint_type(::Type{UInt64}) = UInt64
+@inline _rand_scalar_uint_type(::Type{Int64}) = UInt64
+@inline _rand_scalar_uint_type(::Type{Float64}) = UInt64
+@inline _rand_scalar_uint_type(::Type{Bool}) = UInt32
+
+
+@inline _rand_scalar_from_uint(::Type{UInt8}, u::UInt32)::UInt8 = trunc(UInt8, u >> 24)
+@inline _rand_scalar_from_uint(::Type{UInt16}, u::UInt32)::UInt16 = trunc(UInt16, u >> 16)
+@inline _rand_scalar_from_uint(::Type{UInt32}, u::UInt32)::UInt32 = u
+@inline _rand_scalar_from_uint(::Type{UInt64}, u::UInt64)::UInt64 = u
+@inline _rand_scalar_from_uint(::Type{Int8}, u::UInt32)::Int8 = reinterpret(Int8, trunc(UInt8, u >> 24))
+@inline _rand_scalar_from_uint(::Type{Int16}, u::UInt32)::Int16 = reinterpret(Int16, trunc(UInt16, u >> 16))
+@inline _rand_scalar_from_uint(::Type{Int32}, u::UInt32)::Int32 = reinterpret(Int32, u)
+@inline _rand_scalar_from_uint(::Type{Int64}, u::UInt64)::Int64 = reinterpret(Int64, u)
+@inline _rand_scalar_from_uint(::Type{Float16}, u::UInt32)::Float16 = uint32_to_unit_float16(u)
+@inline _rand_scalar_from_uint(::Type{Float32}, u::UInt32)::Float32 = uint32_to_unit_float32(u)
+@inline _rand_scalar_from_uint(::Type{Float64}, u::UInt64)::Float64 = uint64_to_unit_float64(u)
+@inline _rand_scalar_from_uint(::Type{Bool}, u::UInt32)::Bool = isodd(u)
 
 
 #=
@@ -77,10 +79,10 @@ Shared scalar generation:
     ::Type{T}
 )::T where {T <: ALLOWED_RAND_SCALARS}
 
-    UIntType = raw_uint_type(T)
+    UIntType = _rand_scalar_uint_type(T)
     u = rand_uint(seed, alg, counter, UIntType)
 
-    return from_uint(T, u)
+    return _rand_scalar_from_uint(T, u)
 end
 
 
@@ -93,6 +95,7 @@ end
 
 # Convert random UInt32 bits to Float16 in [0, 1) by mantissa construction.
 @inline function uint32_to_unit_float16(u::UInt32)::Float16
+
     # Keep 10 random bits for the mantissa (drop 22 rightmost bits from the UInt32)
     # and combine with the bit pattern of Float16(1.0) (sign=0, exponent=15).
     bits = UInt16(0x3c00) | UInt16(u >> 22)
@@ -104,6 +107,7 @@ end
 
 # Convert random UInt32 bits to Float32 in [0, 1) by mantissa construction.
 @inline function uint32_to_unit_float32(u::UInt32)::Float32
+
     # Keep 23 random bits for the mantissa (drop 9 rightmost bits from the UInt32)
     # and combine with the bit pattern of 1.0f0 (sign=0, exponent=127).
     bits = UInt32(0x3f800000) | (u >> 9)
@@ -115,6 +119,7 @@ end
 
 # Convert random UInt64 bits to Float64 in [0, 1) by mantissa construction.
 @inline function uint64_to_unit_float64(u::UInt64)::Float64
+
     # Keep 52 random bits for the mantissa (drop 12 rightmost bits from the UInt64)
     # and combine with the bit pattern of 1.0 (sign=0, exponent=1023).
     bits = UInt64(0x3ff0000000000000) | (u >> 12)
diff --git a/test/rand.jl b/test/rand.jl
index b7ef167..0495b86 100644
--- a/test/rand.jl
+++ b/test/rand.jl
@@ -70,18 +70,18 @@ end
 
 
     @testset "abstract rng offset behavior" begin
-        mutable struct MutableNoOffsetRNG <: AK.AbstractCounterRNG{AK.Philox}
+        mutable struct MutableNoOffsetRNG
             seed::UInt64
             alg::AK.Philox
         end
 
-        mutable struct MutableWithOffsetRNG <: AK.AbstractCounterRNG{AK.Philox}
+        mutable struct MutableWithOffsetRNG
             seed::UInt64
             alg::AK.Philox
             offset::UInt64
         end
 
-        struct ImmutableWithOffsetRNG <: AK.AbstractCounterRNG{AK.Philox}
+        struct ImmutableWithOffsetRNG
             seed::UInt64
             alg::AK.Philox
             offset::UInt64
diff --git a/test/runtests.jl b/test/runtests.jl
index a2707b6..d1a2d69 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -17,7 +17,13 @@ if "--CUDA" in ARGS
     const BACKEND = CUDABackend()
     TEST_DL[] = true
 elseif "--oneAPI" in ARGS
-    Pkg.add("oneAPI")
+    if Sys.iswindows()
+        # oneAPI v2.6.x can throw `UndefVarError: NEO_jll not defined` on native Windows.
+        # Pin to the latest known-good minor series until upstream fixes are available.
+        Pkg.add(name="oneAPI", version="2.5")
+    else
+        Pkg.add("oneAPI")
+    end
     using oneAPI
     oneAPI.versioninfo()
     const BACKEND = oneAPIBackend()

From 42a434e6c0465593651f84e334838bf53b9327d6 Mon Sep 17 00:00:00 2001
From: fjbarter <fjbarter@outlook.com>
Date: Tue, 24 Mar 2026 20:23:31 +0000
Subject: [PATCH 12/18] update tests and docs to match new CounterRNG interface

---
 docs/src/api/rand.md |  15 +++---
 src/rand/rand.jl     |  19 ++------
 test/rand.jl         | 109 ++++++++++++++++++++-----------------------
 3 files changed, 61 insertions(+), 82 deletions(-)

diff --git a/docs/src/api/rand.md b/docs/src/api/rand.md
index 03ed7f3..aab3e5d 100644
--- a/docs/src/api/rand.md
+++ b/docs/src/api/rand.md
@@ -7,27 +7,24 @@ fixed `seed`, algorithm, and call sequence.
 `AK.rand!(rng, v)` call. This means chunked fills are stream-consistent:
 - filling `100` then `100` elements yields the same `200` values as one `200`-element fill.
 - calls that share the same `CounterRNG` instance concurrently are not thread-safe.
-- call `AK.reset!(rng)` to rewind a mutable offset-bearing RNG back to offset `0x0`.
+- call `AK.reset!(rng)` to rewind a `CounterRNG` offset back to `0x0`.
 
-`AK.rand!` also accepts custom `CounterRNG` implementations:
-- if they have a mutable `offset` field, streaming advancement is applied
-- if they have no `offset` field, each call behaves statelessly from counter `0`
-- if they have an immutable `offset` field, that offset is used as a fixed start and is not advanced
+`AK.rand!(rng, v)` accepts `rng::AK.CounterRNG`.
+Passing other RNG container types is not supported and will throw a `MethodError`.
 
 Use an explicit `CounterRNG` when reproducibility is required. For
 convenience,
 `AK.rand!(v)` creates a fresh `CounterRNG()` on each call using one auto-seeded
 `Base.rand(UInt64)` draw, so repeated calls produce different outputs unless Random.seed!() is used.
 
-`AK.reset!(rng)` rewinds offset to `0x0` for mutable RNGs that have an `offset` field.
+`AK.reset!(rng::AK.CounterRNG)` rewinds `rng.offset` to `0x0`.
 
-Custom RNGs:
+Custom algorithms:
 - Define an algorithm type `MyAlg <: AK.CounterRNGAlgorithm`.
-- Define a `CounterRNG` with fields `seed` and `alg`.
-- Add a mutable `offset::UInt64` field if you want stream advancement across calls; omit it for stateless calls from counter `0`.
 - Implement typed `rand_uint` methods:
   - `AK.rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt32})::UInt32`
   - `AK.rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt64})::UInt64`
+- Use your algorithm via `AK.CounterRNG(seed; alg=MyAlg(), offset=...)`.
 
 Both widths should be implemented so `AK.rand!` supports all integer/float output types without falling back or error.
 
diff --git a/src/rand/rand.jl b/src/rand/rand.jl
index b9db756..833bcae 100644
--- a/src/rand/rand.jl
+++ b/src/rand/rand.jl
@@ -28,7 +28,6 @@ mutable struct CounterRNG{A <: CounterRNGAlgorithm}
     const alg::A
     offset::UInt64
 end
-#TODO: need to figure out a nice way to allow custom counter RNGs
 
 
 function CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox(), offset::Integer=0)
@@ -49,16 +48,9 @@ CounterRNG(seed::Integer, alg::CounterRNGAlgorithm) = CounterRNG(seed; alg)
 """
     reset!(rng::CounterRNG)
 
-Reset `rng.offset` to `0x0` for RNGs that support mutable stream offsets.
-
-This requires `rng` to:
-- have an `offset` field
-- be mutable
+Reset `rng.offset` to `0x0`.
 """
 @inline function reset!(rng::CounterRNG)
-    @argcheck hasfield(typeof(rng), :offset) "reset! requires an `offset` field"
-    @argcheck ismutabletype(typeof(rng)) "reset! requires a mutable RNG type"
-
     rng.offset = UInt64(0)
     return rng
 end
@@ -95,12 +87,9 @@ include("threefry.jl")
     )
 
 Fill `v` in-place with pseudo-random values using a counter-based RNG stream. For `v[i]`, the
-counter is `start_offset + UInt64(i - 1)` in linear indexing order, where `start_offset` is:
-- `rng.offset` if `rng` has an `offset` field
-- `0` otherwise
+counter is `rng.offset + UInt64(i - 1)` in linear indexing order.
 
-After filling `v`, `rng.offset` advances by `length(v)` only when `rng` has a mutable `offset`
-field.
+After filling `v`, `rng.offset` advances by `length(v)`.
 
 Supported scalar element types are:
 - `UInt8`, `UInt16`, `UInt32`, `UInt64`
@@ -133,7 +122,7 @@ function rand!(
     @argcheck T <: ALLOWED_RAND_SCALARS "Unsupported eltype $T. Supported: $(ALLOWED_RAND_SCALARS)"
 
     # Local isbits captures from potentially mutable rng object
-    seed, alg = rng.seed, rng.alg
+    seed, alg, initial_offset = rng.seed, rng.alg, rng.offset
     
     foreachindex(
         v, backend;
diff --git a/test/rand.jl b/test/rand.jl
index 0495b86..90e285a 100644
--- a/test/rand.jl
+++ b/test/rand.jl
@@ -69,52 +69,45 @@ end
     end
 
 
-    @testset "abstract rng offset behavior" begin
-        mutable struct MutableNoOffsetRNG
-            seed::UInt64
-            alg::AK.Philox
-        end
-
-        mutable struct MutableWithOffsetRNG
-            seed::UInt64
-            alg::AK.Philox
-            offset::UInt64
-        end
-
-        struct ImmutableWithOffsetRNG
-            seed::UInt64
-            alg::AK.Philox
-            offset::UInt64
-        end
-
-        rng_no_offset = MutableNoOffsetRNG(UInt64(0x1234), AK.Philox())
-        x1 = array_from_host(zeros(Float32, 256))
-        x2 = array_from_host(zeros(Float32, 256))
-        AK.rand!(rng_no_offset, x1; prefer_threads, block_size=64)
-        AK.rand!(rng_no_offset, x2; prefer_threads, block_size=64)
-        @test Array(x1) == Array(x2)
-
-        rng_stream = MutableWithOffsetRNG(UInt64(0x1234), AK.Philox(), UInt64(0))
+    @testset "counter rng offset behavior" begin
+        rng_stream = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox(), offset=UInt64(17))
         s1 = array_from_host(zeros(Float32, 100))
         s2 = array_from_host(zeros(Float32, 100))
         s12 = array_from_host(zeros(Float32, 200))
         AK.rand!(rng_stream, s1; prefer_threads, block_size=64)
+        @test rng_stream.offset == UInt64(117)
         AK.rand!(rng_stream, s2; prefer_threads, block_size=64)
-        AK.rand!(AK.CounterRNG(UInt64(0x1234); alg=AK.Philox()), s12; prefer_threads, block_size=64)
+        @test rng_stream.offset == UInt64(217)
+
+        rng_once = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox(), offset=UInt64(17))
+        AK.rand!(rng_once, s12; prefer_threads, block_size=64)
         @test vcat(Array(s1), Array(s2)) == Array(s12)
-        @test rng_stream.offset == UInt64(200)
+        @test rng_once.offset == UInt64(217)
+
+        empty = array_from_host(zeros(Float32, 0))
+        stream_offset = rng_stream.offset
+        AK.rand!(rng_stream, empty; prefer_threads, block_size=64)
+        @test rng_stream.offset == stream_offset
+
+        @test AK.reset!(rng_stream) === rng_stream
+        @test rng_stream.offset == UInt64(0)
 
-        rng_imm = ImmutableWithOffsetRNG(UInt64(0x1234), AK.Philox(), UInt64(17))
         y1 = array_from_host(zeros(Float32, 64))
         y2 = array_from_host(zeros(Float32, 64))
-        AK.rand!(rng_imm, y1; prefer_threads, block_size=64)
-        AK.rand!(rng_imm, y2; prefer_threads, block_size=64)
+        AK.rand!(rng_stream, y1; prefer_threads, block_size=64)
+        AK.rand!(AK.CounterRNG(UInt64(0x1234); alg=AK.Philox()), y2; prefer_threads, block_size=64)
         @test Array(y1) == Array(y2)
 
-        @test AK.reset!(rng_stream) === rng_stream
-        @test rng_stream.offset == UInt64(0)
-        @test_throws ArgumentError AK.reset!(rng_no_offset)
-        @test_throws ArgumentError AK.reset!(rng_imm)
+        mutable struct DummyRNG
+            seed::UInt64
+            alg::AK.Philox
+            offset::UInt64
+        end
+
+        rng_dummy = DummyRNG(UInt64(0x1234), AK.Philox(), UInt64(0))
+        x = array_from_host(zeros(Float32, 16))
+        @test_throws MethodError AK.rand!(rng_dummy, x; prefer_threads, block_size=64)
+        @test_throws MethodError AK.reset!(rng_dummy)
     end
 
 
@@ -146,38 +139,38 @@ end
         @test AK._counter_from_index(1) == UInt64(0)
         @test AK._counter_from_index(17) == UInt64(16)
 
-        @test AK.raw_uint_type(UInt8) === UInt32
-        @test AK.raw_uint_type(UInt16) === UInt32
-        @test AK.raw_uint_type(UInt32) === UInt32
-        @test AK.raw_uint_type(Int8) === UInt32
-        @test AK.raw_uint_type(Int16) === UInt32
-        @test AK.raw_uint_type(Int32) === UInt32
+        @test AK._rand_scalar_uint_type(UInt8) === UInt32
+        @test AK._rand_scalar_uint_type(UInt16) === UInt32
+        @test AK._rand_scalar_uint_type(UInt32) === UInt32
+        @test AK._rand_scalar_uint_type(Int8) === UInt32
+        @test AK._rand_scalar_uint_type(Int16) === UInt32
+        @test AK._rand_scalar_uint_type(Int32) === UInt32
         if RUN_FLOAT16_RAND_TESTS
-            @test AK.raw_uint_type(Float16) === UInt32
+            @test AK._rand_scalar_uint_type(Float16) === UInt32
         end
-        @test AK.raw_uint_type(Float32) === UInt32
-        @test AK.raw_uint_type(UInt64) === UInt64
-        @test AK.raw_uint_type(Int64) === UInt64
-        @test AK.raw_uint_type(Bool) === UInt32
+        @test AK._rand_scalar_uint_type(Float32) === UInt32
+        @test AK._rand_scalar_uint_type(UInt64) === UInt64
+        @test AK._rand_scalar_uint_type(Int64) === UInt64
+        @test AK._rand_scalar_uint_type(Bool) === UInt32
         if RUN_FLOAT64_RAND_TESTS
-            @test AK.raw_uint_type(Float64) === UInt64
+            @test AK._rand_scalar_uint_type(Float64) === UInt64
         end
 
-        @test AK.from_uint(UInt8, UInt32(0xabcdef01)) == UInt8(0xab)
-        @test AK.from_uint(UInt16, UInt32(0xabcdef01)) == UInt16(0xabcd)
-        @test AK.from_uint(UInt32, 0b1010 % UInt32) == 0b1010 % UInt32
-        @test AK.from_uint(UInt64, 0b1010 % UInt64) == 0b1010 % UInt64
-        @test AK.from_uint(Int8, UInt32(0xff000000)) == Int8(-1)
-        @test AK.from_uint(Int16, UInt32(0xffff0000)) == Int16(-1)
-        @test AK.from_uint(Int32, 0b11111111111111111111111111111111 % UInt32) == Int32(-1)
-        @test AK.from_uint(
+        @test AK._rand_scalar_from_uint(UInt8, UInt32(0xabcdef01)) == UInt8(0xab)
+        @test AK._rand_scalar_from_uint(UInt16, UInt32(0xabcdef01)) == UInt16(0xabcd)
+        @test AK._rand_scalar_from_uint(UInt32, 0b1010 % UInt32) == 0b1010 % UInt32
+        @test AK._rand_scalar_from_uint(UInt64, 0b1010 % UInt64) == 0b1010 % UInt64
+        @test AK._rand_scalar_from_uint(Int8, UInt32(0xff000000)) == Int8(-1)
+        @test AK._rand_scalar_from_uint(Int16, UInt32(0xffff0000)) == Int16(-1)
+        @test AK._rand_scalar_from_uint(Int32, 0b11111111111111111111111111111111 % UInt32) == Int32(-1)
+        @test AK._rand_scalar_from_uint(
             Int64, 0b1111111111111111111111111111111111111111111111111111111111111111 % UInt64
         ) == Int64(-1)
         if RUN_FLOAT16_RAND_TESTS
-            @test AK.from_uint(Float16, UInt32(0)) == Float16(0)
+            @test AK._rand_scalar_from_uint(Float16, UInt32(0)) == Float16(0)
         end
-        @test AK.from_uint(Bool, UInt32(0)) == false
-        @test AK.from_uint(Bool, UInt32(1)) == true
+        @test AK._rand_scalar_from_uint(Bool, UInt32(0)) == false
+        @test AK._rand_scalar_from_uint(Bool, UInt32(1)) == true
 
         if RUN_FLOAT16_RAND_TESTS
             @test AK.uint32_to_unit_float16(UInt32(0)) == Float16(0)

From a9582a021b9592708bccf939fdf0fee36b59fc3c Mon Sep 17 00:00:00 2001
From: fjbarter <fjbarter@outlook.com>
Date: Wed, 25 Mar 2026 02:39:47 +0000
Subject: [PATCH 13/18] initial randn!

---
 docs/src/api/rand.md             |  10 ++
 prototype/rand/plot/Project.toml |   3 +
 prototype/rand/randn.jl          |  53 +++++++
 src/rand/rand.jl                 |   5 +-
 src/rand/randn.jl                | 258 +++++++++++++++++++++++++++++++
 test/randn.jl                    | 228 +++++++++++++++++++++++++++
 test/runtests.jl                 |   1 +
 7 files changed, 557 insertions(+), 1 deletion(-)
 create mode 100644 prototype/rand/plot/Project.toml
 create mode 100644 prototype/rand/randn.jl
 create mode 100644 src/rand/randn.jl
 create mode 100644 test/randn.jl

diff --git a/docs/src/api/rand.md b/docs/src/api/rand.md
index aab3e5d..2ac9d91 100644
--- a/docs/src/api/rand.md
+++ b/docs/src/api/rand.md
@@ -34,6 +34,11 @@ Supported element types:
 - `Float16`, `Float32`, `Float64`
 - `Bool`
 
+`AK.randn!` fills arrays with standard normal samples and currently supports:
+- `Float16`, `Float32`, `Float64`
+
+`AK.randn!` uses Box-Muller with open-interval uniforms in `(0, 1)` from a branch-free midpoint mapping.
+
 The core of the random number generation produces either a `UInt32` or `UInt64` depending on the width of the requested element type.
 That `UInt` is then either:
 - Unsigned integers: returned as-is or truncated if necessary.
@@ -82,6 +87,10 @@ AK.rand!(rng, v2)
 # Convenience (fresh auto-seeded RNG on each call)
 y = oneArray{Float32}(undef, 1024)
 AK.rand!(y)
+
+# Standard normal samples
+z = oneArray{Float32}(undef, 1024)
+AK.randn!(rng, z)
 ```
 
 ```@docs
@@ -89,4 +98,5 @@ AcceleratedKernels.CounterRNG
 AcceleratedKernels.CounterRNGAlgorithm
 AcceleratedKernels.reset!
 AcceleratedKernels.rand!
+AcceleratedKernels.randn!
 ```
diff --git a/prototype/rand/plot/Project.toml b/prototype/rand/plot/Project.toml
new file mode 100644
index 0000000..a95f271
--- /dev/null
+++ b/prototype/rand/plot/Project.toml
@@ -0,0 +1,3 @@
+[deps]
+AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
diff --git a/prototype/rand/randn.jl b/prototype/rand/randn.jl
new file mode 100644
index 0000000..a21cc08
--- /dev/null
+++ b/prototype/rand/randn.jl
@@ -0,0 +1,53 @@
+using BenchmarkTools
+using CUDA
+
+import AcceleratedKernels as AK
+
+
+const N = 100_000_000
+const GPU_BLOCK_SIZE = 256
+
+const RNG_PHILOX = AK.CounterRNG(0x12345678; alg=AK.Philox())
+
+TestType = Float32
+
+x_cuda = CuArray{TestType}(undef, N)
+x_philox = CuArray{TestType}(undef, N)
+x_cpu = Vector{TestType}(undef, N)
+
+
+function run_cuda_randn!(x)
+    CUDA.randn!(x)
+    CUDA.synchronize()
+    return x
+end
+
+
+function run_ak_randn_gpu!(rng, x)
+    AK.randn!(rng, x; block_size=GPU_BLOCK_SIZE)
+    AK.synchronize(AK.get_backend(x))
+    return x
+end
+
+
+function run_ak_randn_cpu!(rng, x)
+    AK.randn!(rng, x)
+    return x
+end
+
+# warmup compile
+run_cuda_randn!(x_cuda)
+run_ak_randn_gpu!(RNG_PHILOX, x_philox)
+
+println("N = ", N)
+println("CPU threads: ", Threads.nthreads())
+
+println("\nCUDA.randn! benchmark (CuArray{$TestType}, in-place)")
+display(@benchmark run_cuda_randn!($x_cuda))
+
+println("\nAK.randn! Philox benchmark (GPU, CuArray{$TestType})")
+display(@benchmark run_ak_randn_gpu!($RNG_PHILOX, $x_philox))
+
+# println("\nAK.randn! benchmark (CPU, Vector{$TestType}, Philox)")
+# display(@benchmark run_ak_randn_cpu!($RNG_PHILOX, $x_cpu))
+
diff --git a/src/rand/rand.jl b/src/rand/rand.jl
index 833bcae..bafd0bd 100644
--- a/src/rand/rand.jl
+++ b/src/rand/rand.jl
@@ -66,6 +66,9 @@ include("splitmix.jl")
 include("philox.jl")
 include("threefry.jl")
 
+# Normally distributed scalar generators and randn!
+include("randn.jl")
+
 
 
 
@@ -121,7 +124,7 @@ function rand!(
 
     @argcheck T <: ALLOWED_RAND_SCALARS "Unsupported eltype $T. Supported: $(ALLOWED_RAND_SCALARS)"
 
-    # Local isbits captures from potentially mutable rng object
+    # Local isbits captures from mutable rng object
     seed, alg, initial_offset = rng.seed, rng.alg, rng.offset
     
     foreachindex(
diff --git a/src/rand/randn.jl b/src/rand/randn.jl
new file mode 100644
index 0000000..f6acb0e
--- /dev/null
+++ b/src/rand/randn.jl
@@ -0,0 +1,258 @@
+const ALLOWED_RANDN_SCALARS = Union{Float16, Float32, Float64}
+
+const U24_MAX_SAFE_MIDPOINT = UInt32(0x00fffffe)                 # 2^24 - 2
+const U53_MAX_SAFE_MIDPOINT = UInt64(0x001ffffffffffffe)         # 2^53 - 2
+const MIDPOINT_SCALE_F32 = ldexp(Float32(1), -24)                # 2^-24
+const MIDPOINT_SCALE_F64 = ldexp(Float64(1), -53)                # 2^-53
+
+
+
+
+#=
+The below Float constructions are not duplicates of those in utilities.jl - they are needed to
+ensure an interval of (0, 1) as opposed to [0, 1). Achieving this purely logically with midpoint
+mapping means we can avoid a check for producing a 0 (which would normally cause a redraw).
+Avoiding 0 is essential for Box-Muller due to the logarithm functions.
+=#
+
+
+# Convert random UInt32 bits to Float32 in (0, 1) using midpoint mapping on a 24-bit grid.
+@inline function uint32_to_open_unit_float32_midpoint(u::UInt32)::Float32
+    # `min` keeps the top midpoint below one after Float32 rounding.
+    k = min(u >> 8, U24_MAX_SAFE_MIDPOINT)
+    return (Float32(k) + 0.5f0) * MIDPOINT_SCALE_F32
+end
+
+
+# Convert random UInt64 bits to Float64 in (0, 1) using midpoint mapping on a 53-bit grid.
+@inline function uint64_to_open_unit_float64_midpoint(u::UInt64)::Float64
+    # `min` keeps the top midpoint below one after Float64 rounding.
+    k = min(u >> 11, U53_MAX_SAFE_MIDPOINT)
+    return (Float64(k) + 0.5) * MIDPOINT_SCALE_F64
+end
+
+
+# Float16 path reuses Float32 midpoint sampling. Narrowing a Float32 from (0, 1) to Float16
+# is not interval-preserving: the smallest sample (2^-25) rounds to 0 and samples within
+# 2^-12 of 1 round to 1, so the result is clamped back into the open interval.
+@inline function rand_open01(
+    seed::UInt64, alg::CounterRNGAlgorithm, counter::UInt64, ::Type{Float16},
+)::Float16
+    u = Float16(rand_open01(seed, alg, counter, Float32))
+    return clamp(u, nextfloat(zero(Float16)), prevfloat(one(Float16)))
+end
+
+
+@inline function rand_open01(
+    seed::UInt64,
+    alg::CounterRNGAlgorithm,
+    counter::UInt64,
+    ::Type{Float32},
+)::Float32
+    return uint32_to_open_unit_float32_midpoint(rand_uint(seed, alg, counter, UInt32))
+end
+
+
+@inline function rand_open01(
+    seed::UInt64,
+    alg::CounterRNGAlgorithm,
+    counter::UInt64,
+    ::Type{Float64},
+)::Float64
+    return uint64_to_open_unit_float64_midpoint(rand_uint(seed, alg, counter, UInt64))
+end
+
+
+@inline function rand_open01(::UInt64, ::CounterRNGAlgorithm, ::UInt64, ::Type{T}) where {T}
+    throw(ArgumentError(
+        "Unsupported open-interval random type $(T). Supported: $(ALLOWED_RANDN_SCALARS)"
+    ))
+end
+
+
+@inline function randn_pair(
+    seed::UInt64,
+    alg::CounterRNGAlgorithm,
+    pair_counter::UInt64,
+    ::Type{Float16},
+)::Tuple{Float16, Float16}
+    z0, z1 = randn_pair(seed, alg, pair_counter, Float32)
+    return Float16(z0), Float16(z1)
+end
+
+
+@inline function randn_pair(
+    seed::UInt64,
+    alg::CounterRNGAlgorithm,
+    pair_counter::UInt64,
+    ::Type{Float32},
+)::Tuple{Float32, Float32}
+    u = rand_uint(seed, alg, pair_counter, UInt64)
+    u1 = uint32_to_open_unit_float32_midpoint(_u32_lo(u))
+    u2 = uint32_to_open_unit_float32_midpoint(_u32_hi(u))
+    radius = sqrt(-2.0f0 * log(u1))
+    theta = Float32(2pi) * u2
+    stheta, ctheta = sincos(theta)
+    return radius * ctheta, radius * stheta
+end
+
+
+@inline function randn_pair(
+    seed::UInt64,
+    alg::CounterRNGAlgorithm,
+    pair_counter::UInt64,
+    ::Type{Float64},
+)::Tuple{Float64, Float64}
+    c0 = pair_counter << 1
+    u1 = rand_open01(seed, alg, c0, Float64)
+    u2 = rand_open01(seed, alg, c0 + UInt64(1), Float64)
+    radius = sqrt(-2.0 * log(u1))
+    theta = Float64(2pi) * u2
+    stheta, ctheta = sincos(theta)
+    return radius * ctheta, radius * stheta
+end
+
+
+@inline function randn_pair(::UInt64, ::CounterRNGAlgorithm, ::UInt64, ::Type{T}) where {T}
+    throw(ArgumentError(
+        "Unsupported normal random type $(T). Supported: $(ALLOWED_RANDN_SCALARS)"
+    ))
+end
+
+
+@inline function randn_scalar(
+    seed::UInt64,
+    alg::CounterRNGAlgorithm,
+    normal_counter::UInt64,
+    ::Type{T},
+)::T where {T <: ALLOWED_RANDN_SCALARS}
+    pair_counter = normal_counter >> 1
+    z0, z1 = randn_pair(seed, alg, pair_counter, T)
+    return iszero(normal_counter & UInt64(0x1)) ? z0 : z1
+end
+
+
+@inline function randn_scalar(::UInt64, ::CounterRNGAlgorithm, ::UInt64, ::Type{T}) where {T}
+    throw(ArgumentError(
+        "Unsupported normal random scalar type $(T). Supported: $(ALLOWED_RANDN_SCALARS)"
+    ))
+end
+
+
+"""
+    randn!(
+        rng::CounterRNG,
+        v::AbstractArray{T},
+        backend::Backend=get_backend(v);
+
+        # CPU settings
+        max_tasks::Int=Threads.nthreads(),
+        min_elems::Int=1,
+
+        # Implementation choice
+        prefer_threads::Bool=true,
+
+        # GPU settings
+        block_size::Int=256,
+    ) where {T <: AbstractFloat}
+
+Fill `v` in-place with pseudo-random samples from a standard normal distribution.
+
+For `v[i]`, the normal stream counter is `rng.offset + UInt64(i - 1)` in linear indexing order.
+Values are generated using Box-Muller from midpoint-open uniforms in `(0, 1)`.
+
+After filling `v`, `rng.offset` advances by `length(v)`.
+"""
+function randn!(
+    rng::CounterRNG,
+    v::AbstractArray{T},
+    backend::Backend=get_backend(v);
+
+    # CPU settings
+    max_tasks::Int=Threads.nthreads(),
+    min_elems::Int=1,
+    prefer_threads::Bool=true,
+
+    # GPU settings
+    block_size::Int=256,
+) where T
+
+    @argcheck T <: ALLOWED_RANDN_SCALARS "Unsupported eltype $T. Supported: $(ALLOWED_RANDN_SCALARS)"
+
+    isempty(v) && return v
+
+    # Local isbits captures from mutable rng object.
+    seed, alg, initial_offset = rng.seed, rng.alg, rng.offset
+    len = length(v)
+    pair_start = initial_offset >> 1
+
+    # Even stream offset is the common path and maps pair `i` to output indices `(2i-1, 2i)`.
+    if iszero(initial_offset & UInt64(0x1))
+        pair_count = cld(len, 2)
+        pair_indices = Base.OneTo(pair_count)
+
+        # Fully branch-free hot path when both offset and length are even.
+        if iseven(len)
+            foreachindex(
+                pair_indices, backend;
+                max_tasks, min_elems, prefer_threads, block_size,
+            ) do i
+                pair_counter = pair_start + _counter_from_index(i)
+                z0, z1 = randn_pair(seed, alg, pair_counter, T)
+                i0 = (i << 1) - 1
+                @inbounds v[i0] = z0
+                @inbounds v[i0 + 1] = z1
+            end
+        else
+            foreachindex(
+                pair_indices, backend;
+                max_tasks, min_elems, prefer_threads, block_size,
+            ) do i
+                pair_counter = pair_start + _counter_from_index(i)
+                z0, z1 = randn_pair(seed, alg, pair_counter, T)
+                i0 = (i << 1) - 1
+                @inbounds v[i0] = z0
+                i1 = i0 + 1
+
+                if i1 <= len
+                    @inbounds v[i1] = z1
+                end
+            end
+        end
+    else
+        # Odd stream offset shifts pair `i` to `(2i-2, 2i-1)`; only the first z0 is out of range.
+        pair_count = cld(len + 1, 2)
+        pair_indices = Base.OneTo(pair_count)
+
+        foreachindex(
+            pair_indices, backend;
+            max_tasks, min_elems, prefer_threads, block_size,
+        ) do i
+            pair_counter = pair_start + _counter_from_index(i)
+            z0, z1 = randn_pair(seed, alg, pair_counter, T)
+            i0 = (i << 1) - 2
+
+            if i0 >= 1
+                @inbounds v[i0] = z0
+            end
+
+            i1 = i0 + 1
+            if i1 <= len
+                @inbounds v[i1] = z1
+            end
+        end
+    end
+
+    rng.offset += UInt64(len)
+
+    v
+end
+
+
+function randn!(
+    v::AbstractArray,
+    args...;
+    kwargs...,
+)
+    return randn!(CounterRNG(), v, args...; kwargs...)
+end
diff --git a/test/randn.jl b/test/randn.jl
new file mode 100644
index 0000000..c0e9c56
--- /dev/null
+++ b/test/randn.jl
@@ -0,0 +1,228 @@
+const RANDN_ALGS = (AK.SplitMix64(), AK.Philox(), AK.Threefry())
+const RANDN_FLOAT_TYPES_BACKEND = IS_CPU_BACKEND ? (Float16, Float32, Float64) : (Float32,)
+
+
+_is_finite(v) = all(isfinite, v)
+
+
+function _randn_fill_reference!(
+    rng,
+    x::AbstractArray{T};
+    counter_offset::UInt64=UInt64(0),
+) where {T <: AK.ALLOWED_RANDN_SCALARS}
+    @inbounds for i in eachindex(x)
+        x[i] = AK.randn_scalar(rng.seed, rng.alg, counter_offset + UInt64(i - one(i)), T)
+    end
+    return x
+end
+
+
+function _assert_randn_matches_reference!(rng, x; kwargs...)
+    counter_offset = rng.offset
+    AK.randn!(rng, x; kwargs...)
+    ref = zeros(eltype(x), size(x))
+    _randn_fill_reference!(rng, ref; counter_offset)
+    @test Array(x) == ref
+    return x
+end
+
+
+@testset "randn" begin
+    @testset "open interval helpers" begin
+        @test 0.0f0 < AK.uint32_to_open_unit_float32_midpoint(UInt32(0)) < 1.0f0
+        @test 0.0f0 < AK.uint32_to_open_unit_float32_midpoint(typemax(UInt32)) < 1.0f0
+        @test 0.0 < AK.uint64_to_open_unit_float64_midpoint(UInt64(0)) < 1.0
+        @test 0.0 < AK.uint64_to_open_unit_float64_midpoint(typemax(UInt64)) < 1.0
+    end
+
+
+    @testset "rand_open01 and randn_scalar" begin
+        seed = UInt64(0x123456789abcdef)
+        for alg in RANDN_ALGS
+            for c in (UInt64(0), UInt64(1), UInt64(17), UInt64(1023))
+                u32 = AK.rand_open01(seed, alg, c, Float32)
+                @test 0.0f0 < u32 < 1.0f0
+                if IS_CPU_BACKEND
+                    u64 = AK.rand_open01(seed, alg, c, Float64)
+                    @test 0.0 < u64 < 1.0
+                end
+            end
+
+            for T in RANDN_FLOAT_TYPES_BACKEND
+                s0 = AK.randn_scalar(seed, alg, UInt64(42), T)
+                s1 = AK.randn_scalar(seed, alg, UInt64(43), T)
+                @test s0 isa T
+                @test s1 isa T
+                @test isfinite(s0)
+                @test isfinite(s1)
+                @test s0 == AK.randn_scalar(seed, alg, UInt64(42), T)
+                @test s1 == AK.randn_scalar(seed, alg, UInt64(43), T)
+
+                p0, p1 = AK.randn_pair(seed, alg, UInt64(21), T)
+                @test AK.randn_scalar(seed, alg, UInt64(42), T) == p0
+                @test AK.randn_scalar(seed, alg, UInt64(43), T) == p1
+            end
+        end
+
+        @test_throws ArgumentError AK.randn_scalar(seed, AK.Philox(), UInt64(0), UInt32)
+    end
+
+
+    @testset "randn! explicit rng" begin
+        lengths = (0, 1, 31, 32, 33, 257, 1024)
+
+        for alg in RANDN_ALGS
+            rng = AK.CounterRNG(0x123456789abcdef; alg)
+
+            for T in RANDN_FLOAT_TYPES_BACKEND
+                for len in lengths
+                    x = array_from_host(zeros(T, len))
+                    _assert_randn_matches_reference!(rng, x; prefer_threads, block_size=64)
+                    @test _is_finite(Array(x))
+                end
+            end
+
+            for T in RANDN_FLOAT_TYPES_BACKEND
+                x1 = array_from_host(zeros(T, 2048))
+                x2 = array_from_host(zeros(T, 2048))
+                rng1 = AK.CounterRNG(rng.seed; alg=rng.alg)
+                rng2 = AK.CounterRNG(rng.seed; alg=rng.alg)
+                AK.randn!(rng1, x1; prefer_threads, block_size=64)
+                AK.randn!(rng2, x2; prefer_threads, block_size=257)
+                @test Array(x1) == Array(x2)
+            end
+
+            for T in RANDN_FLOAT_TYPES_BACKEND
+                rng1 = AK.CounterRNG(rng.seed; alg=rng.alg)
+                rng2 = AK.CounterRNG(rng.seed + UInt64(1); alg=rng.alg)
+                x1 = array_from_host(zeros(T, 2048))
+                x2 = array_from_host(zeros(T, 2048))
+                AK.randn!(rng1, x1; prefer_threads, block_size=64)
+                AK.randn!(rng2, x2; prefer_threads, block_size=64)
+                @test Array(x1) != Array(x2)
+            end
+        end
+    end
+
+
+    @testset "counter rng offset behavior" begin
+        rng_stream = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox(), offset=UInt64(17))
+        s1 = array_from_host(zeros(Float32, 99))
+        s2 = array_from_host(zeros(Float32, 101))
+        s12 = array_from_host(zeros(Float32, 200))
+        AK.randn!(rng_stream, s1; prefer_threads, block_size=64)
+        @test rng_stream.offset == UInt64(116)
+        AK.randn!(rng_stream, s2; prefer_threads, block_size=64)
+        @test rng_stream.offset == UInt64(217)
+
+        rng_once = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox(), offset=UInt64(17))
+        AK.randn!(rng_once, s12; prefer_threads, block_size=64)
+        @test vcat(Array(s1), Array(s2)) == Array(s12)
+        @test rng_once.offset == UInt64(217)
+
+        empty = array_from_host(zeros(Float32, 0))
+        stream_offset = rng_stream.offset
+        AK.randn!(rng_stream, empty; prefer_threads, block_size=64)
+        @test rng_stream.offset == stream_offset
+
+        @test AK.reset!(rng_stream) === rng_stream
+        @test rng_stream.offset == UInt64(0)
+
+        y1 = array_from_host(zeros(Float32, 64))
+        y2 = array_from_host(zeros(Float32, 64))
+        AK.randn!(rng_stream, y1; prefer_threads, block_size=64)
+        AK.randn!(AK.CounterRNG(UInt64(0x1234); alg=AK.Philox()), y2; prefer_threads, block_size=64)
+        @test Array(y1) == Array(y2)
+    end
+
+
+    @testset "reset!" begin
+        rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox())
+        x1 = array_from_host(zeros(Float32, 512))
+        x2 = array_from_host(zeros(Float32, 512))
+
+        AK.randn!(rng, x1; prefer_threads, block_size=64)
+        @test rng.offset == UInt64(512)
+        @test AK.reset!(rng) === rng
+        @test rng.offset == UInt64(0)
+        AK.randn!(rng, x2; prefer_threads, block_size=64)
+
+        @test Array(x1) == Array(x2)
+    end
+
+
+    @testset "randn! n-dimensional and views" begin
+        rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox())
+
+        for T in RANDN_FLOAT_TYPES_BACKEND
+            xnd = array_from_host(zeros(T, 7, 11, 5))
+            _assert_randn_matches_reference!(rng, xnd; prefer_threads, block_size=128)
+        end
+
+        if IS_CPU_BACKEND
+            for T in RANDN_FLOAT_TYPES_BACKEND
+                base = zeros(T, 64)
+                view_x = @view base[2:2:end]
+                AK.randn!(
+                    rng, view_x;
+                    max_tasks=Threads.nthreads(),
+                    min_elems=1,
+                    prefer_threads=true
+                )
+                ref_view = zeros(T, length(view_x))
+                _randn_fill_reference!(
+                    rng, ref_view;
+                    counter_offset=rng.offset - UInt64(length(view_x)),
+                )
+                @test collect(view_x) == ref_view
+            end
+        end
+    end
+
+
+    @testset "randn! convenience" begin
+        ref1 = array_from_host(zeros(Float32, 1024))
+        ref2 = array_from_host(zeros(Float32, 1024))
+        x1 = array_from_host(zeros(Float32, 1024))
+        x2 = array_from_host(zeros(Float32, 1024))
+
+        Random.seed!(0xabcdef)
+        seed1 = Random.rand(Random.default_rng(), UInt64)
+        AK.randn!(AK.CounterRNG(seed1; alg=AK.Philox()), ref1; prefer_threads, block_size=64)
+        seed2 = Random.rand(Random.default_rng(), UInt64)
+        AK.randn!(AK.CounterRNG(seed2; alg=AK.Philox()), ref2; prefer_threads, block_size=64)
+
+        Random.seed!(0xabcdef)
+        AK.randn!(x1; prefer_threads, block_size=64)
+        AK.randn!(x2; prefer_threads, block_size=64)
+        @test Array(x1) == Array(ref1)
+        @test Array(x2) == Array(ref2)
+
+        x_bad = zeros(UInt32, 16)
+        @test_throws ArgumentError AK.randn!(x_bad; prefer_threads)
+        @test_throws ArgumentError AK.randn!(AK.CounterRNG(0x1), x_bad; prefer_threads)
+    end
+
+
+    @testset "moments sanity" begin
+        n = 200_000
+        rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox())
+
+        for T in RANDN_FLOAT_TYPES_BACKEND
+            x = array_from_host(zeros(T, n))
+            AK.randn!(rng, x; prefer_threads, block_size=128)
+            xa = Float64.(Array(x))
+
+            m = sum(xa) / length(xa)
+            v = sum((xi - m)^2 for xi in xa) / length(xa)
+
+            if T === Float16
+                @test abs(m) < 0.1
+                @test abs(v - one(v)) < 0.15
+            else
+                @test abs(m) < 0.01
+                @test abs(v - one(v)) < 0.03
+            end
+        end
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index d1a2d69..5d3a6ad 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -76,6 +76,7 @@ include("partition.jl")
 include("looping.jl")
 include("map.jl")
 include("rand.jl")
+include("randn.jl")
 include("sort.jl")
 include("reduce.jl")
 include("accumulate.jl")

From 61aa11dc582ac474df2e05bda3cbad7d6f34c208 Mon Sep 17 00:00:00 2001
From: fjbarter <fjbarter@outlook.com>
Date: Thu, 26 Mar 2026 13:23:44 +0000
Subject: [PATCH 14/18] ensure compile-time initial index bias, now beating
 CUDA for both odd and even offsets + array lengths. Added rand() and randn()
 convenience constructors using KernelAbstractions.allocate based on the
 provided backend, Type, and dims

---
 prototype/rand/Project.toml |   1 +
 prototype/rand/randn.jl     |   2 +-
 src/rand/rand.jl            |  56 ++++++++++++
 src/rand/randn.jl           | 167 ++++++++++++++++++++++++------------
 src/rand/utilities.jl       |  19 ++++
 test/rand.jl                |  40 +++++++++
 test/randn.jl               | 118 +++++++++++++++++--------
 7 files changed, 309 insertions(+), 94 deletions(-)

diff --git a/prototype/rand/Project.toml b/prototype/rand/Project.toml
index 7757b4d..675e6c7 100644
--- a/prototype/rand/Project.toml
+++ b/prototype/rand/Project.toml
@@ -3,5 +3,6 @@ AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 PProf = "e4faabce-9ead-11e9-39d9-4379958e3056"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
diff --git a/prototype/rand/randn.jl b/prototype/rand/randn.jl
index a21cc08..8370d91 100644
--- a/prototype/rand/randn.jl
+++ b/prototype/rand/randn.jl
@@ -7,7 +7,7 @@ import AcceleratedKernels as AK
 const N = 100_000_000
 const GPU_BLOCK_SIZE = 256
 
-const RNG_PHILOX = AK.CounterRNG(0x12345678; alg=AK.Philox())
+const RNG_PHILOX = AK.CounterRNG(0x12345678; alg=AK.Philox(), offset=0x0)
 
 TestType = Float32
 
diff --git a/src/rand/rand.jl b/src/rand/rand.jl
index bafd0bd..c36a61e 100644
--- a/src/rand/rand.jl
+++ b/src/rand/rand.jl
@@ -147,3 +147,59 @@ function rand!(
 )
     return rand!(CounterRNG(), v, args...; kwargs...)
 end
+
+
+"""
+    rand(
+        rng::CounterRNG,
+        backend::Backend,
+        ::Type{T},
+        dims::Integer...;
+        max_tasks::Int=Threads.nthreads(),
+        min_elems::Int=1,
+        prefer_threads::Bool=true,
+        block_size::Int=256,
+    ) where T
+
+Allocate an array of element type `T` on `backend` with shape `dims`, fill it in-place via
+[`rand!`](@ref), and return it.
+"""
+function rand(
+    rng::CounterRNG,
+    backend::Backend,
+    ::Type{T},
+    dims::Integer...;
+
+    # CPU settings
+    max_tasks::Int=Threads.nthreads(),
+    min_elems::Int=1,
+    prefer_threads::Bool=true,
+
+    # GPU settings
+    block_size::Int=256,
+) where T
+    return _allocate_and_fill(
+        rand!, rng, backend, T, dims...;
+        max_tasks, min_elems, prefer_threads, block_size,
+    )
+end
+
+
+function rand(
+    backend::Backend,
+    ::Type{T},
+    dims::Integer...;
+    
+    # CPU settings
+    max_tasks::Int=Threads.nthreads(),
+    min_elems::Int=1,
+    prefer_threads::Bool=true,
+
+    # GPU settings
+    block_size::Int=256,
+) where T
+    return rand(
+        CounterRNG(), backend, T, dims...;
+        max_tasks, min_elems, prefer_threads, block_size,
+    )
+end
diff --git a/src/rand/randn.jl b/src/rand/randn.jl
index f6acb0e..f3cce14 100644
--- a/src/rand/randn.jl
+++ b/src/rand/randn.jl
@@ -139,6 +139,59 @@ end
 end
 
 
+# `Val{ODD}` keeps the offset parity in the type domain so each specialization (`ODD==0` /
+# `ODD==1`) can fold the index bias at compile time; pair `i` is stored at `2i + bias`.
+# - `Val{0}` => even offset: pair `i` writes indices `(2i-1, 2i)`, so the bias is `-1`
+# - `Val{1}` => odd offset: `v[1]` is filled separately and pair `i` writes `(2i, 2i+1)`, bias `0`
+@inline _randn_i0_bias(::Val{0}) = -1
+@inline _randn_i0_bias(::Val{1}) = 0
+
+
+@inline function _randn_core!(
+    v::AbstractArray{T}, seed, alg, initial_offset,
+    backend, max_tasks, min_elems, prefer_threads, block_size,
+    ::Val{ODD},
+) where {T, ODD}
+    # Fill `v[i]` from counter `initial_offset + (i - 1)`; `ODD` is the parity of `initial_offset`.
+    len = length(v)
+    prefix_len = ODD
+    # Empty `v`: exit before the `@inbounds` prefix/tail stores, which an odd offset would hit.
+    len == 0 && return v
+
+    # If offset is odd, need to individually handle the first element.
+    prefix_len == 1 && @allowscalar @inbounds v[1] = randn_scalar(seed, alg, initial_offset, T)
+
+    # Stream is now even-aligned, so can foreachindex through the pairs.
+    pair_start = (initial_offset + UInt64(prefix_len)) >> 1
+
+    # Capture `Val(ODD)` into the closure so bias stays a compile-time constant inside the loop.
+    odd_val = Val(ODD)
+    i0_bias = _randn_i0_bias(odd_val)
+    remaining_len = len - prefix_len
+    pair_count = remaining_len >> 1
+    if pair_count > 0
+        foreachindex(
+            Base.OneTo(pair_count), backend;
+            max_tasks, min_elems, prefer_threads, block_size,
+        ) do i
+            pair_counter = pair_start + _counter_from_index(i)
+            z0, z1 = randn_pair(seed, alg, pair_counter, T)
+            i0 = (i << 1) + _randn_i0_bias(odd_val)
+            @inbounds v[i0] = z0
+            @inbounds v[i0 + 1] = z1
+        end
+    end
+
+    # If an extra element remains after pair writing, fill it individually.
+    tail_index = (pair_count << 1) + i0_bias + 2
+    if tail_index <= len
+        tail_counter = initial_offset + UInt64(tail_index - 1)
+        @allowscalar @inbounds v[tail_index] = randn_scalar(seed, alg, tail_counter, T)
+    end
+    return v
+end
+
+
 """
     randn!(
         rng::CounterRNG,
@@ -183,67 +236,19 @@ function randn!(
 
     # Local isbits captures from mutable rng object.
     seed, alg, initial_offset = rng.seed, rng.alg, rng.offset
-    len = length(v)
-    pair_start = initial_offset >> 1
-
-    # Even stream offset is the common path and maps pair `i` to output indices `(2i-1, 2i)`.
-    if iszero(initial_offset & UInt64(0x1))
-        pair_count = cld(len, 2)
-        pair_indices = Base.OneTo(pair_count)
-
-        # Fully branch-free hot path when both offset and length are even.
-        if iseven(len)
-            foreachindex(
-                pair_indices, backend;
-                max_tasks, min_elems, prefer_threads, block_size,
-            ) do i
-                pair_counter = pair_start + _counter_from_index(i)
-                z0, z1 = randn_pair(seed, alg, pair_counter, T)
-                i0 = (i << 1) - 1
-                @inbounds v[i0] = z0
-                @inbounds v[i0 + 1] = z1
-            end
-        else
-            foreachindex(
-                pair_indices, backend;
-                max_tasks, min_elems, prefer_threads, block_size,
-            ) do i
-                pair_counter = pair_start + _counter_from_index(i)
-                z0, z1 = randn_pair(seed, alg, pair_counter, T)
-                i0 = (i << 1) - 1
-                @inbounds v[i0] = z0
-                i1 = i0 + 1
-
-                if i1 <= len
-                    @inbounds v[i1] = z1
-                end
-            end
-        end
-    else
-        # Odd stream offset shifts pair `i` to `(2i-2, 2i-1)`; only the first z0 is out of range.
-        pair_count = cld(len + 1, 2)
-        pair_indices = Base.OneTo(pair_count)
-
-        foreachindex(
-            pair_indices, backend;
-            max_tasks, min_elems, prefer_threads, block_size,
-        ) do i
-            pair_counter = pair_start + _counter_from_index(i)
-            z0, z1 = randn_pair(seed, alg, pair_counter, T)
-            i0 = (i << 1) - 2
 
-            if i0 >= 1
-                @inbounds v[i0] = z0
-            end
+    core_args = (
+        v, seed, alg, initial_offset, backend, max_tasks, min_elems, prefer_threads, block_size
+    )
 
-            i1 = i0 + 1
-            if i1 <= len
-                @inbounds v[i1] = z1
-            end
-        end
+    # Dispatch depending on required initial index bias
+    if iseven(initial_offset)
+        _randn_core!(core_args..., Val(0))
+    else
+        _randn_core!(core_args..., Val(1))
     end
 
-    rng.offset += UInt64(len)
+    rng.offset += UInt64(length(v))
 
     v
 end
@@ -256,3 +261,51 @@ function randn!(
 )
     return randn!(CounterRNG(), v, args...; kwargs...)
 end
+
+
+"""
+    randn(
+        rng::CounterRNG,
+        backend::Backend,
+        ::Type{T},
+        dims::Integer...;
+        max_tasks::Int=Threads.nthreads(),
+        min_elems::Int=1,
+        prefer_threads::Bool=true,
+        block_size::Int=256,
+    ) where T
+
+Allocate an array of element type `T` on `backend` with shape `dims`, fill it in-place via
+[`randn!`](@ref), and return it.
+"""
+function randn(
+    rng::CounterRNG,
+    backend::Backend,
+    ::Type{T},
+    dims::Integer...;
+    max_tasks::Int=Threads.nthreads(),
+    min_elems::Int=1,
+    prefer_threads::Bool=true,
+    block_size::Int=256,
+) where T
+    return _allocate_and_fill(
+        randn!, rng, backend, T, dims...;
+        max_tasks, min_elems, prefer_threads, block_size,
+    )
+end
+
+
+function randn(
+    backend::Backend,
+    ::Type{T},
+    dims::Integer...;
+    max_tasks::Int=Threads.nthreads(),
+    min_elems::Int=1,
+    prefer_threads::Bool=true,
+    block_size::Int=256,
+) where T
+    return randn(
+        CounterRNG(), backend, T, dims...;
+        max_tasks, min_elems, prefer_threads, block_size,
+    )
+end
diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl
index f2159c5..b2c60db 100644
--- a/src/rand/utilities.jl
+++ b/src/rand/utilities.jl
@@ -15,6 +15,25 @@
 @inline _counter_from_index(i)::UInt64 = UInt64(i - one(i))
 
 
+# Shared allocation + fill helper for rand/randn convenience constructors.
+@inline function _allocate_and_fill(
+    fill!,
+    rng::CounterRNG,
+    backend::Backend,
+    ::Type{T},
+    dims::Integer...;
+    max_tasks::Int=Threads.nthreads(),
+    min_elems::Int=1,
+    prefer_threads::Bool=true,
+    block_size::Int=256,
+) where {T}
+    dims_int = Base.map(Int, dims)
+    v = KernelAbstractions.allocate(backend, T, dims_int)
+    fill!(rng, v, backend; max_tasks, min_elems, prefer_threads, block_size)
+    return v
+end
+
+
 # Internal scalar eltypes currently supported by rand!.
 const ALLOWED_RAND_SCALARS = Union{
     UInt8, UInt16, UInt32, UInt64,
diff --git a/test/rand.jl b/test/rand.jl
index 90e285a..d3eacca 100644
--- a/test/rand.jl
+++ b/test/rand.jl
@@ -362,4 +362,44 @@ end
         @test_throws ArgumentError AK.rand!(x_bad; prefer_threads)
         @test_throws ArgumentError AK.rand!(AK.CounterRNG(0x1), x_bad; prefer_threads)
     end
+
+
+    @testset "rand allocation convenience" begin
+        rng = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox())
+        y = AK.rand(rng, BACKEND, Float32, Int32(6), UInt16(7); prefer_threads, block_size=64)
+        @test size(y) == (6, 7)
+        @test eltype(y) === Float32
+        @test _is_unit_interval(Array(y))
+        @test rng.offset == UInt64(length(y))
+
+        rng_alloc = AK.CounterRNG(UInt64(0x55); alg=AK.Philox())
+        rng_fill = AK.CounterRNG(UInt64(0x55); alg=AK.Philox())
+        y_alloc = AK.rand(rng_alloc, BACKEND, Float32, 128; prefer_threads, block_size=64)
+        y_fill = array_from_host(zeros(Float32, 128))
+        AK.rand!(rng_fill, y_fill; prefer_threads, block_size=64)
+        @test Array(y_alloc) == Array(y_fill)
+        @test rng_alloc.offset == rng_fill.offset == UInt64(128)
+
+        # Warm-up first call path so one-time compilation/backend init does not perturb RNG checks.
+        AK.rand(BACKEND, Float32, 1; prefer_threads, block_size=64)
+
+        # Auto-seeded constructor should match explicit seed capture from default RNG.
+        Random.seed!(0x9abc)
+        seed = Random.rand(Random.default_rng(), UInt64)
+        ref = AK.rand(AK.CounterRNG(seed; alg=AK.Philox()), BACKEND, Float32, 64; prefer_threads, block_size=64)
+        Random.seed!(0x9abc)
+        x = AK.rand(BACKEND, Float32, 64; prefer_threads, block_size=64)
+        @test Array(x) == Array(ref)
+
+        # Reseeding should reproduce the same auto-seeded draw.
+        Random.seed!(0x7777)
+        x1 = AK.rand(BACKEND, Float32, 64; prefer_threads, block_size=64)
+        Random.seed!(0x7777)
+        x2 = AK.rand(BACKEND, Float32, 64; prefer_threads, block_size=64)
+        @test Array(x1) == Array(x2)
+
+        @test_throws ArgumentError AK.rand(AK.CounterRNG(0x1), BACKEND, UInt128, 16; prefer_threads)
+        @test_throws MethodError AK.rand(AK.CounterRNG(0x1), BACKEND, Float32, 16; prefer_threads, bad=:kwarg)
+        @test_throws MethodError AK.rand(BACKEND, Float32, 16; prefer_threads, bad=:kwarg)
+    end
 end
diff --git a/test/randn.jl b/test/randn.jl
index c0e9c56..82261c8 100644
--- a/test/randn.jl
+++ b/test/randn.jl
@@ -1,8 +1,12 @@
 const RANDN_ALGS = (AK.SplitMix64(), AK.Philox(), AK.Threefry())
 const RANDN_FLOAT_TYPES_BACKEND = IS_CPU_BACKEND ? (Float16, Float32, Float64) : (Float32,)
+const RANDN_LENGTHS = (0, 1, 2, 31, 32, 33, 257, 1024)
 
 
-_is_finite(v) = all(isfinite, v)
+_all_finite(v) = all(isfinite, v)
+_randn_reference_atol(::Type{Float16}) = 16 * eps(Float16)
+_randn_reference_atol(::Type{Float32}) = 64 * eps(Float32)
+_randn_reference_atol(::Type{Float64}) = 64 * eps(Float64)
 
 
 function _randn_fill_reference!(
@@ -22,28 +26,40 @@ function _assert_randn_matches_reference!(rng, x; kwargs...)
     AK.randn!(rng, x; kwargs...)
     ref = zeros(eltype(x), size(x))
     _randn_fill_reference!(rng, ref; counter_offset)
-    @test Array(x) == ref
+    xa = Array(x)
+
+    if IS_CPU_BACKEND
+        @test xa == ref
+    else
+        # randn uses Box-Muller (`log`, `sqrt`, `sincos`), and GPU libm implementations are not
+        # bit-identical to CPU scalar libm. Stream/counter mapping is still deterministic, but the
+        # final Float32 values can differ by a few ULP, so we use a tight absolute tolerance here.
+        atol = _randn_reference_atol(eltype(xa))
+        @test all(isapprox.(xa, ref; rtol=zero(atol), atol))
+    end
+
     return x
 end
 
 
 @testset "randn" begin
-    @testset "open interval helpers" begin
+    @testset "scalar helpers" begin
         @test 0.0f0 < AK.uint32_to_open_unit_float32_midpoint(UInt32(0)) < 1.0f0
         @test 0.0f0 < AK.uint32_to_open_unit_float32_midpoint(typemax(UInt32)) < 1.0f0
-        @test 0.0 < AK.uint64_to_open_unit_float64_midpoint(UInt64(0)) < 1.0
-        @test 0.0 < AK.uint64_to_open_unit_float64_midpoint(typemax(UInt64)) < 1.0
-    end
 
+        if IS_CPU_BACKEND
+            @test 0.0 < AK.uint64_to_open_unit_float64_midpoint(UInt64(0)) < 1.0
+            @test 0.0 < AK.uint64_to_open_unit_float64_midpoint(typemax(UInt64)) < 1.0
+        end
 
-    @testset "rand_open01 and randn_scalar" begin
         seed = UInt64(0x123456789abcdef)
         for alg in RANDN_ALGS
-            for c in (UInt64(0), UInt64(1), UInt64(17), UInt64(1023))
-                u32 = AK.rand_open01(seed, alg, c, Float32)
+            for counter in (UInt64(0), UInt64(1), UInt64(17), UInt64(1023))
+                u32 = AK.rand_open01(seed, alg, counter, Float32)
                 @test 0.0f0 < u32 < 1.0f0
+
                 if IS_CPU_BACKEND
-                    u64 = AK.rand_open01(seed, alg, c, Float64)
+                    u64 = AK.rand_open01(seed, alg, counter, Float64)
                     @test 0.0 < u64 < 1.0
                 end
             end
@@ -51,6 +67,7 @@ end
             for T in RANDN_FLOAT_TYPES_BACKEND
                 s0 = AK.randn_scalar(seed, alg, UInt64(42), T)
                 s1 = AK.randn_scalar(seed, alg, UInt64(43), T)
+
                 @test s0 isa T
                 @test s1 isa T
                 @test isfinite(s0)
@@ -59,26 +76,25 @@ end
                 @test s1 == AK.randn_scalar(seed, alg, UInt64(43), T)
 
                 p0, p1 = AK.randn_pair(seed, alg, UInt64(21), T)
-                @test AK.randn_scalar(seed, alg, UInt64(42), T) == p0
-                @test AK.randn_scalar(seed, alg, UInt64(43), T) == p1
+                @test p0 == AK.randn_scalar(seed, alg, UInt64(42), T)
+                @test p1 == AK.randn_scalar(seed, alg, UInt64(43), T)
             end
         end
 
+        @test_throws ArgumentError AK.rand_open01(seed, AK.Philox(), UInt64(0), UInt32)
         @test_throws ArgumentError AK.randn_scalar(seed, AK.Philox(), UInt64(0), UInt32)
     end
 
 
     @testset "randn! explicit rng" begin
-        lengths = (0, 1, 31, 32, 33, 257, 1024)
-
         for alg in RANDN_ALGS
             rng = AK.CounterRNG(0x123456789abcdef; alg)
 
             for T in RANDN_FLOAT_TYPES_BACKEND
-                for len in lengths
+                for len in RANDN_LENGTHS
                     x = array_from_host(zeros(T, len))
                     _assert_randn_matches_reference!(rng, x; prefer_threads, block_size=64)
-                    @test _is_finite(Array(x))
+                    @test _all_finite(Array(x))
                 end
             end
 
@@ -87,16 +103,18 @@ end
                 x2 = array_from_host(zeros(T, 2048))
                 rng1 = AK.CounterRNG(rng.seed; alg=rng.alg)
                 rng2 = AK.CounterRNG(rng.seed; alg=rng.alg)
+
                 AK.randn!(rng1, x1; prefer_threads, block_size=64)
                 AK.randn!(rng2, x2; prefer_threads, block_size=257)
                 @test Array(x1) == Array(x2)
             end
 
             for T in RANDN_FLOAT_TYPES_BACKEND
-                rng1 = AK.CounterRNG(rng.seed; alg=rng.alg)
-                rng2 = AK.CounterRNG(rng.seed + UInt64(1); alg=rng.alg)
                 x1 = array_from_host(zeros(T, 2048))
                 x2 = array_from_host(zeros(T, 2048))
+                rng1 = AK.CounterRNG(rng.seed; alg=rng.alg)
+                rng2 = AK.CounterRNG(rng.seed + UInt64(1); alg=rng.alg)
+
                 AK.randn!(rng1, x1; prefer_threads, block_size=64)
                 AK.randn!(rng2, x2; prefer_threads, block_size=64)
                 @test Array(x1) != Array(x2)
@@ -105,11 +123,12 @@ end
     end
 
 
-    @testset "counter rng offset behavior" begin
+    @testset "offset and reset semantics" begin
         rng_stream = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox(), offset=UInt64(17))
         s1 = array_from_host(zeros(Float32, 99))
         s2 = array_from_host(zeros(Float32, 101))
         s12 = array_from_host(zeros(Float32, 200))
+
         AK.randn!(rng_stream, s1; prefer_threads, block_size=64)
         @test rng_stream.offset == UInt64(116)
         AK.randn!(rng_stream, s2; prefer_threads, block_size=64)
@@ -136,22 +155,7 @@ end
     end
 
 
-    @testset "reset!" begin
-        rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox())
-        x1 = array_from_host(zeros(Float32, 512))
-        x2 = array_from_host(zeros(Float32, 512))
-
-        AK.randn!(rng, x1; prefer_threads, block_size=64)
-        @test rng.offset == UInt64(512)
-        @test AK.reset!(rng) === rng
-        @test rng.offset == UInt64(0)
-        AK.randn!(rng, x2; prefer_threads, block_size=64)
-
-        @test Array(x1) == Array(x2)
-    end
-
-
-    @testset "randn! n-dimensional and views" begin
+    @testset "shapes and views" begin
         rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox())
 
         for T in RANDN_FLOAT_TYPES_BACKEND
@@ -163,12 +167,14 @@ end
             for T in RANDN_FLOAT_TYPES_BACKEND
                 base = zeros(T, 64)
                 view_x = @view base[2:2:end]
+
                 AK.randn!(
                     rng, view_x;
                     max_tasks=Threads.nthreads(),
                     min_elems=1,
-                    prefer_threads=true
+                    prefer_threads=true,
                 )
+
                 ref_view = zeros(T, length(view_x))
                 _randn_fill_reference!(
                     rng, ref_view;
@@ -204,6 +210,46 @@ end
     end
 
 
+    @testset "randn allocation convenience" begin
+        rng = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox())
+        y = AK.randn(rng, BACKEND, Float32, Int32(6), UInt16(7); prefer_threads, block_size=64)
+        @test size(y) == (6, 7)
+        @test eltype(y) === Float32
+        @test _all_finite(Array(y))
+        @test rng.offset == UInt64(length(y))
+
+        rng_alloc = AK.CounterRNG(UInt64(0x55); alg=AK.Philox())
+        rng_fill = AK.CounterRNG(UInt64(0x55); alg=AK.Philox())
+        y_alloc = AK.randn(rng_alloc, BACKEND, Float32, 128; prefer_threads, block_size=64)
+        y_fill = array_from_host(zeros(Float32, 128))
+        AK.randn!(rng_fill, y_fill; prefer_threads, block_size=64)
+        @test Array(y_alloc) == Array(y_fill)
+        @test rng_alloc.offset == rng_fill.offset == UInt64(128)
+
+        # Warm-up first call path so one-time compilation/backend init does not perturb RNG checks.
+        AK.randn(BACKEND, Float32, 1; prefer_threads, block_size=64)
+
+        # Auto-seeded constructor should match explicit seed capture from default RNG.
+        Random.seed!(0x9abc)
+        seed = Random.rand(Random.default_rng(), UInt64)
+        ref = AK.randn(AK.CounterRNG(seed; alg=AK.Philox()), BACKEND, Float32, 64; prefer_threads, block_size=64)
+        Random.seed!(0x9abc)
+        x = AK.randn(BACKEND, Float32, 64; prefer_threads, block_size=64)
+        @test Array(x) == Array(ref)
+
+        # Reseeding should reproduce the same auto-seeded draw.
+        Random.seed!(0x7777)
+        x1 = AK.randn(BACKEND, Float32, 64; prefer_threads, block_size=64)
+        Random.seed!(0x7777)
+        x2 = AK.randn(BACKEND, Float32, 64; prefer_threads, block_size=64)
+        @test Array(x1) == Array(x2)
+
+        @test_throws ArgumentError AK.randn(AK.CounterRNG(0x1), BACKEND, UInt32, 16; prefer_threads)
+        @test_throws MethodError AK.randn(AK.CounterRNG(0x1), BACKEND, Float32, 16; prefer_threads, bad=:kwarg)
+        @test_throws MethodError AK.randn(BACKEND, Float32, 16; prefer_threads, bad=:kwarg)
+    end
+
+
     @testset "moments sanity" begin
         n = 200_000
         rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox())

From 9abe3881f5afb0ef56b9a794960609a5fd26adc0 Mon Sep 17 00:00:00 2001
From: fjbarter <fjbarter@outlook.com>
Date: Thu, 26 Mar 2026 17:11:45 +0000
Subject: [PATCH 15/18] attempt to fix oneAPI test precompilation hang with
 Julia v1.10 by disabling package images

---
 .buildkite/pipeline.yml | 13 ++++++-------
 src/rand/rand.jl        | 17 +++++++++++++++--
 src/rand/randn.jl       | 39 +++++++++++++++++++++++++++------------
 src/rand/utilities.jl   | 13 +++++--------
 4 files changed, 53 insertions(+), 29 deletions(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 43cfba3..c0e5830 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -88,14 +88,13 @@ steps:
       - JuliaCI/julia#v1:
           version: "1.10"
     command: |
-      julia -e 'using Pkg
+      julia --pkgimages=no -e 'using Pkg
 
-                println("--- :julia: Instantiating environment")
-                Pkg.add("oneAPI")
-                Pkg.develop(path=".")
-
-                println("+++ :julia: Running tests")
-                Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])'
+                               println("--- :julia: Instantiating environment")
+                               Pkg.add("oneAPI")
+                               Pkg.develop(path=".") 
+                               println("+++ :julia: Running tests")
+                               Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])'
     agents:
       queue: "juliagpu"
       intel: "*"
diff --git a/src/rand/rand.jl b/src/rand/rand.jl
index c36a61e..af3fdef 100644
--- a/src/rand/rand.jl
+++ b/src/rand/rand.jl
@@ -1,3 +1,10 @@
+const ALLOWED_RAND_SCALARS = Union{
+    UInt8, UInt16, UInt32, UInt64,
+    Int8, Int16, Int32, Int64,
+    Float16, Float32, Float64,
+    Bool
+}
+
 abstract type CounterRNGAlgorithm end
 
 
@@ -92,7 +99,8 @@ include("randn.jl")
 Fill `v` in-place with pseudo-random values using a counter-based RNG stream. For `v[i]`, the
 counter is `rng.offset + UInt64(i - 1)` in linear indexing order.
 
-After filling `v`, `rng.offset` advances by `length(v)`.
+After filling `v`, `rng.offset` advances by `length(v)`. It can be called without `rng`, in which
+case the default `CounterRNG` is used.
 
 Supported scalar element types are:
 - `UInt8`, `UInt16`, `UInt32`, `UInt64`
@@ -155,9 +163,13 @@ end
         backend::Backend,
         ::Type{T},
         dims::Integer...;
+
+        # CPU settings
         max_tasks::Int=Threads.nthreads(),
         min_elems::Int=1,
         prefer_threads::Bool=true,
+
+        # GPU settings
         block_size::Int=256,
     ) where T
 
@@ -178,7 +190,8 @@ function rand(
     # GPU settings
     block_size::Int=256,
 ) where T
-    return _allocate_and_fill(
+    @argcheck T <: ALLOWED_RAND_SCALARS "Unsupported eltype $T. Supported: $(ALLOWED_RAND_SCALARS)"
+    return _allocate_and_fill_rand(
         rand!, rng, backend, T, dims...;
         max_tasks, min_elems, prefer_threads, block_size,
     )
diff --git a/src/rand/randn.jl b/src/rand/randn.jl
index f3cce14..8d16c77 100644
--- a/src/rand/randn.jl
+++ b/src/rand/randn.jl
@@ -1,9 +1,11 @@
-const ALLOWED_RANDN_SCALARS = Union{Float16, Float32, Float64}
+const ALLOWED_RANDN_SCALARS = Union{
+    Float16, Float32, Float64
+}
 
-const U24_MAX_SAFE_MIDPOINT = UInt32(0x00fffffe)                 # 2^24 - 2
-const U53_MAX_SAFE_MIDPOINT = UInt64(0x001ffffffffffffe)         # 2^53 - 2
-const MIDPOINT_SCALE_F32 = ldexp(Float32(1), -24)                # 2^-24
-const MIDPOINT_SCALE_F64 = ldexp(Float64(1), -53)                # 2^-53
+const OPEN01_MAX_MIDPOINT_INDEX_F32 = UInt32(0x00fffffe)
+const OPEN01_MAX_MIDPOINT_INDEX_F64 = UInt64(0x001ffffffffffffe)
+const OPEN01_MIDPOINT_SCALE_F32 = ldexp(Float32(1), -24)
+const OPEN01_MIDPOINT_SCALE_F64 = ldexp(Float64(1), -53)
 
 
 
@@ -19,16 +21,16 @@ Avoiding 0 is essential for Box-Muller due to the logarithm functions.
 # Convert random UInt32 bits to Float32 in (0, 1) using midpoint mapping on a 24-bit grid.
 @inline function uint32_to_open_unit_float32_midpoint(u::UInt32)::Float32
     # `min` keeps the top midpoint below one after Float32 rounding.
-    k = min(u >> 8, U24_MAX_SAFE_MIDPOINT)
-    return (Float32(k) + 0.5f0) * MIDPOINT_SCALE_F32
+    k = min(u >> 8, OPEN01_MAX_MIDPOINT_INDEX_F32)
+    return (Float32(k) + 0.5f0) * OPEN01_MIDPOINT_SCALE_F32
 end
 
 
 # Convert random UInt64 bits to Float64 in (0, 1) using midpoint mapping on a 53-bit grid.
 @inline function uint64_to_open_unit_float64_midpoint(u::UInt64)::Float64
     # `min` keeps the top midpoint below one after Float64 rounding.
-    k = min(u >> 11, U53_MAX_SAFE_MIDPOINT)
-    return (Float64(k) + 0.5) * MIDPOINT_SCALE_F64
+    k = min(u >> 11, OPEN01_MAX_MIDPOINT_INDEX_F64)
+    return (Float64(k) + 0.5) * OPEN01_MIDPOINT_SCALE_F64
 end
 
 
@@ -201,8 +203,6 @@ end
         # CPU settings
         max_tasks::Int=Threads.nthreads(),
         min_elems::Int=1,
-
-        # Implementation choice
         prefer_threads::Bool=true,
 
         # GPU settings
@@ -215,6 +215,8 @@ For `v[i]`, the normal stream counter is `rng.offset + UInt64(i - 1)` in linear
 Values are generated using Box-Muller from midpoint-open uniforms in `(0, 1)`.
 
 After filling `v`, `rng.offset` advances by `length(v)`.
+
+It can be called without an `rng`, in which case the default `CounterRNG` will be used.
 """
 function randn!(
     rng::CounterRNG,
@@ -269,9 +271,13 @@ end
         backend::Backend,
         ::Type{T},
         dims::Integer...;
+
+        # CPU settings
         max_tasks::Int=Threads.nthreads(),
         min_elems::Int=1,
         prefer_threads::Bool=true,
+
+        # GPU settings
         block_size::Int=256,
     ) where T
 
@@ -283,12 +289,17 @@ function randn(
     backend::Backend,
     ::Type{T},
     dims::Integer...;
+
+    # CPU settings
     max_tasks::Int=Threads.nthreads(),
     min_elems::Int=1,
     prefer_threads::Bool=true,
+
+    # GPU settings
     block_size::Int=256,
 ) where T
-    return _allocate_and_fill(
+    @argcheck T <: ALLOWED_RANDN_SCALARS "Unsupported eltype $T. Supported: $(ALLOWED_RANDN_SCALARS)"
+    return _allocate_and_fill_rand(
         randn!, rng, backend, T, dims...;
         max_tasks, min_elems, prefer_threads, block_size,
     )
@@ -299,9 +310,13 @@ function randn(
     backend::Backend,
     ::Type{T},
     dims::Integer...;
+
+    # CPU settings
     max_tasks::Int=Threads.nthreads(),
     min_elems::Int=1,
     prefer_threads::Bool=true,
+
+    # GPU settings
     block_size::Int=256,
 ) where T
     return randn(
diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl
index b2c60db..04f6929 100644
--- a/src/rand/utilities.jl
+++ b/src/rand/utilities.jl
@@ -16,15 +16,19 @@
 
 
 # Shared allocation + fill helper for rand/randn convenience constructors.
-@inline function _allocate_and_fill(
+@inline function _allocate_and_fill_rand(
     fill!,
     rng::CounterRNG,
     backend::Backend,
     ::Type{T},
     dims::Integer...;
+
+    # CPU settings
     max_tasks::Int=Threads.nthreads(),
     min_elems::Int=1,
     prefer_threads::Bool=true,
+
+    # GPU settings
     block_size::Int=256,
 ) where {T}
     dims_int = Base.map(Int, dims)
@@ -34,13 +38,6 @@
 end
 
 
-# Internal scalar eltypes currently supported by rand!.
-const ALLOWED_RAND_SCALARS = Union{
-    UInt8, UInt16, UInt32, UInt64,
-    Int8, Int16, Int32, Int64,
-    Float16, Float32, Float64,
-    Bool
-}
 
 
 @inline _rand_scalar_uint_type(::Type{UInt8}) = UInt32

From 841bafb0c4e2ad0cf81af96c0a6e49caff7d64fd Mon Sep 17 00:00:00 2001
From: fjbarter <fjbarter@outlook.com>
Date: Thu, 26 Mar 2026 17:43:40 +0000
Subject: [PATCH 16/18] fix Threefry UInt32 device arithmetic to avoid breaking
 on Metal

---
 src/rand/threefry.jl  | 19 ++++++++++---------
 src/rand/utilities.jl |  2 +-
 test/randn.jl         |  8 --------
 3 files changed, 11 insertions(+), 18 deletions(-)

diff --git a/src/rand/threefry.jl b/src/rand/threefry.jl
index c9052b1..f6ef632 100644
--- a/src/rand/threefry.jl
+++ b/src/rand/threefry.jl
@@ -9,9 +9,9 @@ const THREEFRY_ROTATIONS = (
 const THREEFRY_ROUNDS = 20
 
 
-@inline function _threefry_key_word(k0::UInt32, k1::UInt32, k2::UInt32, idx::Int)::UInt32
-    idx == 0 && return k0
-    idx == 1 && return k1
+@inline function _threefry_key_word(k0::UInt32, k1::UInt32, k2::UInt32, idx::UInt32)::UInt32
+    idx == UInt32(0) && return k0
+    idx == UInt32(1) && return k1
     return k2
 end
 
@@ -33,16 +33,17 @@ end
     x1 += k1
 
     @inbounds for round in 0:(THREEFRY_ROUNDS - 1)
-        rot = THREEFRY_ROTATIONS[(round & 0x7) + 1]
+        round_u32 = UInt32(round)
+        rot = THREEFRY_ROTATIONS[Int((round_u32 & UInt32(0x7)) + UInt32(1))]
         x0 += x1
         x1 = xor(_rotl32(x1, rot), x0)
 
-        if (round & 0x3) == 3
-            s = (round >>> 2) + 1
-            i0 = s % 3
-            i1 = (s + 1) % 3
+        if (round_u32 & UInt32(0x3)) == UInt32(0x3)
+            s = (round_u32 >>> 2) + UInt32(1)
+            i0 = s % UInt32(3)
+            i1 = (s + UInt32(1)) % UInt32(3)
             x0 += _threefry_key_word(k0, k1, k2, i0)
-            x1 += _threefry_key_word(k0, k1, k2, i1) + UInt32(s)
+            x1 += _threefry_key_word(k0, k1, k2, i1) + s
         end
     end
 
diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl
index 04f6929..556ca7b 100644
--- a/src/rand/utilities.jl
+++ b/src/rand/utilities.jl
@@ -9,7 +9,7 @@
 @inline _mulhi_u32(a::UInt32, b::UInt32)::UInt32 = UInt32((UInt64(a) * UInt64(b)) >> 32)
 
 # 32-bit rotate left by r positions
-@inline _rotl32(x::UInt32, r::UInt32)::UInt32 = (x << r) | (x >> (UInt32(32) - r))
+@inline _rotl32(x::UInt32, r::UInt32)::UInt32 = bitrotate(x, Int32(r))
 
 # Get counter used for CounterRNG from element index
 @inline _counter_from_index(i)::UInt64 = UInt64(i - one(i))
diff --git a/test/randn.jl b/test/randn.jl
index 82261c8..0d239b2 100644
--- a/test/randn.jl
+++ b/test/randn.jl
@@ -90,14 +90,6 @@ end
         for alg in RANDN_ALGS
             rng = AK.CounterRNG(0x123456789abcdef; alg)
 
-            for T in RANDN_FLOAT_TYPES_BACKEND
-                for len in RANDN_LENGTHS
-                    x = array_from_host(zeros(T, len))
-                    _assert_randn_matches_reference!(rng, x; prefer_threads, block_size=64)
-                    @test _all_finite(Array(x))
-                end
-            end
-
             for T in RANDN_FLOAT_TYPES_BACKEND
                 x1 = array_from_host(zeros(T, 2048))
                 x2 = array_from_host(zeros(T, 2048))

From 39e089ffb5a09518b248f5bf085d3830f8b3c069 Mon Sep 17 00:00:00 2001
From: fjbarter <fjbarter@outlook.com>
Date: Fri, 27 Mar 2026 01:09:45 +0000
Subject: [PATCH 17/18] expand rand/randn convenience APIs and align
 docs/tests. Move randn open-interval helpers into utilities.jl and switch
 naming of internals. Simplify rand!/randn! wrappers and add convenience
 constructors for omitted rng, backend, or type, with backend-dependent
 defaults (Float64 on CPU, Float32 otherwise). Add explicit zero-arg guards so
 rand()/randn() require at least one dimension. Update rand.md with concise
 convenience semantics and examples, including type-only no-rng calls, plus
 doc entries for rand/randn. Expand tests to cover default-type dispatch, CPU
 fallback routes, typed no-rng overloads, and invalid-signature/kwarg throw
 behavior while preserving deterministic CounterRNG offset progression.

---
 docs/src/api/rand.md             |  31 ++++++++-
 prototype/rand/Project.toml      |   3 +-
 prototype/rand/plot/Project.toml |   3 -
 prototype/rand/randn.jl          |   2 -
 prototype/rand/test_rand.jl      |  25 +++----
 src/rand/rand.jl                 |  42 +++++------
 src/rand/randn.jl                | 116 +++++--------------------------
 src/rand/utilities.jl            |  67 ++++++++++++++++++
 test/rand.jl                     |  83 ++++++++++++++++++++++
 test/randn.jl                    | 105 +++++++++++++++++++++++++---
 test/runtests.jl                 |   8 +--
 11 files changed, 325 insertions(+), 160 deletions(-)
 delete mode 100644 prototype/rand/plot/Project.toml

diff --git a/docs/src/api/rand.md b/docs/src/api/rand.md
index 2ac9d91..d0d4113 100644
--- a/docs/src/api/rand.md
+++ b/docs/src/api/rand.md
@@ -3,6 +3,10 @@
 Counter-based random generation for CPU and GPU backends with deterministic stream behavior for
 fixed `seed`, algorithm, and call sequence.
 
+Both in-place and allocation forms are supported:
+- Uniform: `AK.rand!`, `AK.rand`
+- Standard normal: `AK.randn!`, `AK.randn`
+
 `CounterRNG` carries an internal `offset` (starting at `0`) that advances by `length(v)` on each
 `AK.rand!(rng, v)` call. This means chunked fills are stream-consistent:
 - filling `100` then `100` elements yields the same `200` values as one `200`-element fill.
@@ -16,9 +20,16 @@ Use an explicit `CounterRNG` when reproducibility is required. For
 convenience,
 `AK.rand!(v)` creates a fresh `CounterRNG()` on each call using one auto-seeded
 `Base.rand(UInt64)` draw, so repeated calls produce different outputs unless Random.seed!() is used.
+Likewise, `AK.rand(backend, args...)` creates a fresh auto-seeded `CounterRNG()` on each call.
 
 `AK.reset!(rng::AK.CounterRNG)` rewinds `rng.offset` to `0x0`.
 
+Allocation convenience:
+- Canonical forms are `AK.rand(rng, backend, T, dims...)` and `AK.randn(rng, backend, T, dims...)`.
+- Defaults are shared: omit `rng` -> fresh `CounterRNG()`; omit `backend` -> CPU backend; omit `T` -> `Float64` on CPU backend and `Float32` otherwise.
+- Common shorthands include `AK.rand(dims...)`, `AK.rand(T, dims...)`, `AK.rand(backend, dims...)`, and the corresponding `AK.randn(...)` variants.
+- For explicit `rng`, both `AK.rand` and `AK.randn` advance `rng.offset` by `prod(dims)`.
+
 Custom algorithms:
 - Define an algorithm type `MyAlg <: AK.CounterRNGAlgorithm`.
 - Implement typed `rand_uint` methods:
@@ -39,6 +50,9 @@ Supported element types:
 
 `AK.randn!` uses Box-Muller with open-interval uniforms in `(0, 1)` from a branch-free midpoint mapping.
 
+`AK.randn!(v)` and `AK.randn(backend, args...)` create a fresh auto-seeded `CounterRNG()` on each
+call, so repeated calls produce different outputs unless `Random.seed!()` is used.
+
 The core of the random number generation produces either a `UInt32` or `UInt64` depending on the width of the requested element type.
 That `UInt` is then either:
 - Unsigned integers: returned as-is or truncated if necessary.
@@ -72,6 +86,7 @@ Examples:
 ```julia
 import AcceleratedKernels as AK
 using oneAPI
+using AMDGPU
 
 # Reproducible
 rng = AK.CounterRNG(0x12345678; alg=AK.Philox())
@@ -88,9 +103,19 @@ AK.rand!(rng, v2)
 y = oneArray{Float32}(undef, 1024)
 AK.rand!(y)
 
-# Standard normal samples
-z = oneArray{Float32}(undef, 1024)
+# Allocation form
+y_cpu_auto = AK.rand(1024)                               # defaults to CPU, Vector{Float64}
+y_oneArray = AK.rand(oneAPIBackend(), Float32, 1024)     # fresh RNG, allocate and fill oneArray
+y_cpu_typed = AK.rand(rng, Float16, 1024)                # CPU backend, explicit type, explicit RNG
+
+# Standard normal filling
+z = ROCArray{Float32}(undef, 1024)
 AK.randn!(rng, z)
+
+# Standard normal allocation form
+z_cpu_auto = AK.randn(1024)                              # defaults to CPU, Vector{Float64}
+z_ROCArray = AK.randn(ROCBackend(), 1024)                # allocate and fill ROCArray{Float32}
+z_cpu_typed = AK.randn(rng, Float16, 1024)               # CPU backend, explicit type, explicit RNG
 ```
 
 ```@docs
@@ -98,5 +123,7 @@ AcceleratedKernels.CounterRNG
 AcceleratedKernels.CounterRNGAlgorithm
 AcceleratedKernels.reset!
 AcceleratedKernels.rand!
+AcceleratedKernels.rand
 AcceleratedKernels.randn!
+AcceleratedKernels.randn
 ```
diff --git a/prototype/rand/Project.toml b/prototype/rand/Project.toml
index 675e6c7..d1926b1 100644
--- a/prototype/rand/Project.toml
+++ b/prototype/rand/Project.toml
@@ -1,8 +1,7 @@
 [deps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
-PProf = "e4faabce-9ead-11e9-39d9-4379958e3056"
-Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
diff --git a/prototype/rand/plot/Project.toml b/prototype/rand/plot/Project.toml
deleted file mode 100644
index a95f271..0000000
--- a/prototype/rand/plot/Project.toml
+++ /dev/null
@@ -1,3 +0,0 @@
-[deps]
-AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
-Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
diff --git a/prototype/rand/randn.jl b/prototype/rand/randn.jl
index 8370d91..d552b5f 100644
--- a/prototype/rand/randn.jl
+++ b/prototype/rand/randn.jl
@@ -48,6 +48,4 @@ display(@benchmark run_cuda_randn!($x_cuda))
 println("\nAK.randn! Philox benchmark (GPU, CuArray{$TestType})")
 display(@benchmark run_ak_randn_gpu!($RNG_PHILOX, $x_philox))
 
-# println("\nAK.randn! benchmark (CPU, Vector{$TestType}, Philox)")
-# display(@benchmark run_ak_randn_cpu!($RNG_PHILOX, $x_cpu))
 
diff --git a/prototype/rand/test_rand.jl b/prototype/rand/test_rand.jl
index ec3ce65..5a7be0f 100644
--- a/prototype/rand/test_rand.jl
+++ b/prototype/rand/test_rand.jl
@@ -11,11 +11,13 @@ const RNG_SPLITMIX = AK.CounterRNG(0x12345678; alg=AK.SplitMix64())
 const RNG_PHILOX = AK.CounterRNG(0x12345678; alg=AK.Philox())
 const RNG_THREEFRY = AK.CounterRNG(0x12345678; alg=AK.Threefry())
 
-x_cuda = CuArray{Float32}(undef, N)
-x_splitmix = CuArray{Float32}(undef, N)
-x_philox = CuArray{Float32}(undef, N)
-x_threefry = CuArray{Float32}(undef, N)
-x_cpu = Vector{Float32}(undef, N)
+TestType = Float32
+
+x_cuda = CuArray{TestType}(undef, N)
+x_splitmix = CuArray{TestType}(undef, N)
+x_philox = CuArray{TestType}(undef, N)
+x_threefry = CuArray{TestType}(undef, N)
+x_cpu = Vector{TestType}(undef, N)
 
 
 function run_cuda_rand!(x)
@@ -43,13 +45,11 @@ is_unit_interval(v) = all(x -> 0.0f0 <= x <= 1.0f0, v)
 
 # warmup compile
 run_cuda_rand!(x_cuda)
-# run_ak_rand_gpu!(RNG_SPLITMIX, x_splitmix)
 run_ak_rand_gpu!(RNG_PHILOX, x_philox)
 run_ak_rand_gpu!(RNG_THREEFRY, x_threefry)
 run_ak_rand_cpu!(RNG_SPLITMIX, x_cpu)
 
 @assert is_unit_interval(Array(x_cuda))
-# @assert is_unit_interval(Array(x_splitmix))
 @assert is_unit_interval(Array(x_philox))
 @assert is_unit_interval(Array(x_threefry))
 @assert is_unit_interval(x_cpu)
@@ -57,18 +57,15 @@ run_ak_rand_cpu!(RNG_SPLITMIX, x_cpu)
 println("N = ", N)
 println("CPU threads: ", Threads.nthreads())
 
-println("\nCUDA.rand! benchmark (CuArray{Float32}, in-place)")
+println("\nCUDA.rand! benchmark (CuArray{$TestType}, in-place)")
 display(@benchmark run_cuda_rand!($x_cuda))
 
-# println("\nAK.rand! SplitMix64 benchmark (GPU, CuArray{Float32})")
-# display(@benchmark run_ak_rand_gpu!($RNG_SPLITMIX, $x_splitmix))
-
-println("\nAK.rand! Philox benchmark (GPU, CuArray{Float32})")
+println("\nAK.rand! Philox benchmark (GPU, CuArray{$TestType})")
 display(@benchmark run_ak_rand_gpu!($RNG_PHILOX, $x_philox))
 
-println("\nAK.rand! Threefry benchmark (GPU, CuArray{Float32})")
+println("\nAK.rand! Threefry benchmark (GPU, CuArray{$TestType})")
 display(@benchmark run_ak_rand_gpu!($RNG_THREEFRY, $x_threefry))
 
-println("\nAK.rand! benchmark (CPU, Vector{Float32}, SplitMix64)")
+println("\nAK.rand! benchmark (CPU, Vector{$TestType}, SplitMix64)")
 display(@benchmark run_ak_rand_cpu!($RNG_SPLITMIX, $x_cpu))
 
diff --git a/src/rand/rand.jl b/src/rand/rand.jl
index af3fdef..06d3619 100644
--- a/src/rand/rand.jl
+++ b/src/rand/rand.jl
@@ -148,13 +148,7 @@ function rand!(
 end
 
 
-function rand!(
-    v::AbstractArray,
-    args...;
-    kwargs...,
-)
-    return rand!(CounterRNG(), v, args...; kwargs...)
-end
+rand!(v::AbstractArray, args...; kwargs...) = rand!(CounterRNG(), v, args...; kwargs...)
 
 
 """
@@ -175,6 +169,11 @@ end
 
 Allocate an array of element type `T` on `backend` with shape `dims`, fill it in-place via
 [`rand!`](@ref), and return it.
+
+Convenience overloads:
+- `rng` omitted: uses a fresh `CounterRNG()`.
+- `backend` omitted: defaults to `CPU_BACKEND`.
+- `T` omitted: defaults by backend (`Float64` on CPU backend, `Float32` otherwise).
 """
 function rand(
     rng::CounterRNG,
@@ -198,21 +197,16 @@ function rand(
 end
 
 
-function rand(
-    backend::Backend,
-    ::Type{T},
-    dims::Integer...;
-    
-    # CPU settings
-    max_tasks::Int=Threads.nthreads(),
-    min_elems::Int=1,
-    prefer_threads::Bool=true,
-
-    # GPU settings
-    block_size::Int=256,
-) where T
-    return rand(
-        CounterRNG(), backend, T, dims...;
-        max_tasks, min_elems, prefer_threads, block_size,
-    )
+function rand(rng::CounterRNG, backend::Backend, dims::Integer...; kwargs...)
+    DefaultScalarType = (backend == CPU_BACKEND) ? Float64 : Float32
+    rand(rng, backend, DefaultScalarType, dims...; kwargs...)
 end
+
+
+# Convenience overloads. The rng-first fallback only accepts element types and integer
+# dimensions, so unsupported positional arguments raise a MethodError instead of recursing.
+rand(rng::CounterRNG, args::Union{Type,Integer}...; kwargs...) = rand(rng, CPU_BACKEND, args...; kwargs...)
+rand(backend::Backend, args...; kwargs...) = rand(CounterRNG(), backend, args...; kwargs...)
+rand(::Type{T}, dims::Integer...; kwargs...) where {T} = rand(CPU_BACKEND, T, dims...; kwargs...)
+rand(dims::Integer...; kwargs...) = rand(CPU_BACKEND, dims...; kwargs...)
+rand(; kwargs...) = throw(ArgumentError("rand requires at least one dimension"))
diff --git a/src/rand/randn.jl b/src/rand/randn.jl
index 8d16c77..b2b5dbb 100644
--- a/src/rand/randn.jl
+++ b/src/rand/randn.jl
@@ -2,76 +2,6 @@ const ALLOWED_RANDN_SCALARS = Union{
     Float16, Float32, Float64
 }
 
-const OPEN01_MAX_MIDPOINT_INDEX_F32 = UInt32(0x00fffffe)
-const OPEN01_MAX_MIDPOINT_INDEX_F64 = UInt64(0x001ffffffffffffe)
-const OPEN01_MIDPOINT_SCALE_F32 = ldexp(Float32(1), -24)
-const OPEN01_MIDPOINT_SCALE_F64 = ldexp(Float64(1), -53)
-
-
-
-
-#=
-The below Float constructions are not duplicates of those in utilities.jl - they are needed to
-ensure an interval of (0, 1) as opposed to [0, 1). Achieving this purely logically with midpoint
-mapping means we can avoid a check for producing a 0 (which would normally cause a redraw).
-Avoiding 0 is essential for Box-Muller due to the logarithm functions.
-=#
-
-
-# Convert random UInt32 bits to Float32 in (0, 1) using midpoint mapping on a 24-bit grid.
-@inline function uint32_to_open_unit_float32_midpoint(u::UInt32)::Float32
-    # `min` keeps the top midpoint below one after Float32 rounding.
-    k = min(u >> 8, OPEN01_MAX_MIDPOINT_INDEX_F32)
-    return (Float32(k) + 0.5f0) * OPEN01_MIDPOINT_SCALE_F32
-end
-
-
-# Convert random UInt64 bits to Float64 in (0, 1) using midpoint mapping on a 53-bit grid.
-@inline function uint64_to_open_unit_float64_midpoint(u::UInt64)::Float64
-    # `min` keeps the top midpoint below one after Float64 rounding.
-    k = min(u >> 11, OPEN01_MAX_MIDPOINT_INDEX_F64)
-    return (Float64(k) + 0.5) * OPEN01_MIDPOINT_SCALE_F64
-end
-
-
-# Float16 path reuses Float32 midpoint sampling for robust math in Box-Muller.
-@inline function rand_open01(
-    seed::UInt64,
-    alg::CounterRNGAlgorithm,
-    counter::UInt64,
-    ::Type{Float16},
-)::Float16
-    return Float16(rand_open01(seed, alg, counter, Float32))
-end
-
-
-@inline function rand_open01(
-    seed::UInt64,
-    alg::CounterRNGAlgorithm,
-    counter::UInt64,
-    ::Type{Float32},
-)::Float32
-    return uint32_to_open_unit_float32_midpoint(rand_uint(seed, alg, counter, UInt32))
-end
-
-
-@inline function rand_open01(
-    seed::UInt64,
-    alg::CounterRNGAlgorithm,
-    counter::UInt64,
-    ::Type{Float64},
-)::Float64
-    return uint64_to_open_unit_float64_midpoint(rand_uint(seed, alg, counter, UInt64))
-end
-
-
-@inline function rand_open01(::UInt64, ::CounterRNGAlgorithm, ::UInt64, ::Type{T}) where {T}
-    throw(ArgumentError(
-        "Unsupported open-interval random type $(T). Supported: $(ALLOWED_RANDN_SCALARS)"
-    ))
-end
-
-
 @inline function randn_pair(
     seed::UInt64,
     alg::CounterRNGAlgorithm,
@@ -90,8 +20,8 @@ end
     ::Type{Float32},
 )::Tuple{Float32, Float32}
     u = rand_uint(seed, alg, pair_counter, UInt64)
-    u1 = uint32_to_open_unit_float32_midpoint(_u32_lo(u))
-    u2 = uint32_to_open_unit_float32_midpoint(_u32_hi(u))
+    u1 = _uint32_to_open_unit_float32_midpoint(_u32_lo(u))
+    u2 = _uint32_to_open_unit_float32_midpoint(_u32_hi(u))
     radius = sqrt(-2.0f0 * log(u1))
     theta = Float32(2pi) * u2
     stheta, ctheta = sincos(theta)
@@ -106,8 +36,8 @@ end
     ::Type{Float64},
 )::Tuple{Float64, Float64}
     c0 = pair_counter << 1
-    u1 = rand_open01(seed, alg, c0, Float64)
-    u2 = rand_open01(seed, alg, c0 + UInt64(1), Float64)
+    u1 = rand_float_open01(seed, alg, c0, Float64)
+    u2 = rand_float_open01(seed, alg, c0 + UInt64(1), Float64)
     radius = sqrt(-2.0 * log(u1))
     theta = Float64(2pi) * u2
     stheta, ctheta = sincos(theta)
@@ -256,13 +186,7 @@ function randn!(
 end
 
 
-function randn!(
-    v::AbstractArray,
-    args...;
-    kwargs...,
-)
-    return randn!(CounterRNG(), v, args...; kwargs...)
-end
+randn!(v::AbstractArray, args...; kwargs...) = randn!(CounterRNG(), v, args...; kwargs...)
 
 
 """
@@ -283,6 +207,11 @@ end
 
 Allocate an array of element type `T` on `backend` with shape `dims`, fill it in-place via
 [`randn!`](@ref), and return it.
+
+Convenience overloads:
+- `rng` omitted: uses a fresh `CounterRNG()`.
+- `backend` omitted: defaults to `CPU_BACKEND`.
+- `T` omitted: defaults by backend (`Float64` on CPU backend, `Float32` otherwise).
 """
 function randn(
     rng::CounterRNG,
@@ -306,21 +235,14 @@ function randn(
 end
 
 
-function randn(
-    backend::Backend,
-    ::Type{T},
-    dims::Integer...;
+function randn(rng::CounterRNG, backend::Backend, dims::Integer...; kwargs...)
+    DefaultScalarType = (backend == CPU_BACKEND) ? Float64 : Float32
+    randn(rng, backend, DefaultScalarType, dims...; kwargs...)
+end
 
-    # CPU settings
-    max_tasks::Int=Threads.nthreads(),
-    min_elems::Int=1,
-    prefer_threads::Bool=true,
 
-    # GPU settings
-    block_size::Int=256,
-) where T
-    return randn(
-        CounterRNG(), backend, T, dims...;
-        max_tasks, min_elems, prefer_threads, block_size,
-    )
-end
+randn(rng::CounterRNG, args::Union{Type,Integer}...; kwargs...) = randn(rng, CPU_BACKEND, args...; kwargs...)
+randn(backend::Backend, args...; kwargs...) = randn(CounterRNG(), backend, args...; kwargs...)
+randn(::Type{T}, dims::Integer...; kwargs...) where {T} = randn(CPU_BACKEND, T, dims...; kwargs...)
+randn(dims::Integer...; kwargs...) = randn(CPU_BACKEND, dims...; kwargs...)
+randn(; kwargs...) = throw(ArgumentError("randn requires at least one dimension"))
diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl
index 556ca7b..9e8ccf5 100644
--- a/src/rand/utilities.jl
+++ b/src/rand/utilities.jl
@@ -143,3 +143,70 @@ end
     # Interpret as 1.mantissa, then subtract 1 for [0, 1)
     reinterpret(Float64, bits) - 1.0
 end
+
+
+
+
+
+### Helpers for randn ###
+
+
+# Midpoint-mapped open-interval Float sampling in (0, 1), used for Box-Muller
+const OPEN01_MAX_MIDPOINT_INDEX_F32 = UInt32(0x00fffffe)
+const OPEN01_MAX_MIDPOINT_INDEX_F64 = UInt64(0x001ffffffffffffe)
+const OPEN01_MIDPOINT_SCALE_F32 = ldexp(Float32(1), -24)
+const OPEN01_MIDPOINT_SCALE_F64 = ldexp(Float64(1), -53)
+
+
+# Convert random UInt32 bits to Float32 in (0, 1) using midpoint mapping on a 24-bit grid.
+@inline function _uint32_to_open_unit_float32_midpoint(u::UInt32)::Float32
+    # `min` keeps the top midpoint below one after Float32 rounding.
+    k = min(u >> 8, OPEN01_MAX_MIDPOINT_INDEX_F32)
+    return (Float32(k) + 0.5f0) * OPEN01_MIDPOINT_SCALE_F32
+end
+
+
+# Convert random UInt64 bits to Float64 in (0, 1) using midpoint mapping on a 53-bit grid.
+@inline function _uint64_to_open_unit_float64_midpoint(u::UInt64)::Float64
+    # `min` keeps the top midpoint below one after Float64 rounding.
+    k = min(u >> 11, OPEN01_MAX_MIDPOINT_INDEX_F64)
+    return (Float64(k) + 0.5) * OPEN01_MIDPOINT_SCALE_F64
+end
+
+
+# Float16 path reuses Float32 midpoint sampling for robust math in Box-Muller.
+@inline function rand_float_open01(
+    seed::UInt64,
+    alg::CounterRNGAlgorithm,
+    counter::UInt64,
+    ::Type{Float16},
+)::Float16
+    return Float16(rand_float_open01(seed, alg, counter, Float32))
+end
+
+
+@inline function rand_float_open01(
+    seed::UInt64,
+    alg::CounterRNGAlgorithm,
+    counter::UInt64,
+    ::Type{Float32},
+)::Float32
+    return _uint32_to_open_unit_float32_midpoint(rand_uint(seed, alg, counter, UInt32))
+end
+
+
+@inline function rand_float_open01(
+    seed::UInt64,
+    alg::CounterRNGAlgorithm,
+    counter::UInt64,
+    ::Type{Float64},
+)::Float64
+    return _uint64_to_open_unit_float64_midpoint(rand_uint(seed, alg, counter, UInt64))
+end
+
+
+@inline function rand_float_open01(::UInt64, ::CounterRNGAlgorithm, ::UInt64, ::Type{T}) where {T}
+    throw(ArgumentError(
+        "Unsupported open-interval random type $(T). Supported: Union{Float16, Float32, Float64}"
+    ))
+end
diff --git a/test/rand.jl b/test/rand.jl
index d3eacca..a6b62c6 100644
--- a/test/rand.jl
+++ b/test/rand.jl
@@ -365,6 +365,8 @@ end
 
 
     @testset "rand allocation convenience" begin
+        default_alloc_type = IS_CPU_BACKEND ? Float64 : Float32
+
         rng = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox())
         y = AK.rand(rng, BACKEND, Float32, Int32(6), UInt16(7); prefer_threads, block_size=64)
         @test size(y) == (6, 7)
@@ -372,6 +374,16 @@ end
         @test _is_unit_interval(Array(y))
         @test rng.offset == UInt64(length(y))
 
+        rng_default = AK.CounterRNG(UInt64(0x99); alg=AK.Philox())
+        rng_default_ref = AK.CounterRNG(UInt64(0x99); alg=AK.Philox())
+        y_default = AK.rand(rng_default, BACKEND, 128; prefer_threads, block_size=64)
+        y_default_ref = AK.rand(
+            rng_default_ref, BACKEND, default_alloc_type, 128; prefer_threads, block_size=64
+        )
+        @test eltype(y_default) === default_alloc_type
+        @test Array(y_default) == Array(y_default_ref)
+        @test rng_default.offset == rng_default_ref.offset == UInt64(128)
+
         rng_alloc = AK.CounterRNG(UInt64(0x55); alg=AK.Philox())
         rng_fill = AK.CounterRNG(UInt64(0x55); alg=AK.Philox())
         y_alloc = AK.rand(rng_alloc, BACKEND, Float32, 128; prefer_threads, block_size=64)
@@ -380,6 +392,26 @@ end
         @test Array(y_alloc) == Array(y_fill)
         @test rng_alloc.offset == rng_fill.offset == UInt64(128)
 
+        rng_cpu_default = AK.CounterRNG(UInt64(0x66); alg=AK.Philox())
+        rng_cpu_default_ref = AK.CounterRNG(UInt64(0x66); alg=AK.Philox())
+        y_cpu_default = AK.rand(rng_cpu_default, 128; prefer_threads, block_size=64)
+        y_cpu_default_ref = AK.rand(
+            rng_cpu_default_ref, AK.get_backend([]), 128; prefer_threads, block_size=64
+        )
+        @test eltype(y_cpu_default) === Float64
+        @test Array(y_cpu_default) == Array(y_cpu_default_ref)
+        @test rng_cpu_default.offset == rng_cpu_default_ref.offset == UInt64(128)
+
+        rng_cpu_typed = AK.CounterRNG(UInt64(0x77); alg=AK.Philox())
+        rng_cpu_typed_ref = AK.CounterRNG(UInt64(0x77); alg=AK.Philox())
+        y_cpu_typed = AK.rand(rng_cpu_typed, Float32, 128; prefer_threads, block_size=64)
+        y_cpu_typed_ref = AK.rand(
+            rng_cpu_typed_ref, AK.get_backend([]), Float32, 128; prefer_threads, block_size=64
+        )
+        @test eltype(y_cpu_typed) === Float32
+        @test Array(y_cpu_typed) == Array(y_cpu_typed_ref)
+        @test rng_cpu_typed.offset == rng_cpu_typed_ref.offset == UInt64(128)
+
         # Warm-up first call path so one-time compilation/backend init does not perturb RNG checks.
         AK.rand(BACKEND, Float32, 1; prefer_threads, block_size=64)
 
@@ -391,6 +423,54 @@ end
         x = AK.rand(BACKEND, Float32, 64; prefer_threads, block_size=64)
         @test Array(x) == Array(ref)
 
+        # Auto-seeded convenience without explicit type should use backend-dependent default type.
+        Random.seed!(0x4242)
+        seed_default = Random.rand(Random.default_rng(), UInt64)
+        ref_default = AK.rand(
+            AK.CounterRNG(seed_default; alg=AK.Philox()),
+            BACKEND,
+            default_alloc_type,
+            64;
+            prefer_threads,
+            block_size=64,
+        )
+        Random.seed!(0x4242)
+        x_default = AK.rand(BACKEND, 64; prefer_threads, block_size=64)
+        @test eltype(x_default) === default_alloc_type
+        @test Array(x_default) == Array(ref_default)
+
+        # Convenience without backend should default to CPU backend and Float64.
+        Random.seed!(0x4545)
+        seed_cpu_default = Random.rand(Random.default_rng(), UInt64)
+        ref_cpu_default = AK.rand(
+            AK.CounterRNG(seed_cpu_default; alg=AK.Philox()),
+            AK.get_backend([]),
+            Float64,
+            64;
+            prefer_threads,
+            block_size=64,
+        )
+        Random.seed!(0x4545)
+        x_cpu_default = AK.rand(64; prefer_threads, block_size=64)
+        @test eltype(x_cpu_default) === Float64
+        @test Array(x_cpu_default) == Array(ref_cpu_default)
+
+        # Type-only convenience should default to CPU backend.
+        Random.seed!(0x5656)
+        seed_cpu_typed = Random.rand(Random.default_rng(), UInt64)
+        ref_cpu_typed = AK.rand(
+            AK.CounterRNG(seed_cpu_typed; alg=AK.Philox()),
+            AK.get_backend([]),
+            Float32,
+            64;
+            prefer_threads,
+            block_size=64,
+        )
+        Random.seed!(0x5656)
+        x_cpu_typed_no_rng = AK.rand(Float32, 64; prefer_threads, block_size=64)
+        @test eltype(x_cpu_typed_no_rng) === Float32
+        @test Array(x_cpu_typed_no_rng) == Array(ref_cpu_typed)
+
         # Reseeding should reproduce the same auto-seeded draw.
         Random.seed!(0x7777)
         x1 = AK.rand(BACKEND, Float32, 64; prefer_threads, block_size=64)
@@ -401,5 +481,8 @@ end
         @test_throws ArgumentError AK.rand(AK.CounterRNG(0x1), BACKEND, UInt128, 16; prefer_threads)
         @test_throws MethodError AK.rand(AK.CounterRNG(0x1), BACKEND, Float32, 16; prefer_threads, bad=:kwarg)
         @test_throws MethodError AK.rand(BACKEND, Float32, 16; prefer_threads, bad=:kwarg)
+        @test_throws MethodError AK.rand(BACKEND, 16; prefer_threads, bad=:kwarg)
+        @test_throws MethodError AK.rand(16; prefer_threads, bad=:kwarg)
+        @test_throws ArgumentError AK.rand()
     end
 end
diff --git a/test/randn.jl b/test/randn.jl
index 0d239b2..2d7b5ff 100644
--- a/test/randn.jl
+++ b/test/randn.jl
@@ -44,22 +44,22 @@ end
 
 @testset "randn" begin
     @testset "scalar helpers" begin
-        @test 0.0f0 < AK.uint32_to_open_unit_float32_midpoint(UInt32(0)) < 1.0f0
-        @test 0.0f0 < AK.uint32_to_open_unit_float32_midpoint(typemax(UInt32)) < 1.0f0
+        @test 0.0f0 < AK._uint32_to_open_unit_float32_midpoint(UInt32(0)) < 1.0f0
+        @test 0.0f0 < AK._uint32_to_open_unit_float32_midpoint(typemax(UInt32)) < 1.0f0
 
         if IS_CPU_BACKEND
-            @test 0.0 < AK.uint64_to_open_unit_float64_midpoint(UInt64(0)) < 1.0
-            @test 0.0 < AK.uint64_to_open_unit_float64_midpoint(typemax(UInt64)) < 1.0
+            @test 0.0 < AK._uint64_to_open_unit_float64_midpoint(UInt64(0)) < 1.0
+            @test 0.0 < AK._uint64_to_open_unit_float64_midpoint(typemax(UInt64)) < 1.0
         end
 
         seed = UInt64(0x123456789abcdef)
         for alg in RANDN_ALGS
             for counter in (UInt64(0), UInt64(1), UInt64(17), UInt64(1023))
-                u32 = AK.rand_open01(seed, alg, counter, Float32)
+                u32 = AK.rand_float_open01(seed, alg, counter, Float32)
                 @test 0.0f0 < u32 < 1.0f0
 
                 if IS_CPU_BACKEND
-                    u64 = AK.rand_open01(seed, alg, counter, Float64)
+                    u64 = AK.rand_float_open01(seed, alg, counter, Float64)
                     @test 0.0 < u64 < 1.0
                 end
             end
@@ -81,7 +81,7 @@ end
             end
         end
 
-        @test_throws ArgumentError AK.rand_open01(seed, AK.Philox(), UInt64(0), UInt32)
+        @test_throws ArgumentError AK.rand_float_open01(seed, AK.Philox(), UInt64(0), UInt32)
         @test_throws ArgumentError AK.randn_scalar(seed, AK.Philox(), UInt64(0), UInt32)
     end
 
@@ -203,6 +203,8 @@ end
 
 
     @testset "randn allocation convenience" begin
+        default_alloc_type = IS_CPU_BACKEND ? Float64 : Float32
+
         rng = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox())
         y = AK.randn(rng, BACKEND, Float32, Int32(6), UInt16(7); prefer_threads, block_size=64)
         @test size(y) == (6, 7)
@@ -210,6 +212,16 @@ end
         @test _all_finite(Array(y))
         @test rng.offset == UInt64(length(y))
 
+        rng_default = AK.CounterRNG(UInt64(0x99); alg=AK.Philox())
+        rng_default_ref = AK.CounterRNG(UInt64(0x99); alg=AK.Philox())
+        y_default = AK.randn(rng_default, BACKEND, 128; prefer_threads, block_size=64)
+        y_default_ref = AK.randn(
+            rng_default_ref, BACKEND, default_alloc_type, 128; prefer_threads, block_size=64
+        )
+        @test eltype(y_default) === default_alloc_type
+        @test Array(y_default) == Array(y_default_ref)
+        @test rng_default.offset == rng_default_ref.offset == UInt64(128)
+
         rng_alloc = AK.CounterRNG(UInt64(0x55); alg=AK.Philox())
         rng_fill = AK.CounterRNG(UInt64(0x55); alg=AK.Philox())
         y_alloc = AK.randn(rng_alloc, BACKEND, Float32, 128; prefer_threads, block_size=64)
@@ -218,17 +230,87 @@ end
         @test Array(y_alloc) == Array(y_fill)
         @test rng_alloc.offset == rng_fill.offset == UInt64(128)
 
+        rng_cpu_default = AK.CounterRNG(UInt64(0x66); alg=AK.Philox())
+        rng_cpu_default_ref = AK.CounterRNG(UInt64(0x66); alg=AK.Philox())
+        y_cpu_default = AK.randn(rng_cpu_default, 128; prefer_threads, block_size=64)
+        y_cpu_default_ref = AK.randn(
+            rng_cpu_default_ref, AK.get_backend([]), 128; prefer_threads, block_size=64
+        )
+        @test eltype(y_cpu_default) === Float64
+        @test Array(y_cpu_default) == Array(y_cpu_default_ref)
+        @test rng_cpu_default.offset == rng_cpu_default_ref.offset == UInt64(128)
+
+        rng_cpu_typed = AK.CounterRNG(UInt64(0x77); alg=AK.Philox())
+        rng_cpu_typed_ref = AK.CounterRNG(UInt64(0x77); alg=AK.Philox())
+        y_cpu_typed = AK.randn(rng_cpu_typed, Float32, 128; prefer_threads, block_size=64)
+        y_cpu_typed_ref = AK.randn(
+            rng_cpu_typed_ref, AK.get_backend([]), Float32, 128; prefer_threads, block_size=64
+        )
+        @test eltype(y_cpu_typed) === Float32
+        @test Array(y_cpu_typed) == Array(y_cpu_typed_ref)
+        @test rng_cpu_typed.offset == rng_cpu_typed_ref.offset == UInt64(128)
+
         # Warm-up first call path so one-time compilation/backend init does not perturb RNG checks.
         AK.randn(BACKEND, Float32, 1; prefer_threads, block_size=64)
 
         # Auto-seeded constructor should match explicit seed capture from default RNG.
         Random.seed!(0x9abc)
         seed = Random.rand(Random.default_rng(), UInt64)
-        ref = AK.randn(AK.CounterRNG(seed; alg=AK.Philox()), BACKEND, Float32, 64; prefer_threads, block_size=64)
+        ref = AK.randn(AK.CounterRNG(
+            seed; alg=AK.Philox()), BACKEND, Float32, 64; prefer_threads, block_size=64
+        )
         Random.seed!(0x9abc)
         x = AK.randn(BACKEND, Float32, 64; prefer_threads, block_size=64)
         @test Array(x) == Array(ref)
 
+        # Auto-seeded convenience without explicit type should use backend-dependent default type.
+        Random.seed!(0x4242)
+        seed_default = Random.rand(Random.default_rng(), UInt64)
+        ref_default = AK.randn(
+            AK.CounterRNG(seed_default; alg=AK.Philox()),
+            BACKEND,
+            default_alloc_type,
+            64;
+            prefer_threads,
+            block_size=64,
+        )
+        Random.seed!(0x4242)
+        x_default = AK.randn(BACKEND, 64; prefer_threads, block_size=64)
+        @test eltype(x_default) === default_alloc_type
+        @test Array(x_default) == Array(ref_default)
+
+        # Convenience without backend should default to CPU backend and Float64.
+        Random.seed!(0x4545)
+        seed_cpu_default = Random.rand(Random.default_rng(), UInt64)
+        ref_cpu_default = AK.randn(
+            AK.CounterRNG(seed_cpu_default; alg=AK.Philox()),
+            AK.get_backend([]),
+            Float64,
+            64;
+            prefer_threads,
+            block_size=64,
+        )
+        Random.seed!(0x4545)
+        x_cpu_default = AK.randn(64; prefer_threads, block_size=64)
+        @test eltype(x_cpu_default) === Float64
+        @test Array(x_cpu_default) == Array(ref_cpu_default)
+
+        # Type-only convenience should default to CPU backend.
+        Random.seed!(0x5656)
+        seed_cpu_typed = Random.rand(Random.default_rng(), UInt64)
+        ref_cpu_typed = AK.randn(
+            AK.CounterRNG(seed_cpu_typed; alg=AK.Philox()),
+            AK.get_backend([]),
+            Float32,
+            64;
+            prefer_threads,
+            block_size=64,
+        )
+        Random.seed!(0x5656)
+        x_cpu_typed_no_rng = AK.randn(Float32, 64; prefer_threads, block_size=64)
+        @test eltype(x_cpu_typed_no_rng) === Float32
+        @test Array(x_cpu_typed_no_rng) == Array(ref_cpu_typed)
+
         # Reseeding should reproduce the same auto-seeded draw.
         Random.seed!(0x7777)
         x1 = AK.randn(BACKEND, Float32, 64; prefer_threads, block_size=64)
@@ -237,8 +319,13 @@ end
         @test Array(x1) == Array(x2)
 
         @test_throws ArgumentError AK.randn(AK.CounterRNG(0x1), BACKEND, UInt32, 16; prefer_threads)
-        @test_throws MethodError AK.randn(AK.CounterRNG(0x1), BACKEND, Float32, 16; prefer_threads, bad=:kwarg)
+        @test_throws MethodError AK.randn(
+            AK.CounterRNG(0x1), BACKEND, Float32, 16; prefer_threads, bad=:kwarg
+        )
         @test_throws MethodError AK.randn(BACKEND, Float32, 16; prefer_threads, bad=:kwarg)
+        @test_throws MethodError AK.randn(BACKEND, 16; prefer_threads, bad=:kwarg)
+        @test_throws MethodError AK.randn(16; prefer_threads, bad=:kwarg)
+        @test_throws ArgumentError AK.randn()
     end
 
 
diff --git a/test/runtests.jl b/test/runtests.jl
index 5d3a6ad..707afee 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -17,13 +17,7 @@ if "--CUDA" in ARGS
     const BACKEND = CUDABackend()
     TEST_DL[] = true
 elseif "--oneAPI" in ARGS
-    if Sys.iswindows()
-        # oneAPI v2.6.x can throw `UndefVarError: NEO_jll not defined` on native Windows.
-        # Pin to the latest known-good minor series until upstream fixes are available.
-        Pkg.add(name="oneAPI", version="2.5")
-    else
-        Pkg.add("oneAPI")
-    end
+    Pkg.add("oneAPI")
     using oneAPI
     oneAPI.versioninfo()
     const BACKEND = oneAPIBackend()

From 9595086ed35d6005627eeaf8733c305ee20c0a76 Mon Sep 17 00:00:00 2001
From: fjbarter <fjbarter@outlook.com>
Date: Sat, 28 Mar 2026 00:08:13 +0000
Subject: [PATCH 18/18] document rand_uint and CounterRNGAlgorithm properly,
 improve RNG docs page

---
 docs/src/api/rand.md        | 169 +++++++++++++++++++++++-------------
 prototype/rand/test_rand.jl |  26 ++----
 src/rand/rand.jl            |   9 ++
 src/rand/utilities.jl       |  28 ++++++
 4 files changed, 153 insertions(+), 79 deletions(-)

diff --git a/docs/src/api/rand.md b/docs/src/api/rand.md
index d0d4113..381a797 100644
--- a/docs/src/api/rand.md
+++ b/docs/src/api/rand.md
@@ -1,92 +1,141 @@
 ### Random Number Generation
 
-Counter-based random generation for CPU and GPU backends with deterministic stream behavior for
+Counter-based random generation for CPU and GPU backends with deterministic stream behaviour for a
 fixed `seed`, algorithm, and call sequence.
 
 Both in-place and allocation forms are supported:
 - Uniform: `AK.rand!`, `AK.rand`
 - Standard normal: `AK.randn!`, `AK.randn`
 
-`CounterRNG` carries an internal `offset` (starting at `0`) that advances by `length(v)` on each
-`AK.rand!(rng, v)` call. This means chunked fills are stream-consistent:
+`CounterRNG` stores:
+- `seed::UInt64`
+- algorithm `alg`
+- stream `offset::UInt64`
+
+The offset starts at `0` and advances by the number of generated values after each call. For
+`AK.rand!(rng, v)` and `AK.randn!(rng, v)`, element `v[i]` is generated from logical counter
+`rng.offset + UInt64(i - 1)` in linear indexing order.
+
+This gives stream-consistent chunking:
 - filling `100` then `100` elements yields the same `200` values as one `200`-element fill.
-- calls that share the same `CounterRNG` instance concurrently are not thread-safe.
-- call `AK.reset!(rng)` to rewind a `CounterRNG` offset back to `0x0`.
+- `AK.reset!(rng)` rewinds `rng.offset` to `0x0`.
 
-`AK.rand!(rng, v)` accepts `rng::AK.CounterRNG`.
-Passing other RNG container types is not supported and will throw a `MethodError`.
+Calls that share the same `CounterRNG` instance concurrently are not thread-safe and may race on
+`offset`.
 
-Use an explicit `CounterRNG` when reproducibility is required. For
-convenience,
-`AK.rand!(v)` creates a fresh `CounterRNG()` on each call using one auto-seeded
-`Base.rand(UInt64)` draw, so repeated calls produce different outputs unless Random.seed!() is used.
-Likewise, `AK.rand(backend, args...)` creates a fresh auto-seeded `CounterRNG()` on each call.
+`AK.rand!` and `AK.randn!` accept `rng::AK.CounterRNG`. Passing other RNG container types is not
+supported and will throw a `MethodError`.
 
-`AK.reset!(rng::AK.CounterRNG)` rewinds `rng.offset` to `0x0`.
+#### Auto-seeded convenience behaviour
 
-Allocation convenience:
-- Canonical forms are `AK.rand(rng, backend, T, dims...)` and `AK.randn(rng, backend, T, dims...)`.
-- Defaults are shared: omit `rng` -> fresh `CounterRNG()`; omit `backend` -> CPU backend; omit `T` -> `Float64` on CPU backend and `Float32` otherwise.
-- Common shorthands include `AK.rand(dims...)`, `AK.rand(T, dims...)`, `AK.rand(backend, dims...)`, and the corresponding `AK.randn(...)` variants.
-- For explicit `rng`, both `AK.rand` and `AK.randn` advance `rng.offset` by `prod(dims)`.
+Use an explicit `CounterRNG` when reproducibility is required.
 
-Custom algorithms:
-- Define an algorithm type `MyAlg <: AK.CounterRNGAlgorithm`.
-- Implement typed `rand_uint` methods:
-  - `AK.rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt32})::UInt32`
-  - `AK.rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt64})::UInt64`
-- Use your algorithm via `AK.CounterRNG(seed; alg=MyAlg(), offset=...)`.
+For convenience, calls without an explicit `rng` construct a fresh `CounterRNG()` on each call,
+using one auto-seeded `Base.rand(UInt64)` draw. Therefore repeated bare calls intentionally produce
+different outputs unless `Random.seed!(seed)` is called with the same `seed` before each call.
+
+Examples:
+- `AK.rand!(v)`
+- `AK.randn!(v)`
+- `AK.rand(backend, args...)`
+- `AK.randn(backend, args...)`
+
+These do **not** continue a shared stream across calls; to continue one stream, pass the same
+explicit `CounterRNG` to every call instead.
 
-Both widths should be implemented so `AK.rand!` supports all integer/float output types without falling back or error.
+#### Allocation forms
 
-Supported element types:
+Canonical forms:
+- `AK.rand(rng, backend, T, dims...)`
+- `AK.randn(rng, backend, T, dims...)`
+
+Shared defaults:
+- omit `rng` -> fresh `CounterRNG()`
+- omit `backend` -> CPU backend
+- omit `T` -> `Float64` on CPU backend, `Float32` otherwise
+
+Common shorthands include:
+- `AK.rand(dims...)`
+- `AK.rand(T, dims...)`
+- `AK.rand(backend, dims...)`
+- and the corresponding `AK.randn(...)` variants
+
+For explicit `rng`, both `AK.rand` and `AK.randn` advance `rng.offset` by the number of generated
+elements, i.e. `prod(dims)`.
+
+#### Supported element types
+
+`AK.rand!` / `AK.rand` support:
 - `UInt8`, `UInt16`, `UInt32`, `UInt64`
 - `Int8`, `Int16`, `Int32`, `Int64`
 - `Float16`, `Float32`, `Float64`
 - `Bool`
 
-`AK.randn!` fills arrays with standard normal samples and currently supports:
+`AK.randn!` / `AK.randn` currently support:
 - `Float16`, `Float32`, `Float64`
 
-`AK.randn!` uses Box-Muller with open-interval uniforms in `(0, 1)` from a branch-free midpoint mapping.
+#### Value generation semantics
+
+The core generator produces either a `UInt32` or `UInt64`, depending on the requested output type.
+That raw unsigned value is then mapped as follows:
+- Unsigned integers: returned directly, or truncated if narrower
+- Signed integers: the corresponding unsigned bit pattern reinterpreted as signed, then truncated if narrower
+- Floats: mantissa construction onto a uniform grid in `[0, 1)` ([read more](https://lomont.org/posts/2017/unit-random/))
+- Bool: `true` if the raw `UInt` draw is odd (`isodd(u)`), otherwise `false`
 
-`AK.randn!(v)` and `AK.randn(backend, args...)` create a fresh auto-seeded `CounterRNG()` on each
-call, so repeated calls produce different outputs unless `Random.seed!()` is used.
+`AK.randn!` uses Box-Muller with midpoint-mapped open-interval uniforms in `(0, 1)`.
 
-The core of the random number generation produces either a `UInt32` or `UInt64` depending on the width of the requested element type.
-That `UInt` is then either:
-- Unsigned integers: returned as-is or truncated if necessary.
-- Signed integers: reinterpreted as a signed integer bit pattern and truncated if necessary.
-- Floats: mantissa construction into a uniform grid in `[0, 1)` ([read more](https://lomont.org/posts/2017/unit-random/)).
-- Bool: `true` if the `UInt` draw is odd (`isodd(u)`), otherwise `false`.
+#### Algorithms currently available
 
-Algorithms currently available:
 - `SplitMix64` ([read more](https://rosettacode.org/wiki/Pseudo-random_numbers/Splitmix64))
 - `Philox` ([read more](https://www.thesalmons.org/john/random123/papers/random123sc11.pdf))
 - `Threefry` ([read more](https://www.thesalmons.org/john/random123/papers/random123sc11.pdf))
 
-Statistical-testing note:
+`Philox` is the default algorithm for `CounterRNG()`.
+
+#### Statistical testing and security
+
 - In this repository, `SplitMix64`, `Philox`, and `Threefry` have passed TestU01 BigCrush
-- These generators are not intended to be cryptographically secure.
+- These generators are not intended to be cryptographically secure
+
+#### Philox keying note
+
+AK uses `Philox2x32` internally, which has a single 32-bit Philox key word.
+
+Users may pass any non-negative `Integer` seed with `seed <= typemax(UInt64)`; AK converts it to
+`UInt64` and derives the 32-bit Philox key using SplitMix. This wrapper choice is deliberate for
+ease of use and deterministic streams, not a change to the Philox round function itself.
 
-Philox keying note:
-- AK uses `Philox2x32` internally (one 32-bit Philox key word).
-- Users can pass any non-negative `Integer` seed; AK normalises to `UInt64` then derives the
-  32-bit Philox key via a SplitMix-based mapping.
-- This is a deliberate wrapper choice for ease of use (simple `seed` API with deterministic
-  streams), not a change to the Philox round function itself.
-- Therefore, AK Philox streams are deterministic and high-quality, but not guaranteed to be
-  bit-for-bit identical to a raw Random123 Philox stream unless the same seed-to-key mapping and
-  counter convention are used.
+Therefore, AK Philox streams are deterministic and high-quality, but are not guaranteed to be
+bit-for-bit identical to a raw Random123 Philox stream unless the same seed-to-key mapping and
+counter convention are used.
 
-`Philox` is the default algorithm for `CounterRNG()` because it is thorough and very fast; it has been measured on par with `CUDA.rand!` and `SplitMix64` at ~390 GB/s on an Nvidia GeForce RTX
-5060 (advertised 448 GB/s), i.e. effectively memory-bound throughput.
+#### Custom algorithms
+
+To define a custom counter RNG:
+- define an algorithm type `MyAlg <: AK.CounterRNGAlgorithm`
+- implement:
+  - `AK.rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt32})::UInt32`
+  - `AK.rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt64})::UInt64`
+
+Then use it via:
+- `AK.CounterRNG(seed; alg=MyAlg(), offset=...)`
+
+Both widths should be implemented so `AK.rand!` supports all integer and floating-point output
+types without fallback or error.
+
+#### Performance note
+
+`Philox` is the default because it is high-quality and very fast. `AK.rand!` has been measured at
+roughly memory-bound throughput (~390 GB/s) on an NVIDIA GeForce RTX 5060: slightly faster than
+cuRAND for large `CuArray{Float32}` fills, and substantially faster than native `CUDA.rand!` for
+`CuArray{Int32}` fills, in the benchmarks used for this repository.
 
 Examples:
 ```julia
 import AcceleratedKernels as AK
 using oneAPI
-using ROCArray
+using AMDGPU
 
 # Reproducible
 rng = AK.CounterRNG(0x12345678; alg=AK.Philox())
@@ -94,36 +143,38 @@ v = oneArray{Float32}(undef, 1024)
 AK.rand!(rng, v)
 
 # Stream-consistent chunking
+rng = AK.CounterRNG(0x12345678; alg=AK.Philox())
 v1 = oneArray{Float32}(undef, 100)
 v2 = oneArray{Float32}(undef, 100)
 AK.rand!(rng, v1)
 AK.rand!(rng, v2)
 
-# Convenience (fresh auto-seeded RNG on each call)
+# Convenience: fresh auto-seeded RNG on each call
 y = oneArray{Float32}(undef, 1024)
 AK.rand!(y)
 
 # Allocation form
-y_cpu_auto = AK.rand(1024)                               # defaults to CPU, Vector{Float64}
-y_oneArray = AK.rand(oneAPIBackend(), Float32, 1024)     # fresh RNG, allocate and fill oneArray
-y_cpu_typed = AK.rand(rng, Float16, 1024)                # CPU backend, explicit type, explicit RNG
+y_cpu_auto = AK.rand(1024)                            # CPU, Vector{Float64}
+y_one = AK.rand(oneAPIBackend(), Float32, 1024)       # fresh RNG, allocate + fill oneArray
+y_cpu_typed = AK.rand(rng, Float16, 1024)             # CPU backend, explicit type, explicit RNG
 
 # Standard normal filling
 z = ROCArray{Float32}(undef, 1024)
 AK.randn!(rng, z)
 
 # Standard normal allocation form
-z_cpu_auto = AK.randn(1024)                              # defaults to CPU, Vector{Float64}
-z_ROCArray = AK.randn(oneAPIBackend(), 1024)             # allocate and fill ROCArray{Float32}
-z_cpu_typed = AK.randn(rng, Float16, 1024)               # CPU backend, explicit type, explicit RNG
+z_cpu_auto = AK.randn(1024)                           # CPU, Vector{Float64}
+z_roc = AK.randn(ROCBackend(), 1024)                  # fresh RNG, allocate + fill ROCArray{Float32}
+z_cpu_typed = AK.randn(rng, Float16, 1024)            # CPU backend, explicit type, explicit RNG
 ```
 
 ```@docs
 AcceleratedKernels.CounterRNG
 AcceleratedKernels.CounterRNGAlgorithm
+AcceleratedKernels.rand_uint
 AcceleratedKernels.reset!
 AcceleratedKernels.rand!
 AcceleratedKernels.rand
 AcceleratedKernels.randn!
 AcceleratedKernels.randn
-```
+```
\ No newline at end of file
diff --git a/prototype/rand/test_rand.jl b/prototype/rand/test_rand.jl
index 5a7be0f..0d916f2 100644
--- a/prototype/rand/test_rand.jl
+++ b/prototype/rand/test_rand.jl
@@ -7,16 +7,13 @@ import AcceleratedKernels as AK
 const N = 100_000_000
 const GPU_BLOCK_SIZE = 256
 
-const RNG_SPLITMIX = AK.CounterRNG(0x12345678; alg=AK.SplitMix64())
+
 const RNG_PHILOX = AK.CounterRNG(0x12345678; alg=AK.Philox())
-const RNG_THREEFRY = AK.CounterRNG(0x12345678; alg=AK.Threefry())
 
-TestType = Float32
 
+TestType = Float32
 x_cuda = CuArray{TestType}(undef, N)
-x_splitmix = CuArray{TestType}(undef, N)
 x_philox = CuArray{TestType}(undef, N)
-x_threefry = CuArray{TestType}(undef, N)
 x_cpu = Vector{TestType}(undef, N)
 
 
@@ -40,19 +37,11 @@ function run_ak_rand_cpu!(rng, x)
 end
 
 
-# Julia base rand() gives [0, 1) and so does EVERYTHING ELSE EVER! but CuRAND gives (0, 1] ...
-is_unit_interval(v) = all(x -> 0.0f0 <= x <= 1.0f0, v)
-
-# warmup compile
+# warmup
 run_cuda_rand!(x_cuda)
 run_ak_rand_gpu!(RNG_PHILOX, x_philox)
-run_ak_rand_gpu!(RNG_THREEFRY, x_threefry)
-run_ak_rand_cpu!(RNG_SPLITMIX, x_cpu)
+run_ak_rand_cpu!(RNG_PHILOX, x_cpu)
 
-@assert is_unit_interval(Array(x_cuda))
-@assert is_unit_interval(Array(x_philox))
-@assert is_unit_interval(Array(x_threefry))
-@assert is_unit_interval(x_cpu)
 
 println("N = ", N)
 println("CPU threads: ", Threads.nthreads())
@@ -63,9 +52,6 @@ display(@benchmark run_cuda_rand!($x_cuda))
 println("\nAK.rand! Philox benchmark (GPU, CuArray{$TestType})")
 display(@benchmark run_ak_rand_gpu!($RNG_PHILOX, $x_philox))
 
-println("\nAK.rand! Threefry benchmark (GPU, CuArray{$TestType})")
-display(@benchmark run_ak_rand_gpu!($RNG_THREEFRY, $x_threefry))
-
-println("\nAK.rand! benchmark (CPU, Vector{$TestType}, SplitMix64)")
-display(@benchmark run_ak_rand_cpu!($RNG_SPLITMIX, $x_cpu))
+println("\nAK.rand! benchmark (CPU, Vector{$TestType}, Philox)")
+display(@benchmark run_ak_rand_cpu!($RNG_PHILOX, $x_cpu))
 
diff --git a/src/rand/rand.jl b/src/rand/rand.jl
index 06d3619..b2b14bd 100644
--- a/src/rand/rand.jl
+++ b/src/rand/rand.jl
@@ -5,6 +5,15 @@ const ALLOWED_RAND_SCALARS = Union{
     Bool
 }
 
+
+"""
+    CounterRNGAlgorithm
+
+Abstract supertype for algorithms used by [`CounterRNG`](@ref).
+
+To define a custom counter-based RNG algorithm, subtype `CounterRNGAlgorithm` and implement
+[`rand_uint`](@ref) for both `UInt32` and `UInt64` outputs.
+"""
 abstract type CounterRNGAlgorithm end
 
 
diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl
index 9e8ccf5..35d9084 100644
--- a/src/rand/utilities.jl
+++ b/src/rand/utilities.jl
@@ -72,6 +72,34 @@ end
 Every RNG algorithm implements rand_uint(seed, alg, counter, UInt32/UInt64).
 This is the fallback for unsupported RNG algorithms.
 =#
+"""
+    rand_uint(seed::UInt64, alg::CounterRNGAlgorithm, counter::UInt64, ::Type{UIntType}) -> UIntType
+    where {UIntType <: Union{UInt32, UInt64}}
+
+Low-level extension point for counter-based RNG algorithms used by [`CounterRNG`](@ref).
+
+`rand_uint` must deterministically map `(seed, alg, counter)` to a raw unsigned integer of the
+requested width. Custom algorithms should implement methods for both:
+
+- `rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt32})::UInt32`
+- `rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt64})::UInt64`
+
+These methods are used internally by [`rand!`](@ref), [`rand`](@ref), [`randn!`](@ref), and
+[`randn`](@ref) to generate integers, floats, and normal samples.
+
+# Requirements
+- The mapping must be deterministic for fixed `seed`, `alg`, and `counter`.
+- Implement both `UInt32` and `UInt64` widths.
+- The method should return raw random bits; higher-level type conversion is handled by AK separately.
+
+# Notes
+- `counter` is the logical stream position (`rng.offset` plus the zero-based linear index).
+- For block-based algorithms such as Philox or Threefry, the `UInt32` and `UInt64` methods may
+  share an internal block computation.
+- The fallback method throws an `ArgumentError` for algorithms that do not implement `rand_uint`.
+
+See also: [`CounterRNGAlgorithm`](@ref), [`CounterRNG`](@ref).
+"""
 @inline function rand_uint(
     ::UInt64,
     alg::CounterRNGAlgorithm,