diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 43cfba3..c0e5830 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -88,14 +88,13 @@ steps: - JuliaCI/julia#v1: version: "1.10" command: | - julia -e 'using Pkg + julia --pkgimages=no -e 'using Pkg - println("--- :julia: Instantiating environment") - Pkg.add("oneAPI") - Pkg.develop(path=".") - - println("+++ :julia: Running tests") - Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])' + println("--- :julia: Instantiating environment") + Pkg.add("oneAPI") + Pkg.develop(path=".") + println("+++ :julia: Running tests") + Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])' agents: queue: "juliagpu" intel: "*" diff --git a/Project.toml b/Project.toml index 2fccea8..b0c601d 100644 --- a/Project.toml +++ b/Project.toml @@ -8,6 +8,7 @@ ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197" GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f" [weakdeps] @@ -21,6 +22,7 @@ ArgCheck = "2" GPUArraysCore = "0.2.0" KernelAbstractions = "0.9.34" Markdown = "1" +Random = "1" UnsafeAtomics = "0.3.0" julia = "1.10" oneAPI = "1, 2" diff --git a/docs/make.jl b/docs/make.jl index 58b4632..5e794a2 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -23,6 +23,7 @@ makedocs(; "Using Different Backends" => "api/using_backends.md", "General Loops" => "api/foreachindex.md", "Map" => "api/map.md", + "Random Number Generation" => "api/rand.md", "Sorting" => "api/sort.md", "Reduce" => "api/reduce.md", "MapReduce" => "api/mapreduce.md", diff --git a/docs/src/api/rand.md b/docs/src/api/rand.md new file mode 100644 index 0000000..381a797 --- /dev/null +++ b/docs/src/api/rand.md @@ -0,0 +1,180 @@ +### Random Number Generation + +Counter-based random generation for CPU and GPU backends with deterministic stream behaviour for a +fixed `seed`, algorithm, and call sequence. + +Both in-place and allocation forms are supported: +- Uniform: `AK.rand!`, `AK.rand` +- Standard normal: `AK.randn!`, `AK.randn` + +`CounterRNG` stores: +- `seed::UInt64` +- algorithm `alg` +- stream `offset::UInt64` + +The offset starts at `0` and advances by the number of generated values after each call. For +`AK.rand!(rng, v)` and `AK.randn!(rng, v)`, element `v[i]` is generated from logical counter +`rng.offset + UInt64(i - 1)` in linear indexing order. + +This gives stream-consistent chunking: +- filling `100` then `100` elements yields the same `200` values as one `200`-element fill. +- `AK.reset!(rng)` rewinds `rng.offset` to `0x0`. + +Calls that share the same `CounterRNG` instance concurrently are not thread-safe and may race on +`offset`. + +`AK.rand!` and `AK.randn!` accept `rng::AK.CounterRNG`. Passing other RNG container types is not +supported and will throw a `MethodError`. + +#### Auto-seeded convenience behaviour + +Use an explicit `CounterRNG` when reproducibility is required. + +For convenience, calls without an explicit `rng` construct a fresh `CounterRNG()` on each call, +using one auto-seeded `Base.rand(UInt64)` draw. Therefore repeated bare calls intentionally produce +different outputs unless `Random.seed!()` is used first. + +Examples: +- `AK.rand!(v)` +- `AK.randn!(v)` +- `AK.rand(backend, args...)` +- `AK.randn(backend, args...)` + +These do **not** continue a shared stream across calls unless you pass the same explicit +`CounterRNG`. + +#### Allocation forms + +Canonical forms: +- `AK.rand(rng, backend, T, dims...)` +- `AK.randn(rng, backend, T, dims...)` + +Shared defaults: +- omit `rng` -> fresh `CounterRNG()` +- omit `backend` -> CPU backend +- omit `T` -> `Float64` on CPU backend, `Float32` otherwise + +Common shorthands include: +- `AK.rand(dims...)` +- `AK.rand(T, dims...)` +- `AK.rand(backend, dims...)` +- and the corresponding `AK.randn(...)` variants + +For explicit `rng`, both `AK.rand` and `AK.randn` advance `rng.offset` by the number of generated +elements, i.e. `prod(dims)`. + +#### Supported element types + +`AK.rand!` / `AK.rand` support: +- `UInt8`, `UInt16`, `UInt32`, `UInt64` +- `Int8`, `Int16`, `Int32`, `Int64` +- `Float16`, `Float32`, `Float64` +- `Bool` + +`AK.randn!` / `AK.randn` currently support: +- `Float16`, `Float32`, `Float64` + +#### Value generation semantics + +The core generator produces either a `UInt32` or `UInt64`, depending on the requested output type. +That raw unsigned value is then mapped as follows: +- Unsigned integers: returned directly, or truncated if narrower +- Signed integers: the corresponding unsigned bit pattern reinterpreted as signed, then truncated if narrower +- Floats: mantissa construction onto a uniform grid in `[0, 1)` ([read more](https://lomont.org/posts/2017/unit-random/)) +- Bool: `true` if the raw `UInt` draw is odd (`isodd(u)`), otherwise `false` + +`AK.randn!` uses Box-Muller with midpoint-mapped open-interval uniforms in `(0, 1)`. + +#### Algorithms currently available + +- `SplitMix64` ([read more](https://rosettacode.org/wiki/Pseudo-random_numbers/Splitmix64)) +- `Philox` ([read more](https://www.thesalmons.org/john/random123/papers/random123sc11.pdf)) +- `Threefry` ([read more](https://www.thesalmons.org/john/random123/papers/random123sc11.pdf)) + +`Philox` is the default algorithm for `CounterRNG()`. + +#### Statistical testing and security + +- In this repository, `SplitMix64`, `Philox`, and `Threefry` have passed TestU01 BigCrush +- These generators are not intended to be cryptographically secure + +#### Philox keying note + +AK uses `Philox2x32` internally, which has a single 32-bit Philox key word. + +Users may pass any non-negative `Integer` seed with `seed <= typemax(UInt64)`; AK converts it to +`UInt64` and derives the 32-bit Philox key using SplitMix. This wrapper choice is deliberate for +ease of use and deterministic streams, not a change to the Philox round function itself. + +Therefore, AK Philox streams are deterministic and high-quality, but are not guaranteed to be +bit-for-bit identical to a raw Random123 Philox stream unless the same seed-to-key mapping and +counter convention are used. + +#### Custom algorithms + +To define a custom counter RNG: +- define an algorithm type `MyAlg <: AK.CounterRNGAlgorithm` +- implement: + - `AK.rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt32})::UInt32` + - `AK.rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt64})::UInt64` + +Then use it via: +- `AK.CounterRNG(seed; alg=MyAlg(), offset=...)` + +Both widths should be implemented so `AK.rand!` supports all integer and floating-point output +types without fallback or error. + +#### Performance note + +`Philox` is the default because it is high-quality and very fast. `AK.rand!` has been measured at +roughly memory-bound throughput (~390 GB/s) on an Nvidia GeForce RTX 5060, including slightly better +performance than CURAND for large `CuArray{Float32}` fills and substantially faster `CuArray{Int32}` +filling than native `CUDA.rand!` in the benchmarks used for this repository. + +Examples: +```julia +import AcceleratedKernels as AK +using oneAPI +using AMDGPU + +# Reproducible +rng = AK.CounterRNG(0x12345678; alg=AK.Philox()) +v = oneArray{Float32}(undef, 1024) +AK.rand!(rng, v) + +# Stream-consistent chunking +rng = AK.CounterRNG(0x12345678; alg=AK.Philox()) +v1 = oneArray{Float32}(undef, 100) +v2 = oneArray{Float32}(undef, 100) +AK.rand!(rng, v1) +AK.rand!(rng, v2) + +# Convenience: fresh auto-seeded RNG on each call +y = oneArray{Float32}(undef, 1024) +AK.rand!(y) + +# Allocation form +y_cpu_auto = AK.rand(1024) # CPU, Vector{Float64} +y_one = AK.rand(oneAPIBackend(), Float32, 1024) # fresh RNG, allocate + fill oneArray +y_cpu_typed = AK.rand(rng, Float16, 1024) # CPU backend, explicit type, explicit RNG + +# Standard normal filling +z = ROCArray{Float32}(undef, 1024) +AK.randn!(rng, z) + +# Standard normal allocation form +z_cpu_auto = AK.randn(1024) # CPU, Vector{Float64} +z_roc = AK.randn(ROCBackend(), 1024) # fresh RNG, allocate + fill ROCArray{Float32} +z_cpu_typed = AK.randn(rng, Float16, 1024) # CPU backend, explicit type, explicit RNG +``` + +```@docs +AcceleratedKernels.CounterRNG +AcceleratedKernels.CounterRNGAlgorithm +AcceleratedKernels.rand_uint +AcceleratedKernels.reset! +AcceleratedKernels.rand! +AcceleratedKernels.rand +AcceleratedKernels.randn! +AcceleratedKernels.randn +``` \ No newline at end of file diff --git a/prototype/RNGTest/Project.toml b/prototype/RNGTest/Project.toml new file mode 100644 index 0000000..7536db2 --- /dev/null +++ b/prototype/RNGTest/Project.toml @@ -0,0 +1,3 @@ +[deps] +AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" +RNGTest = "97cc5700-e6cb-5ca1-8fb2-7f6b45264ecd" diff --git a/prototype/RNGTest/README.md b/prototype/RNGTest/README.md new file mode 100644 index 0000000..2bc6a7b --- /dev/null +++ b/prototype/RNGTest/README.md @@ -0,0 +1,33 @@ +# AK + RNGTest SmallCrush Prototype + +This folder provides a chunked random stream generator based on `AcceleratedKernels.jl` that can be fed into `RNGTest.jl`. + +The stream is deterministic and effectively unbounded: +- each refill generates `chunk` random `UInt64` values with `AK.rand!` +- each refill advances one persistent `CounterRNG` stream offset +- this is a practical chunked stream for RNGTest callback mode + +`RNGTest.jl` (in this local checkout) expects a callback returning `Float64` in `[0,1]`, so `UInt64` words are mapped to `Float64` via top-53-bit scaling. + +Current status in this harness: `SplitMix64`, `Philox`, and `Threefry` all pass BigCrush using `run_bigcrush.jl`. + +## Run SmallCrush + +From this directory: + +```powershell +julia --project=. run_smallcrush.jl +``` + +## Run BigCrush + +```powershell +julia --project=. run_bigcrush.jl +``` + +Notes: +- Configure `ALG`, `SEED`, and `CHUNK` at the top of + `run_smallcrush.jl` / `run_bigcrush.jl`. +- The stream refills directly into host scratch using `AK.rand!` on CPU. +- `chunk` controls refill amortization and memory usage. +- `chunk=100000000` means ~800 MB host scratch (`UInt64`). diff --git a/prototype/RNGTest/run_bigcrush.jl b/prototype/RNGTest/run_bigcrush.jl new file mode 100644 index 0000000..a1298a0 --- /dev/null +++ b/prototype/RNGTest/run_bigcrush.jl @@ -0,0 +1,26 @@ +using RNGTest + +include("stream.jl") + + +const ALG = :philox +const SEED = 0x1234 +const CHUNK = 10_000_000 +const HOST_SCRATCH = Vector{UInt64}(undef, CHUNK) + + +stream = AKUInt64Stream( + HOST_SCRATCH; + seed=SEED, + alg=ALG, + start_counter=UInt64(0), +) +gen = make_rngtest_generator!(stream) +genname = "AK_Vector_$(ALG)_seed$(SEED)" + +println("Beginning BigCrush. This may take hours...") + +RNGTest.bigcrushTestU01(gen, genname) + +println("refills: ", stream.refill_count) +println("numbers consumed (approx): ", (stream.refill_count - 1) * stream.chunk + (stream.idx - 1)) diff --git a/prototype/RNGTest/run_smallcrush.jl b/prototype/RNGTest/run_smallcrush.jl new file mode 100644 index 0000000..db1251c --- /dev/null +++ b/prototype/RNGTest/run_smallcrush.jl @@ -0,0 +1,23 @@ +using RNGTest + +include("stream.jl") + + +const ALG = :philox +const SEED = 0x1234 +const CHUNK = 100_000_000 +const HOST_SCRATCH = Vector{UInt64}(undef, CHUNK) + + +stream = AKUInt64Stream( + HOST_SCRATCH; + seed=SEED, + alg=ALG, + start_counter=UInt64(0), +) +gen = make_rngtest_generator!(stream) +genname = "AK_Vector_$(ALG)_seed$(SEED)" + +println("Beginning SmallCrush...") + +RNGTest.smallcrushTestU01(gen, genname) diff --git a/prototype/RNGTest/stream.jl b/prototype/RNGTest/stream.jl new file mode 100644 index 0000000..a383645 --- /dev/null +++ b/prototype/RNGTest/stream.jl @@ -0,0 +1,80 @@ +import AcceleratedKernels as AK + + +function make_rng(seed::Integer, alg::Symbol; offset::Integer=0) + if alg === :philox + return AK.CounterRNG(seed; alg=AK.Philox(), offset=offset) + elseif alg === :threefry + return AK.CounterRNG(seed; alg=AK.Threefry(), offset=offset) + elseif alg === :splitmix64 + return AK.CounterRNG(seed; alg=AK.SplitMix64(), offset=offset) + end + throw(ArgumentError("alg must be :philox, :threefry, or :splitmix64; got $alg")) +end + + +mutable struct AKUInt64Stream{R} + rng::R + chunk::Int + idx::Int + host_scratch::Vector{UInt64} + refill_count::Int +end + + +function AKUInt64Stream( + host_scratch::Vector{UInt64}; + seed::Integer=0x1234, + alg::Symbol=:philox, + start_counter::UInt64=0x0000000000000000, +) + chunk = length(host_scratch) + chunk > 0 || throw(ArgumentError("host_scratch must be non-empty")) + rng = make_rng(seed, alg; offset=start_counter) + + return AKUInt64Stream( + rng, + chunk, + chunk + 1, + host_scratch, + 0, + ) +end + + +@inline _u01_from_u64(u::UInt64)::Float64 = Float64(u >>> 11) * 0x1.0p-53 + + +function _fill_chunk!(s::AKUInt64Stream) + AK.rand!(s.rng, s.host_scratch) + return nothing +end + + +function refill!(s::AKUInt64Stream) + _fill_chunk!(s) + s.idx = 1 + s.refill_count += 1 + return s +end + + +function next_u64!(s::AKUInt64Stream)::UInt64 + if s.idx > s.chunk + refill!(s) + end + @inbounds u = s.host_scratch[s.idx] + s.idx += 1 + return u +end + + +@inline next_float64!(s::AKUInt64Stream)::Float64 = _u01_from_u64(next_u64!(s)) + + +function make_rngtest_generator!(s::AKUInt64Stream) + if s.idx > s.chunk + refill!(s) + end + return () -> next_float64!(s) +end diff --git a/prototype/rand/Project.toml b/prototype/rand/Project.toml new file mode 100644 index 0000000..d1926b1 --- /dev/null +++ b/prototype/rand/Project.toml @@ -0,0 +1,7 @@ +[deps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" diff --git a/prototype/rand/randn.jl b/prototype/rand/randn.jl new file mode 100644 index 0000000..d552b5f --- /dev/null +++ b/prototype/rand/randn.jl @@ -0,0 +1,51 @@ +using BenchmarkTools +using CUDA + +import AcceleratedKernels as AK + + +const N = 100_000_000 +const GPU_BLOCK_SIZE = 256 + +const RNG_PHILOX = AK.CounterRNG(0x12345678; alg=AK.Philox(), offset=0x0) + +TestType = Float32 + +x_cuda = CuArray{TestType}(undef, N) +x_philox = CuArray{TestType}(undef, N) +x_cpu = Vector{TestType}(undef, N) + + +function run_cuda_randn!(x) + CUDA.randn!(x) + CUDA.synchronize() + return x +end + + +function run_ak_randn_gpu!(rng, x) + AK.randn!(rng, x; block_size=GPU_BLOCK_SIZE) + AK.synchronize(AK.get_backend(x)) + return x +end + + +function run_ak_randn_cpu!(rng, x) + AK.randn!(rng, x) + return x +end + +# warmup compile +run_cuda_randn!(x_cuda) +run_ak_randn_gpu!(RNG_PHILOX, x_philox) + +println("N = ", N) +println("CPU threads: ", Threads.nthreads()) + +println("\nCUDA.randn! benchmark (CuArray{$TestType}, in-place)") +display(@benchmark run_cuda_randn!($x_cuda)) + +println("\nAK.randn! Philox benchmark (GPU, CuArray{$TestType})") +display(@benchmark run_ak_randn_gpu!($RNG_PHILOX, $x_philox)) + + diff --git a/prototype/rand/test_rand.jl b/prototype/rand/test_rand.jl new file mode 100644 index 0000000..0d916f2 --- /dev/null +++ b/prototype/rand/test_rand.jl @@ -0,0 +1,57 @@ +using BenchmarkTools +using CUDA + +import AcceleratedKernels as AK + + +const N = 100_000_000 +const GPU_BLOCK_SIZE = 256 + + +const RNG_PHILOX = AK.CounterRNG(0x12345678; alg=AK.Philox()) + + +TestType = Float32 +x_cuda = CuArray{TestType}(undef, N) +x_philox = CuArray{TestType}(undef, N) +x_cpu = Vector{TestType}(undef, N) + + +function run_cuda_rand!(x) + CUDA.rand!(x) + CUDA.synchronize() + return x +end + + +function run_ak_rand_gpu!(rng, x) + AK.rand!(rng, x; block_size=GPU_BLOCK_SIZE) + AK.synchronize(AK.get_backend(x)) + return x +end + + +function run_ak_rand_cpu!(rng, x) + AK.rand!(rng, x) + return x +end + + +# warmup +run_cuda_rand!(x_cuda) +run_ak_rand_gpu!(RNG_PHILOX, x_philox) +run_ak_rand_cpu!(RNG_PHILOX, x_cpu) + + +println("N = ", N) +println("CPU threads: ", Threads.nthreads()) + +println("\nCUDA.rand! benchmark (CuArray{$TestType}, in-place)") +display(@benchmark run_cuda_rand!($x_cuda)) + +println("\nAK.rand! Philox benchmark (GPU, CuArray{$TestType})") +display(@benchmark run_ak_rand_gpu!($RNG_PHILOX, $x_philox)) + +println("\nAK.rand! benchmark (CPU, Vector{$TestType}, Philox)") +display(@benchmark run_ak_rand_cpu!($RNG_PHILOX, $x_cpu)) + diff --git a/src/AcceleratedKernels.jl b/src/AcceleratedKernels.jl index d662c2a..7262cbe 100644 --- a/src/AcceleratedKernels.jl +++ b/src/AcceleratedKernels.jl @@ -15,6 +15,7 @@ using ArgCheck: @argcheck using GPUArraysCore: AnyGPUArray, @allowscalar using KernelAbstractions using KernelAbstractions: @context +using Random import UnsafeAtomics @@ -31,6 +32,7 @@ include("map.jl") include("sort/sort.jl") include("reduce/reduce.jl") include("accumulate/accumulate.jl") +include("rand/rand.jl") include("searchsorted.jl") include("predicates.jl") include("arithmetics.jl") diff --git a/src/rand/philox.jl b/src/rand/philox.jl new file mode 100644 index 0000000..0de9ff4 --- /dev/null +++ b/src/rand/philox.jl @@ -0,0 +1,59 @@ +struct Philox <: CounterRNGAlgorithm end + + +# Philox magic numbers +const PHILOX_M0 = UInt32(0xD256D193) +const PHILOX_W0 = UInt32(0x9E3779B9) +const PHILOX_ROUNDS = 7 + + +@inline function _philox2x32_round(x0::UInt32, x1::UInt32, k0::UInt32) + lo = PHILOX_M0 * x0 + hi = _mulhi_u32(PHILOX_M0, x0) + y0 = xor(xor(hi, k0), x1) + y1 = lo + return y0, y1 +end + + +# Evaluate one Philox block at `counter`, returning two 32-bit lanes `(x0, x1)` +@inline function _philox2x32_block( + seed::UInt64, + counter::UInt64, +)::Tuple{UInt32, UInt32} + x0 = _u32_lo(counter) + x1 = _u32_hi(counter) + + k0 = splitmix32_from_u64(seed) + + @inbounds for _ in 1:PHILOX_ROUNDS + x0, x1 = _philox2x32_round(x0, x1, k0) + k0 += PHILOX_W0 + end + + return x0, x1 +end + + +# Return lane 0 from the single Philox block at `counter` +@inline function rand_uint( + seed::UInt64, + alg::Philox, + counter::UInt64, + ::Type{UInt32}, +)::UInt32 + x0, _ = _philox2x32_block(seed, counter) + return x0 +end + + +# Build UInt64 from the two lanes `(x0, x1)` of the same Philox block at `counter` +@inline function rand_uint( + seed::UInt64, + alg::Philox, + counter::UInt64, + ::Type{UInt64}, +)::UInt64 + x0, x1 = _philox2x32_block(seed, counter) + return _u64_from_u32s(x0, x1) +end diff --git a/src/rand/rand.jl b/src/rand/rand.jl new file mode 100644 index 0000000..b2b14bd --- /dev/null +++ b/src/rand/rand.jl @@ -0,0 +1,221 @@ +const ALLOWED_RAND_SCALARS = Union{ + UInt8, UInt16, UInt32, UInt64, + Int8, Int16, Int32, Int64, + Float16, Float32, Float64, + Bool +} + + +""" + CounterRNGAlgorithm + +Abstract supertype for algorithms used by [`CounterRNG`](@ref). + +To define a custom counter-based RNG algorithm, subtype `CounterRNGAlgorithm` and implement +[`rand_uint`](@ref) for both `UInt32` and `UInt64` outputs. +""" +abstract type CounterRNGAlgorithm end + + +""" + CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox()) + +Counter-based RNG for [`rand!`](@ref). + +`CounterRNG` stores: +- `seed` +- algorithm (`alg`) +- stream `offset` + +The default algorithm is `Philox()`. + +`seed` may be any non-negative `Integer`. It is normalised to `UInt64` internally. +`offset` is initialised to `0` by default and advances by `length(v)` after each [`rand!`](@ref) +call. + +Constructors: +- `CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox(), offset::Integer=0)` + Uses an explicit non-negative seed and offset. +- `CounterRNG(; alg::CounterRNGAlgorithm=Philox(), offset::Integer=0)` + Auto-seeds once using `Base.rand(UInt64)`, with default `offset == 0`. +""" +mutable struct CounterRNG{A <: CounterRNGAlgorithm} + const seed::UInt64 + const alg::A + offset::UInt64 +end + + +function CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox(), offset::Integer=0) + @argcheck seed >= 0 "Seed must be a non-negative integer" + @argcheck offset >= 0 "Offset must be a non-negative integer" + CounterRNG(UInt64(seed), alg, UInt64(offset)) +end + + +function CounterRNG(; alg::CounterRNGAlgorithm=Philox(), offset::Integer=0) + CounterRNG(Base.rand(UInt64); alg, offset) +end + + +CounterRNG(seed::Integer, alg::CounterRNGAlgorithm) = CounterRNG(seed; alg) + + +""" + reset!(rng::CounterRNG) + +Reset `rng.offset` to `0x0`. +""" +@inline function reset!(rng::CounterRNG) + rng.offset = UInt64(0) + return rng +end + + + + +# Shared helpers +include("utilities.jl") + +# Algorithm-specific integer generators +include("splitmix.jl") +include("philox.jl") +include("threefry.jl") + +# Normally distributed scalar generators and randn! +include("randn.jl") + + + + +""" + rand!( + rng::CounterRNG, + v::AbstractArray{T}, + backend::Backend=get_backend(v); + + # CPU settings + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + + # Implementation choice + prefer_threads::Bool=true, + + # GPU settings + block_size::Int=256, + ) + +Fill `v` in-place with pseudo-random values using a counter-based RNG stream. For `v[i]`, the +counter is `rng.offset + UInt64(i - 1)` in linear indexing order. + +After filling `v`, `rng.offset` advances by `length(v)`. It can be called without `rng`, in which +case the default `CounterRNG` is used. + +Supported scalar element types are: +- `UInt8`, `UInt16`, `UInt32`, `UInt64` +- `Int8`, `Int16`, `Int32`, `Int64` +- `Float16`, `Float32`, `Float64` +- `Bool` + +Semantics: +- Unsigned integers: raw random bit patterns of requested width. +- Signed integers: corresponding unsigned patterns reinterpreted as signed. +- Floats: mantissa-based conversion from `UInt32`/`UInt64` into `[0, 1)`, uniform over the + produced mantissa grid (not over all representable floats). +- Bool: `true` if the raw `UInt` draw is odd (`isodd(u)`), otherwise `false`. + +""" +function rand!( + rng::CounterRNG, + v::AbstractArray{T}, + backend::Backend=get_backend(v); + + # CPU settings + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + prefer_threads::Bool=true, + + # GPU settings + block_size::Int=256, +) where T + + @argcheck T <: ALLOWED_RAND_SCALARS "Unsupported eltype $T. Supported: $(ALLOWED_RAND_SCALARS)" + + # Local isbits captures from mutable rng object + seed, alg, initial_offset = rng.seed, rng.alg, rng.offset + + foreachindex( + v, backend; + max_tasks, min_elems, prefer_threads, block_size, + ) do i + @inbounds v[i] = rand_scalar(seed, alg, initial_offset + _counter_from_index(i), T) + end + + rng.offset += UInt64(length(v)) + + v +end + + +rand!(v::AbstractArray, args...; kwargs...) = rand!(CounterRNG(), v, args...; kwargs...) + + +""" + rand( + rng::CounterRNG, + backend::Backend, + ::Type{T}, + dims::Integer...; + + # CPU settings + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + prefer_threads::Bool=true, + + # GPU settings + block_size::Int=256, + ) where T + +Allocate an array of element type `T` on `backend` with shape `dims`, fill it in-place via +[`rand!`](@ref), and return it. + +Convenience overloads: +- `rng` omitted: uses a fresh `CounterRNG()`. +- `backend` omitted: defaults to `CPU_BACKEND`. +- `T` omitted: defaults by backend (`Float64` on CPU backend, `Float32` otherwise). +""" +function rand( + rng::CounterRNG, + backend::Backend, + ::Type{T}, + dims::Integer...; + + # CPU settings + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + prefer_threads::Bool=true, + + # GPU settings + block_size::Int=256, +) where T + @argcheck T <: ALLOWED_RAND_SCALARS "Unsupported eltype $T. Supported: $(ALLOWED_RAND_SCALARS)" + return _allocate_and_fill_rand( + rand!, rng, backend, T, dims...; + max_tasks, min_elems, prefer_threads, block_size, + ) +end + + +function rand(rng::CounterRNG, backend::Backend, dims::Integer...; kwargs...) + DefaultScalarType = (backend == CPU_BACKEND) ? Float64 : Float32 + rand(rng, backend, DefaultScalarType, dims...; kwargs...) +end + + +rand(rng::CounterRNG, args...; kwargs...) = rand(rng, CPU_BACKEND, args...; kwargs...) +rand(backend::Backend, args...; kwargs...) = rand(CounterRNG(), backend, args...; kwargs...) +rand(::Type{T}, dims::Integer...; kwargs...) where {T} = rand(CPU_BACKEND, T, dims...; kwargs...) +rand(dims::Integer...; kwargs...) = rand(CPU_BACKEND, dims...; kwargs...) +rand(; kwargs...) = throw(ArgumentError("rand requires at least one dimension")) + + diff --git a/src/rand/randn.jl b/src/rand/randn.jl new file mode 100644 index 0000000..b2b5dbb --- /dev/null +++ b/src/rand/randn.jl @@ -0,0 +1,248 @@ +const ALLOWED_RANDN_SCALARS = Union{ + Float16, Float32, Float64 +} + +@inline function randn_pair( + seed::UInt64, + alg::CounterRNGAlgorithm, + pair_counter::UInt64, + ::Type{Float16}, +)::Tuple{Float16, Float16} + z0, z1 = randn_pair(seed, alg, pair_counter, Float32) + return Float16(z0), Float16(z1) +end + + +@inline function randn_pair( + seed::UInt64, + alg::CounterRNGAlgorithm, + pair_counter::UInt64, + ::Type{Float32}, +)::Tuple{Float32, Float32} + u = rand_uint(seed, alg, pair_counter, UInt64) + u1 = _uint32_to_open_unit_float32_midpoint(_u32_lo(u)) + u2 = _uint32_to_open_unit_float32_midpoint(_u32_hi(u)) + radius = sqrt(-2.0f0 * log(u1)) + theta = Float32(2pi) * u2 + stheta, ctheta = sincos(theta) + return radius * ctheta, radius * stheta +end + + +@inline function randn_pair( + seed::UInt64, + alg::CounterRNGAlgorithm, + pair_counter::UInt64, + ::Type{Float64}, +)::Tuple{Float64, Float64} + c0 = pair_counter << 1 + u1 = rand_float_open01(seed, alg, c0, Float64) + u2 = rand_float_open01(seed, alg, c0 + UInt64(1), Float64) + radius = sqrt(-2.0 * log(u1)) + theta = Float64(2pi) * u2 + stheta, ctheta = sincos(theta) + return radius * ctheta, radius * stheta +end + + +@inline function randn_pair(::UInt64, ::CounterRNGAlgorithm, ::UInt64, ::Type{T}) where {T} + throw(ArgumentError( + "Unsupported normal random type $(T). Supported: $(ALLOWED_RANDN_SCALARS)" + )) +end + + +@inline function randn_scalar( + seed::UInt64, + alg::CounterRNGAlgorithm, + normal_counter::UInt64, + ::Type{T}, +)::T where {T <: ALLOWED_RANDN_SCALARS} + pair_counter = normal_counter >> 1 + z0, z1 = randn_pair(seed, alg, pair_counter, T) + return iszero(normal_counter & UInt64(0x1)) ? z0 : z1 +end + + +@inline function randn_scalar(::UInt64, ::CounterRNGAlgorithm, ::UInt64, ::Type{T}) where {T} + throw(ArgumentError( + "Unsupported normal random scalar type $(T). Supported: $(ALLOWED_RANDN_SCALARS)" + )) +end + + +# `Val{ODD}` keeps parity in the type domain so each specialization (`ODD==0` / `ODD==1`) +# can fold index bias at compile time. +# - `Val{0}` => even-offset pair writes at indices `(2i-1, 2i)` so bias is `-1` +# - `Val{1}` => odd-offset pair writes at indices `(2i, 2i+1)` after prefix handling so bias is `0` +@inline _randn_i0_bias(::Val{0}) = -1 +@inline _randn_i0_bias(::Val{1}) = 0 + + +@inline function _randn_core!( + v::AbstractArray{T}, seed, alg, initial_offset, + backend, max_tasks, min_elems, prefer_threads, block_size, + ::Val{ODD}, +) where {T, ODD} + + len = length(v) + prefix_len = ODD + + # If offset is odd, need to individually handle the first element. + prefix_len == 1 && @allowscalar @inbounds v[1] = randn_scalar(seed, alg, initial_offset, T) + + # Stream is now even-aligned, so can foreachindex through the pairs. + pair_start = (initial_offset + UInt64(prefix_len)) >> 1 + + # Capture `Val(ODD)` into the closure so bias stays a compile-time constant inside the loop. + odd_val = Val(ODD) + i0_bias = _randn_i0_bias(odd_val) + remaining_len = len - prefix_len + pair_count = remaining_len >> 1 + + if pair_count > 0 + foreachindex( + Base.OneTo(pair_count), backend; + max_tasks, min_elems, prefer_threads, block_size, + ) do i + pair_counter = pair_start + _counter_from_index(i) + z0, z1 = randn_pair(seed, alg, pair_counter, T) + i0 = (i << 1) + _randn_i0_bias(odd_val) + @inbounds v[i0] = z0 + @inbounds v[i0 + 1] = z1 + end + end + + # If an extra element remains after pair writing, fill it individually. + tail_index = (pair_count << 1) + i0_bias + 2 + if tail_index <= len + tail_counter = initial_offset + UInt64(tail_index - 1) + @allowscalar @inbounds v[tail_index] = randn_scalar(seed, alg, tail_counter, T) + end + + return v +end + + +""" + randn!( + rng::CounterRNG, + v::AbstractArray{T}, + backend::Backend=get_backend(v); + + # CPU settings + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + prefer_threads::Bool=true, + + # GPU settings + block_size::Int=256, + ) where {T <: AbstractFloat} + +Fill `v` in-place with pseudo-random samples from a standard normal distribution. + +For `v[i]`, the normal stream counter is `rng.offset + UInt64(i - 1)` in linear indexing order. +Values are generated using Box-Muller from midpoint-open uniforms in `(0, 1)`. + +After filling `v`, `rng.offset` advances by `length(v)`. + +It can be called without an `rng`, in which case the default `CounterRNG` will be used. +""" +function randn!( + rng::CounterRNG, + v::AbstractArray{T}, + backend::Backend=get_backend(v); + + # CPU settings + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + prefer_threads::Bool=true, + + # GPU settings + block_size::Int=256, +) where T + + @argcheck T <: ALLOWED_RANDN_SCALARS "Unsupported eltype $T. Supported: $(ALLOWED_RANDN_SCALARS)" + + isempty(v) && return v + + # Local isbits captures from mutable rng object. + seed, alg, initial_offset = rng.seed, rng.alg, rng.offset + + core_args = ( + v, seed, alg, initial_offset, backend, max_tasks, min_elems, prefer_threads, block_size + ) + + # Dispatch depending on required initial index bias + if iseven(initial_offset) + _randn_core!(core_args..., Val(0)) + else + _randn_core!(core_args..., Val(1)) + end + + rng.offset += UInt64(length(v)) + + v +end + + +randn!(v::AbstractArray, args...; kwargs...) = randn!(CounterRNG(), v, args...; kwargs...) + + +""" + randn( + rng::CounterRNG, + backend::Backend, + ::Type{T}, + dims::Integer...; + + # CPU settings + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + prefer_threads::Bool=true, + + # GPU settings + block_size::Int=256, + ) where T + +Allocate an array of element type `T` on `backend` with shape `dims`, fill it in-place via +[`randn!`](@ref), and return it. + +Convenience overloads: +- `rng` omitted: uses a fresh `CounterRNG()`. +- `backend` omitted: defaults to `CPU_BACKEND`. +- `T` omitted: defaults by backend (`Float64` on CPU backend, `Float32` otherwise). +""" +function randn( + rng::CounterRNG, + backend::Backend, + ::Type{T}, + dims::Integer...; + + # CPU settings + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + prefer_threads::Bool=true, + + # GPU settings + block_size::Int=256, +) where T + @argcheck T <: ALLOWED_RANDN_SCALARS "Unsupported eltype $T. Supported: $(ALLOWED_RANDN_SCALARS)" + return _allocate_and_fill_rand( + randn!, rng, backend, T, dims...; + max_tasks, min_elems, prefer_threads, block_size, + ) +end + + +function randn(rng::CounterRNG, backend::Backend, dims::Integer...; kwargs...) + DefaultScalarType = (backend == CPU_BACKEND) ? Float64 : Float32 + randn(rng, backend, DefaultScalarType, dims...; kwargs...) +end + + +randn(rng::CounterRNG, args...; kwargs...) = randn(rng, CPU_BACKEND, args...; kwargs...) +randn(backend::Backend, args...; kwargs...) = randn(CounterRNG(), backend, args...; kwargs...) +randn(::Type{T}, dims::Integer...; kwargs...) where {T} = randn(CPU_BACKEND, T, dims...; kwargs...) +randn(dims::Integer...; kwargs...) = randn(CPU_BACKEND, dims...; kwargs...) +randn(; kwargs...) = throw(ArgumentError("randn requires at least one dimension")) diff --git a/src/rand/splitmix.jl b/src/rand/splitmix.jl new file mode 100644 index 0000000..cc474be --- /dev/null +++ b/src/rand/splitmix.jl @@ -0,0 +1,44 @@ +struct SplitMix64 <: CounterRNGAlgorithm end + +# SplitMix64 magic numbers +const SPLITMIX64_INCREMENT = UInt64(0x9e3779b97f4a7c15) +const SPLITMIX64_MIX_A = UInt64(0xbf58476d1ce4e5b9) +const SPLITMIX64_MIX_B = UInt64(0x94d049bb133111eb) + + +@inline function _splitmix64_mix(x::UInt64)::UInt64 + x = xor(x, x >> 30) + x *= SPLITMIX64_MIX_A + x = xor(x, x >> 27) + x *= SPLITMIX64_MIX_B + x = xor(x, x >> 31) + return x +end + + +# Derive a 32-bit seed word from a 64-bit seed using SplitMix64 mixing. +@inline function splitmix32_from_u64(seed::UInt64)::UInt32 + return _u32_hi(_splitmix64_mix(seed + SPLITMIX64_INCREMENT)) +end + + +# Natural SplitMix64 output path: compute 64 random bits directly from one counter +@inline function rand_uint( + seed::UInt64, + alg::SplitMix64, + counter::UInt64, + ::Type{UInt64}, +)::UInt64 + return _splitmix64_mix(counter + seed + SPLITMIX64_INCREMENT) +end + + +# UInt32 path is derived from the high 32 bits of the UInt64 SplitMix output +@inline function rand_uint( + seed::UInt64, + alg::SplitMix64, + counter::UInt64, + ::Type{UInt32}, +)::UInt32 + return _u32_hi(rand_uint(seed, alg, counter, UInt64)) +end diff --git a/src/rand/threefry.jl b/src/rand/threefry.jl new file mode 100644 index 0000000..f6ef632 --- /dev/null +++ b/src/rand/threefry.jl @@ -0,0 +1,75 @@ +struct Threefry <: CounterRNGAlgorithm end + +# Threefry magic numbers +const THREEFRY_PARITY = UInt32(0x1BD11BDA) +const THREEFRY_ROTATIONS = ( + UInt32(13), UInt32(15), UInt32(26), UInt32(6), + UInt32(17), UInt32(29), UInt32(16), UInt32(24), +) +const THREEFRY_ROUNDS = 20 + + +@inline function _threefry_key_word(k0::UInt32, k1::UInt32, k2::UInt32, idx::UInt32)::UInt32 + idx == UInt32(0) && return k0 + idx == UInt32(1) && return k1 + return k2 +end + + +# Evaluate one Threefry block at `counter`, returning two 32-bit lanes `(x0, x1)` +@inline function _threefry2x32_block( + seed::UInt64, + counter::UInt64, +)::Tuple{UInt32, UInt32} + + x0 = _u32_lo(counter) + x1 = _u32_hi(counter) + + k0 = _u32_lo(seed) + k1 = _u32_hi(seed) + k2 = xor(THREEFRY_PARITY, xor(k0, k1)) + + x0 += k0 + x1 += k1 + + @inbounds for round in 0:(THREEFRY_ROUNDS - 1) + round_u32 = UInt32(round) + rot = THREEFRY_ROTATIONS[Int((round_u32 & UInt32(0x7)) + UInt32(1))] + x0 += x1 + x1 = xor(_rotl32(x1, rot), x0) + + if (round_u32 & UInt32(0x3)) == UInt32(0x3) + s = (round_u32 >>> 2) + UInt32(1) + i0 = s % UInt32(3) + i1 = (s + UInt32(1)) % UInt32(3) + x0 += _threefry_key_word(k0, k1, k2, i0) + x1 += _threefry_key_word(k0, k1, k2, i1) + s + end + end + + return x0, x1 +end + + +# Return lane 0 from the single Threefry block at `counter` +@inline function rand_uint( + seed::UInt64, + alg::Threefry, + counter::UInt64, + ::Type{UInt32}, +)::UInt32 + x0, _ = _threefry2x32_block(seed, counter) + return x0 +end + + +# Build UInt64 from the two lanes `(x0, x1)` of the same Threefry block at `counter` +@inline function rand_uint( + seed::UInt64, + alg::Threefry, + counter::UInt64, + ::Type{UInt64}, +)::UInt64 + x0, x1 = _threefry2x32_block(seed, counter) + return _u64_from_u32s(x0, x1) +end diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl new file mode 100644 index 0000000..35d9084 --- /dev/null +++ b/src/rand/utilities.jl @@ -0,0 +1,240 @@ +# lo: rightmost 32 bits, hi: leftmost 32 bits +@inline _u32_lo(x::UInt64)::UInt32 = UInt32(x & UInt64(0xffffffff)) +@inline _u32_hi(x::UInt64)::UInt32 = UInt32(x >> 32) + +# Construct UInt64 by bit concatenation of two UInt32s +@inline _u64_from_u32s(lo::UInt32, hi::UInt32)::UInt64 = (UInt64(hi) << 32) | UInt64(lo) + +# Leftmost 32 bits of a*b cast to UInt64s +@inline _mulhi_u32(a::UInt32, b::UInt32)::UInt32 = UInt32((UInt64(a) * UInt64(b)) >> 32) + +# 32-bit rotate left by r positions +@inline _rotl32(x::UInt32, r::UInt32)::UInt32 = bitrotate(x, Int32(r)) + +# Get counter used for CounterRNG from element index +@inline _counter_from_index(i)::UInt64 = UInt64(i - one(i)) + + +# Shared allocation + fill helper for rand/randn convenience constructors. +@inline function _allocate_and_fill_rand( + fill!, + rng::CounterRNG, + backend::Backend, + ::Type{T}, + dims::Integer...; + + # CPU settings + max_tasks::Int=Threads.nthreads(), + min_elems::Int=1, + prefer_threads::Bool=true, + + # GPU settings + block_size::Int=256, +) where {T} + dims_int = Base.map(Int, dims) + v = KernelAbstractions.allocate(backend, T, dims_int) + fill!(rng, v, backend; max_tasks, min_elems, prefer_threads, block_size) + return v +end + + + + +@inline _rand_scalar_uint_type(::Type{UInt8}) = UInt32 +@inline _rand_scalar_uint_type(::Type{UInt16}) = UInt32 +@inline _rand_scalar_uint_type(::Type{UInt32}) = UInt32 +@inline _rand_scalar_uint_type(::Type{Int8}) = UInt32 +@inline _rand_scalar_uint_type(::Type{Int16}) = UInt32 +@inline _rand_scalar_uint_type(::Type{Int32}) = UInt32 +@inline _rand_scalar_uint_type(::Type{Float16}) = UInt32 +@inline _rand_scalar_uint_type(::Type{Float32}) = UInt32 +@inline _rand_scalar_uint_type(::Type{UInt64}) = UInt64 +@inline _rand_scalar_uint_type(::Type{Int64}) = UInt64 +@inline _rand_scalar_uint_type(::Type{Float64}) = UInt64 +@inline _rand_scalar_uint_type(::Type{Bool}) = UInt32 + + +@inline _rand_scalar_from_uint(::Type{UInt8}, u::UInt32)::UInt8 = trunc(UInt8, u >> 24) +@inline _rand_scalar_from_uint(::Type{UInt16}, u::UInt32)::UInt16 = trunc(UInt16, u >> 16) +@inline _rand_scalar_from_uint(::Type{UInt32}, u::UInt32)::UInt32 = u +@inline _rand_scalar_from_uint(::Type{UInt64}, u::UInt64)::UInt64 = u +@inline _rand_scalar_from_uint(::Type{Int8}, u::UInt32)::Int8 = reinterpret(Int8, trunc(UInt8, u >> 24)) +@inline _rand_scalar_from_uint(::Type{Int16}, u::UInt32)::Int16 = reinterpret(Int16, trunc(UInt16, u >> 16)) +@inline _rand_scalar_from_uint(::Type{Int32}, u::UInt32)::Int32 = reinterpret(Int32, u) +@inline _rand_scalar_from_uint(::Type{Int64}, u::UInt64)::Int64 = reinterpret(Int64, u) +@inline _rand_scalar_from_uint(::Type{Float16}, u::UInt32)::Float16 = uint32_to_unit_float16(u) +@inline _rand_scalar_from_uint(::Type{Float32}, u::UInt32)::Float32 = uint32_to_unit_float32(u) +@inline _rand_scalar_from_uint(::Type{Float64}, u::UInt64)::Float64 = uint64_to_unit_float64(u) +@inline _rand_scalar_from_uint(::Type{Bool}, u::UInt32)::Bool = isodd(u) + + +#= +Every RNG algorithm implements rand_uint(seed, alg, counter, UInt32/UInt64). +This is the fallback for unsupported RNG algorithms. +=# +""" + rand_uint(seed::UInt64, alg::CounterRNGAlgorithm, counter::UInt64, ::Type{UIntType}) -> UIntType + where {UIntType <: Union{UInt32, UInt64}} + +Low-level extension point for counter-based RNG algorithms used by [`CounterRNG`](@ref). + +`rand_uint` must deterministically map `(seed, alg, counter)` to a raw unsigned integer of the +requested width. Custom algorithms should implement methods for both: + +- `rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt32})::UInt32` +- `rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt64})::UInt64` + +These methods are used internally by [`rand!`](@ref), [`rand`](@ref), [`randn!`](@ref), and +[`randn`](@ref) to generate integers, floats, and normal samples. + +# Requirements +- The mapping must be deterministic for fixed `seed`, `alg`, and `counter`. +- Implement both `UInt32` and `UInt64` widths. +- The method should return raw random bits; higher-level type conversion is handled by AK separately. + +# Notes +- `counter` is the logical stream position (typically the array index). +- For block-based algorithms such as Philox or Threefry, the `UInt32` and `UInt64` methods may + share an internal block computation. +- The fallback method throws an `ArgumentError` for algorithms that do not implement `rand_uint`. + +See also: [`CounterRNGAlgorithm`](@ref), [`CounterRNG`](@ref). +""" +@inline function rand_uint( + ::UInt64, + alg::CounterRNGAlgorithm, + ::UInt64, + ::Type{UIntType} +)::UIntType where {UIntType <: Union{UInt32, UInt64}} + throw(ArgumentError("No rand_uint implementation for RNG algorithm: $(typeof(alg))")) +end + + +#= +Shared scalar generation: +1) map requested scalar type to corresponding raw UInt width +2) fill the UInt with random bits +3) convert bits into requested scalar representation +=# +@inline function rand_scalar( + seed::UInt64, + alg::CounterRNGAlgorithm, + counter::UInt64, + ::Type{T} +)::T where {T <: ALLOWED_RAND_SCALARS} + + UIntType = _rand_scalar_uint_type(T) + u = rand_uint(seed, alg, counter, UIntType) + + return _rand_scalar_from_uint(T, u) +end + + +@inline function rand_scalar(::UInt64, ::CounterRNGAlgorithm, ::UInt64, ::Type{T}) where {T} + throw(ArgumentError( + "Unsupported random scalar type $(T). Supported: $(ALLOWED_RAND_SCALARS)" + )) +end + + +# Convert random UInt32 bits to Float16 in [0, 1) by mantissa construction. +@inline function uint32_to_unit_float16(u::UInt32)::Float16 + + # Keep 10 random bits for the mantissa (drop 22 rightmost bits from the UInt32) + # and combine with the bit pattern of Float16(1.0) (sign=0, exponent=15). + bits = UInt16(0x3c00) | UInt16(u >> 22) + + # Interpret as 1.mantissa, then subtract 1 for [0, 1) + reinterpret(Float16, bits) - Float16(1) +end + + +# Convert random UInt32 bits to Float32 in [0, 1) by mantissa construction. +@inline function uint32_to_unit_float32(u::UInt32)::Float32 + + # Keep 23 random bits for the mantissa (drop 9 rightmost bits from the UInt32) + # and combine with the bit pattern of 1.0f0 (sign=0, exponent=127). + bits = UInt32(0x3f800000) | (u >> 9) + + # Interpret as 1.mantissa, then subtract 1 for [0, 1) + reinterpret(Float32, bits) - 1.0f0 +end + + +# Convert random UInt64 bits to Float64 in [0, 1) by mantissa construction. +@inline function uint64_to_unit_float64(u::UInt64)::Float64 + + # Keep 52 random bits for the mantissa (drop 12 rightmost bits from the UInt64) + # and combine with the bit pattern of 1.0 (sign=0, exponent=1023). + bits = UInt64(0x3ff0000000000000) | (u >> 12) + + # Interpret as 1.mantissa, then subtract 1 for [0, 1) + reinterpret(Float64, bits) - 1.0 +end + + + + + +### Helpers for randn ### + + +# Midpoint-mapped open-interval Float sampling in (0, 1), used for Box-Muller +const OPEN01_MAX_MIDPOINT_INDEX_F32 = UInt32(0x00fffffe) +const OPEN01_MAX_MIDPOINT_INDEX_F64 = UInt64(0x001ffffffffffffe) +const OPEN01_MIDPOINT_SCALE_F32 = ldexp(Float32(1), -24) +const OPEN01_MIDPOINT_SCALE_F64 = ldexp(Float64(1), -53) + + +# Convert random UInt32 bits to Float32 in (0, 1) using midpoint mapping on a 24-bit grid. +@inline function _uint32_to_open_unit_float32_midpoint(u::UInt32)::Float32 + # `min` keeps the top midpoint below one after Float32 rounding. + k = min(u >> 8, OPEN01_MAX_MIDPOINT_INDEX_F32) + return (Float32(k) + 0.5f0) * OPEN01_MIDPOINT_SCALE_F32 +end + + +# Convert random UInt64 bits to Float64 in (0, 1) using midpoint mapping on a 53-bit grid. +@inline function _uint64_to_open_unit_float64_midpoint(u::UInt64)::Float64 + # `min` keeps the top midpoint below one after Float64 rounding. + k = min(u >> 11, OPEN01_MAX_MIDPOINT_INDEX_F64) + return (Float64(k) + 0.5) * OPEN01_MIDPOINT_SCALE_F64 +end + + +# Float16 path reuses Float32 midpoint sampling for robust math in Box-Muller. +@inline function rand_float_open01( + seed::UInt64, + alg::CounterRNGAlgorithm, + counter::UInt64, + ::Type{Float16}, +)::Float16 + return Float16(rand_float_open01(seed, alg, counter, Float32)) +end + + +@inline function rand_float_open01( + seed::UInt64, + alg::CounterRNGAlgorithm, + counter::UInt64, + ::Type{Float32}, +)::Float32 + return _uint32_to_open_unit_float32_midpoint(rand_uint(seed, alg, counter, UInt32)) +end + + +@inline function rand_float_open01( + seed::UInt64, + alg::CounterRNGAlgorithm, + counter::UInt64, + ::Type{Float64}, +)::Float64 + return _uint64_to_open_unit_float64_midpoint(rand_uint(seed, alg, counter, UInt64)) +end + + +@inline function rand_float_open01(::UInt64, ::CounterRNGAlgorithm, ::UInt64, ::Type{T}) where {T} + throw(ArgumentError( + "Unsupported open-interval random type $(T). Supported: Union{Float16, Float32, Float64}" + )) +end diff --git a/test/rand.jl b/test/rand.jl new file mode 100644 index 0000000..a6b62c6 --- /dev/null +++ b/test/rand.jl @@ -0,0 +1,488 @@ +const RAND_ALGS = (AK.SplitMix64(), AK.Philox(), AK.Threefry()) +const RAND_SCALAR_TYPES_ALL = ( + UInt8, UInt16, UInt32, UInt64, + Int8, Int16, Int32, Int64, + Float16, Float32, Float64, + Bool, +) +const RAND_SCALAR_TYPES_BACKEND = IS_CPU_BACKEND ? + RAND_SCALAR_TYPES_ALL : + (UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Bool) +const RUN_FLOAT16_RAND_TESTS = IS_CPU_BACKEND +const RUN_FLOAT64_RAND_TESTS = IS_CPU_BACKEND + + +_is_unit_interval(v) = all(x -> !isnan(x) && zero(x) <= x < one(x), v) + + +function _rand_fill_reference!( + rng, + x::AbstractArray{T}; + counter_offset::UInt64=UInt64(0), +) where {T <: AK.ALLOWED_RAND_SCALARS} + @inbounds for i in eachindex(x) + x[i] = AK.rand_scalar(rng.seed, rng.alg, counter_offset + UInt64(i - one(i)), T) + end + return x +end + + +function _assert_rand_matches_reference!(rng, x; kwargs...) + counter_offset = rng.offset + AK.rand!(rng, x; kwargs...) + ref = zeros(eltype(x), size(x)) + _rand_fill_reference!(rng, ref; counter_offset) + @test Array(x) == ref + return x +end + + +@testset "rand" begin + @testset "constructors" begin + @test AK.CounterRNG(0x1; alg=AK.SplitMix64()) isa AK.CounterRNG{AK.SplitMix64} + @test AK.CounterRNG(UInt32(0x1); alg=AK.Philox()) isa AK.CounterRNG{AK.Philox} + @test AK.CounterRNG(UInt16(123); alg=AK.Threefry()) isa AK.CounterRNG{AK.Threefry} + @test AK.CounterRNG(UInt32(300)).seed == UInt64(300) + @test AK.CounterRNG(UInt32(300)).offset == UInt64(0) + @test AK.CounterRNG(0x1; offset=17).offset == UInt64(17) + @test AK.CounterRNG(0x1, AK.Philox()).offset == UInt64(0) + @test_throws ArgumentError AK.CounterRNG(-1) + @test_throws ArgumentError AK.CounterRNG(1; offset=-1) + + Random.seed!(0x1234) + expected_seed = Random.rand(Random.default_rng(), UInt64) + Random.seed!(0x1234) + rng_auto = AK.CounterRNG() + @test rng_auto.seed == expected_seed + @test rng_auto.alg isa AK.Philox + @test rng_auto.offset == UInt64(0) + + rng_auto_off = AK.CounterRNG(; offset=42) + @test rng_auto_off.offset == UInt64(42) + + x1 = array_from_host(zeros(Float32, 1024)) + x2 = array_from_host(zeros(Float32, 1024)) + AK.rand!(rng_auto, x1; prefer_threads, block_size=64) + AK.rand!(rng_auto, x2; prefer_threads, block_size=257) + @test rng_auto.offset == UInt64(2048) + @test Array(x1) != Array(x2) + end + + + @testset "counter rng offset behavior" begin + rng_stream = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox(), offset=UInt64(17)) + s1 = array_from_host(zeros(Float32, 100)) + s2 = array_from_host(zeros(Float32, 100)) + s12 = array_from_host(zeros(Float32, 200)) + AK.rand!(rng_stream, s1; prefer_threads, block_size=64) + @test rng_stream.offset == UInt64(117) + AK.rand!(rng_stream, s2; prefer_threads, block_size=64) + @test rng_stream.offset == UInt64(217) + + rng_once = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox(), offset=UInt64(17)) + AK.rand!(rng_once, s12; prefer_threads, block_size=64) + @test vcat(Array(s1), Array(s2)) == Array(s12) + @test rng_once.offset == UInt64(217) + + empty = array_from_host(zeros(Float32, 0)) + stream_offset = rng_stream.offset + AK.rand!(rng_stream, empty; prefer_threads, block_size=64) + @test rng_stream.offset == stream_offset + + @test AK.reset!(rng_stream) === rng_stream + @test rng_stream.offset == UInt64(0) + + y1 = array_from_host(zeros(Float32, 64)) + y2 = array_from_host(zeros(Float32, 64)) + AK.rand!(rng_stream, y1; prefer_threads, block_size=64) + AK.rand!(AK.CounterRNG(UInt64(0x1234); alg=AK.Philox()), y2; prefer_threads, block_size=64) + @test Array(y1) == Array(y2) + + mutable struct DummyRNG + seed::UInt64 + alg::AK.Philox + offset::UInt64 + end + + rng_dummy = DummyRNG(UInt64(0x1234), AK.Philox(), UInt64(0)) + x = array_from_host(zeros(Float32, 16)) + @test_throws MethodError AK.rand!(rng_dummy, x; prefer_threads, block_size=64) + @test_throws MethodError AK.reset!(rng_dummy) + end + + + @testset "reset!" begin + rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox()) + x1 = array_from_host(zeros(Float32, 512)) + x2 = array_from_host(zeros(Float32, 512)) + + AK.rand!(rng, x1; prefer_threads, block_size=64) + @test rng.offset == UInt64(512) + @test AK.reset!(rng) === rng + @test rng.offset == UInt64(0) + AK.rand!(rng, x2; prefer_threads, block_size=64) + + @test Array(x1) == Array(x2) + end + + + @testset "bit helpers" begin + hi = UInt32(0b10101010101010101010101010101010) + lo = UInt32(0b01010101010101010101010101010101) + word = UInt64(hi) << 32 | UInt64(lo) + + @test AK._u32_hi(word) == hi + @test AK._u32_lo(word) == lo + @test AK._u64_from_u32s(lo, hi) == word + @test AK._mulhi_u32(0xffffffff % UInt32, 0xffffffff % UInt32) == 0xfffffffe % UInt32 + @test AK._rotl32(0b10000000000000000000000000000001 % UInt32, UInt32(1)) == 0b11 % UInt32 + @test AK._counter_from_index(1) == UInt64(0) + @test AK._counter_from_index(17) == UInt64(16) + + @test AK._rand_scalar_uint_type(UInt8) === UInt32 + @test AK._rand_scalar_uint_type(UInt16) === UInt32 + @test AK._rand_scalar_uint_type(UInt32) === UInt32 + @test AK._rand_scalar_uint_type(Int8) === UInt32 + @test AK._rand_scalar_uint_type(Int16) === UInt32 + @test AK._rand_scalar_uint_type(Int32) === UInt32 + if RUN_FLOAT16_RAND_TESTS + @test AK._rand_scalar_uint_type(Float16) === UInt32 + end + @test AK._rand_scalar_uint_type(Float32) === UInt32 + @test AK._rand_scalar_uint_type(UInt64) === UInt64 + @test AK._rand_scalar_uint_type(Int64) === UInt64 + @test AK._rand_scalar_uint_type(Bool) === UInt32 + if RUN_FLOAT64_RAND_TESTS + @test AK._rand_scalar_uint_type(Float64) === UInt64 + end + + @test AK._rand_scalar_from_uint(UInt8, UInt32(0xabcdef01)) == UInt8(0xab) + @test AK._rand_scalar_from_uint(UInt16, UInt32(0xabcdef01)) == UInt16(0xabcd) + @test AK._rand_scalar_from_uint(UInt32, 0b1010 % UInt32) == 0b1010 % UInt32 + @test AK._rand_scalar_from_uint(UInt64, 0b1010 % UInt64) == 0b1010 % UInt64 + @test AK._rand_scalar_from_uint(Int8, UInt32(0xff000000)) == Int8(-1) + @test AK._rand_scalar_from_uint(Int16, UInt32(0xffff0000)) == Int16(-1) + @test AK._rand_scalar_from_uint(Int32, 0b11111111111111111111111111111111 % UInt32) == Int32(-1) + @test AK._rand_scalar_from_uint( + Int64, 0b1111111111111111111111111111111111111111111111111111111111111111 % UInt64 + ) == Int64(-1) + if RUN_FLOAT16_RAND_TESTS + @test AK._rand_scalar_from_uint(Float16, UInt32(0)) == Float16(0) + end + @test AK._rand_scalar_from_uint(Bool, UInt32(0)) == false + @test AK._rand_scalar_from_uint(Bool, UInt32(1)) == true + + if RUN_FLOAT16_RAND_TESTS + @test AK.uint32_to_unit_float16(UInt32(0)) == Float16(0) + @test Float16(0) <= AK.uint32_to_unit_float16(typemax(UInt32)) < Float16(1) + end + @test AK.uint32_to_unit_float32(UInt32(0)) == 0.0f0 + @test 0.0f0 <= AK.uint32_to_unit_float32(typemax(UInt32)) < 1.0f0 + if RUN_FLOAT64_RAND_TESTS + @test AK.uint64_to_unit_float64(UInt64(0)) == 0.0 + @test 0.0 <= AK.uint64_to_unit_float64(typemax(UInt64)) < 1.0 + end + end + + + @testset "rand_uint" begin + for alg in RAND_ALGS + rng = AK.CounterRNG(0x123456789abcdef; alg) + for U in (UInt32, UInt64) + @test AK.rand_uint(rng.seed, rng.alg, UInt64(0), U) == AK.rand_uint(rng.seed, rng.alg, UInt64(0), U) + @test AK.rand_uint(rng.seed, rng.alg, UInt64(1), U) != AK.rand_uint(rng.seed, rng.alg, UInt64(0), U) + + vals = [AK.rand_uint(rng.seed, rng.alg, UInt64(i), U) for i in 0:511] + @test length(unique(vals)) > 460 + end + end + + rng_splitmix = AK.CounterRNG(0x31415926; alg=AK.SplitMix64()) + for c in (UInt64(0), UInt64(1), UInt64(17), UInt64(1023)) + @test AK.rand_uint(rng_splitmix.seed, rng_splitmix.alg, c, UInt32) == AK._u32_hi( + AK.rand_uint(rng_splitmix.seed, rng_splitmix.alg, c, UInt64) + ) + end + + for alg in (AK.Philox(), AK.Threefry()) + rng = AK.CounterRNG(0xabcdef1234567890; alg) + for c in (UInt64(0), UInt64(1), UInt64(17), UInt64(1023)) + @test AK._u32_lo(AK.rand_uint(rng.seed, rng.alg, c, UInt64)) == + AK.rand_uint(rng.seed, rng.alg, c, UInt32) + end + end + end + + + @testset "rand_scalar" begin + rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox()) + + for T in RAND_SCALAR_TYPES_BACKEND + s0 = AK.rand_scalar(rng.seed, rng.alg, UInt64(0), T) + s1 = AK.rand_scalar(rng.seed, rng.alg, UInt64(1), T) + @test s0 isa T + @test s1 isa T + @test s0 == AK.rand_scalar(rng.seed, rng.alg, UInt64(0), T) + if !(T in (Bool, Float16, UInt8, UInt16, Int8, Int16)) + @test s0 != s1 + end + if T <: AbstractFloat + @test zero(T) <= s0 < one(T) + @test zero(T) <= s1 < one(T) + end + end + + c = UInt64(42) + @test AK.rand_scalar(rng.seed, rng.alg, c, UInt8) == + trunc(UInt8, AK.rand_uint(rng.seed, rng.alg, c, UInt32) >> 24) + @test AK.rand_scalar(rng.seed, rng.alg, c, UInt16) == + trunc(UInt16, AK.rand_uint(rng.seed, rng.alg, c, UInt32) >> 16) + @test AK.rand_scalar(rng.seed, rng.alg, c, Int8) == + reinterpret(Int8, trunc(UInt8, AK.rand_uint(rng.seed, rng.alg, c, UInt32) >> 24)) + @test AK.rand_scalar(rng.seed, rng.alg, c, Int16) == + reinterpret(Int16, trunc(UInt16, AK.rand_uint(rng.seed, rng.alg, c, UInt32) >> 16)) + @test AK.rand_scalar(rng.seed, rng.alg, c, Int32) == + reinterpret(Int32, AK.rand_uint(rng.seed, rng.alg, c, UInt32)) + @test AK.rand_scalar(rng.seed, rng.alg, c, Int64) == + reinterpret(Int64, AK.rand_uint(rng.seed, rng.alg, c, UInt64)) + if RUN_FLOAT16_RAND_TESTS + @test AK.rand_scalar(rng.seed, rng.alg, c, Float16) == AK.uint32_to_unit_float16( + AK.rand_uint(rng.seed, rng.alg, c, UInt32) + ) + end + @test AK.rand_scalar(rng.seed, rng.alg, c, Float32) == AK.uint32_to_unit_float32( + AK.rand_uint(rng.seed, rng.alg, c, UInt32) + ) + @test AK.rand_scalar(rng.seed, rng.alg, c, Bool) == + isodd(AK.rand_uint(rng.seed, rng.alg, c, UInt32)) + if RUN_FLOAT64_RAND_TESTS + @test AK.rand_scalar(rng.seed, rng.alg, c, Float64) == AK.uint64_to_unit_float64( + AK.rand_uint(rng.seed, rng.alg, c, UInt64) + ) + end + bools = [AK.rand_scalar(rng.seed, rng.alg, UInt64(i), Bool) for i in 0:511] + @test any(identity, bools) + @test any(!, bools) + @test_throws ArgumentError AK.rand_scalar(rng.seed, rng.alg, UInt64(0), UInt128) + end + + + @testset "rand! explicit rng" begin + lengths = (0, 1, 31, 32, 33, 257, 1024) + rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox()) + + for T in RAND_SCALAR_TYPES_BACKEND + for len in lengths + x = array_from_host(zeros(T, len)) + _assert_rand_matches_reference!(rng, x; prefer_threads, block_size=64) + if T <: AbstractFloat + @test _is_unit_interval(Array(x)) + end + end + end + + for T in RAND_SCALAR_TYPES_BACKEND + x1 = array_from_host(zeros(T, 2048)) + x2 = array_from_host(zeros(T, 2048)) + rng1 = AK.CounterRNG(rng.seed; alg=rng.alg) + rng2 = AK.CounterRNG(rng.seed; alg=rng.alg) + AK.rand!(rng1, x1; prefer_threads, block_size=64) + AK.rand!(rng2, x2; prefer_threads, block_size=257) + @test Array(x1) == Array(x2) + end + + for T in RAND_SCALAR_TYPES_BACKEND + rng1 = AK.CounterRNG(rng.seed; alg=rng.alg) + rng2 = AK.CounterRNG(rng.seed + UInt64(1); alg=rng.alg) + x1 = array_from_host(zeros(T, 2048)) + x2 = array_from_host(zeros(T, 2048)) + AK.rand!(rng1, x1; prefer_threads, block_size=64) + AK.rand!(rng2, x2; prefer_threads, block_size=64) + @test Array(x1) != Array(x2) + end + + begin + rng_stream = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox()) + rng_once = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox()) + x1 = array_from_host(zeros(Float32, 100)) + x2 = array_from_host(zeros(Float32, 100)) + x12 = array_from_host(zeros(Float32, 200)) + AK.rand!(rng_stream, x1; prefer_threads, block_size=64) + AK.rand!(rng_stream, x2; prefer_threads, block_size=64) + AK.rand!(rng_once, x12; prefer_threads, block_size=64) + @test vcat(Array(x1), Array(x2)) == Array(x12) + @test rng_stream.offset == UInt64(200) + end + + for T in (RUN_FLOAT16_RAND_TESTS ? (Float16, Float32, UInt64, Bool) : (Float32, UInt64, Bool)) + xnd = array_from_host(zeros(T, 7, 11, 5)) + _assert_rand_matches_reference!(rng, xnd; prefer_threads, block_size=128) + end + + if IS_CPU_BACKEND + for T in RAND_SCALAR_TYPES_BACKEND + base = zeros(T, 64) + view_x = @view base[2:2:end] + AK.rand!( + rng, view_x; + max_tasks=Threads.nthreads(), + min_elems=1, + prefer_threads=true + ) + ref_view = zeros(T, length(view_x)) + _rand_fill_reference!( + rng, ref_view; + counter_offset=rng.offset - UInt64(length(view_x)), + ) + @test collect(view_x) == ref_view + end + end + end + + + @testset "rand! convenience" begin + ref1 = array_from_host(zeros(Float32, 1024)) + ref2 = array_from_host(zeros(Float32, 1024)) + x1 = array_from_host(zeros(Float32, 1024)) + x2 = array_from_host(zeros(Float32, 1024)) + + Random.seed!(0xabcdef) + seed1 = Random.rand(Random.default_rng(), UInt64) + AK.rand!(AK.CounterRNG(seed1; alg=AK.Philox()), ref1; prefer_threads, block_size=64) + seed2 = Random.rand(Random.default_rng(), UInt64) + AK.rand!(AK.CounterRNG(seed2; alg=AK.Philox()), ref2; prefer_threads, block_size=64) + + Random.seed!(0xabcdef) + AK.rand!(x1; prefer_threads, block_size=64) + AK.rand!(x2; prefer_threads, block_size=64) + @test Array(x1) == Array(ref1) + @test Array(x2) == Array(ref2) + + x_bad = zeros(UInt128, 16) + @test_throws ArgumentError AK.rand!(x_bad; prefer_threads) + @test_throws ArgumentError AK.rand!(AK.CounterRNG(0x1), x_bad; prefer_threads) + end + + + @testset "rand allocation convenience" begin + default_alloc_type = IS_CPU_BACKEND ? Float64 : Float32 + + rng = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox()) + y = AK.rand(rng, BACKEND, Float32, Int32(6), UInt16(7); prefer_threads, block_size=64) + @test size(y) == (6, 7) + @test eltype(y) === Float32 + @test _is_unit_interval(Array(y)) + @test rng.offset == UInt64(length(y)) + + rng_default = AK.CounterRNG(UInt64(0x99); alg=AK.Philox()) + rng_default_ref = AK.CounterRNG(UInt64(0x99); alg=AK.Philox()) + y_default = AK.rand(rng_default, BACKEND, 128; prefer_threads, block_size=64) + y_default_ref = AK.rand( + rng_default_ref, BACKEND, default_alloc_type, 128; prefer_threads, block_size=64 + ) + @test eltype(y_default) === default_alloc_type + @test Array(y_default) == Array(y_default_ref) + @test rng_default.offset == rng_default_ref.offset == UInt64(128) + + rng_alloc = AK.CounterRNG(UInt64(0x55); alg=AK.Philox()) + rng_fill = AK.CounterRNG(UInt64(0x55); alg=AK.Philox()) + y_alloc = AK.rand(rng_alloc, BACKEND, Float32, 128; prefer_threads, block_size=64) + y_fill = array_from_host(zeros(Float32, 128)) + AK.rand!(rng_fill, y_fill; prefer_threads, block_size=64) + @test Array(y_alloc) == Array(y_fill) + @test rng_alloc.offset == rng_fill.offset == UInt64(128) + + rng_cpu_default = AK.CounterRNG(UInt64(0x66); alg=AK.Philox()) + rng_cpu_default_ref = AK.CounterRNG(UInt64(0x66); alg=AK.Philox()) + y_cpu_default = AK.rand(rng_cpu_default, 128; prefer_threads, block_size=64) + y_cpu_default_ref = AK.rand( + rng_cpu_default_ref, AK.get_backend([]), 128; prefer_threads, block_size=64 + ) + @test eltype(y_cpu_default) === Float64 + @test Array(y_cpu_default) == Array(y_cpu_default_ref) + @test rng_cpu_default.offset == rng_cpu_default_ref.offset == UInt64(128) + + rng_cpu_typed = AK.CounterRNG(UInt64(0x77); alg=AK.Philox()) + rng_cpu_typed_ref = AK.CounterRNG(UInt64(0x77); alg=AK.Philox()) + y_cpu_typed = AK.rand(rng_cpu_typed, Float32, 128; prefer_threads, block_size=64) + y_cpu_typed_ref = AK.rand( + rng_cpu_typed_ref, AK.get_backend([]), Float32, 128; prefer_threads, block_size=64 + ) + @test eltype(y_cpu_typed) === Float32 + @test Array(y_cpu_typed) == Array(y_cpu_typed_ref) + @test rng_cpu_typed.offset == rng_cpu_typed_ref.offset == UInt64(128) + + # Warm-up first call path so one-time compilation/backend init does not perturb RNG checks. + AK.rand(BACKEND, Float32, 1; prefer_threads, block_size=64) + + # Auto-seeded constructor should match explicit seed capture from default RNG. + Random.seed!(0x9abc) + seed = Random.rand(Random.default_rng(), UInt64) + ref = AK.rand(AK.CounterRNG(seed; alg=AK.Philox()), BACKEND, Float32, 64; prefer_threads, block_size=64) + Random.seed!(0x9abc) + x = AK.rand(BACKEND, Float32, 64; prefer_threads, block_size=64) + @test Array(x) == Array(ref) + + # Auto-seeded convenience without explicit type should use backend-dependent default type. + Random.seed!(0x4242) + seed_default = Random.rand(Random.default_rng(), UInt64) + ref_default = AK.rand( + AK.CounterRNG(seed_default; alg=AK.Philox()), + BACKEND, + default_alloc_type, + 64; + prefer_threads, + block_size=64, + ) + Random.seed!(0x4242) + x_default = AK.rand(BACKEND, 64; prefer_threads, block_size=64) + @test eltype(x_default) === default_alloc_type + @test Array(x_default) == Array(ref_default) + + # Convenience without backend should default to CPU backend and Float64. + Random.seed!(0x4545) + seed_cpu_default = Random.rand(Random.default_rng(), UInt64) + ref_cpu_default = AK.rand( + AK.CounterRNG(seed_cpu_default; alg=AK.Philox()), + AK.get_backend([]), + Float64, + 64; + prefer_threads, + block_size=64, + ) + Random.seed!(0x4545) + x_cpu_default = AK.rand(64; prefer_threads, block_size=64) + @test eltype(x_cpu_default) === Float64 + @test Array(x_cpu_default) == Array(ref_cpu_default) + + # Type-only convenience should default to CPU backend. + Random.seed!(0x5656) + seed_cpu_typed = Random.rand(Random.default_rng(), UInt64) + ref_cpu_typed = AK.rand( + AK.CounterRNG(seed_cpu_typed; alg=AK.Philox()), + AK.get_backend([]), + Float32, + 64; + prefer_threads, + block_size=64, + ) + Random.seed!(0x5656) + x_cpu_typed_no_rng = AK.rand(Float32, 64; prefer_threads, block_size=64) + @test eltype(x_cpu_typed_no_rng) === Float32 + @test Array(x_cpu_typed_no_rng) == Array(ref_cpu_typed) + + # Reseeding should reproduce the same auto-seeded draw. + Random.seed!(0x7777) + x1 = AK.rand(BACKEND, Float32, 64; prefer_threads, block_size=64) + Random.seed!(0x7777) + x2 = AK.rand(BACKEND, Float32, 64; prefer_threads, block_size=64) + @test Array(x1) == Array(x2) + + @test_throws ArgumentError AK.rand(AK.CounterRNG(0x1), BACKEND, UInt128, 16; prefer_threads) + @test_throws MethodError AK.rand(AK.CounterRNG(0x1), BACKEND, Float32, 16; prefer_threads, bad=:kwarg) + @test_throws MethodError AK.rand(BACKEND, Float32, 16; prefer_threads, bad=:kwarg) + @test_throws MethodError AK.rand(BACKEND, 16; prefer_threads, bad=:kwarg) + @test_throws MethodError AK.rand(16; prefer_threads, bad=:kwarg) + @test_throws ArgumentError AK.rand() + end +end diff --git a/test/randn.jl b/test/randn.jl new file mode 100644 index 0000000..2d7b5ff --- /dev/null +++ b/test/randn.jl @@ -0,0 +1,353 @@ +const RANDN_ALGS = (AK.SplitMix64(), AK.Philox(), AK.Threefry()) +const RANDN_FLOAT_TYPES_BACKEND = IS_CPU_BACKEND ? (Float16, Float32, Float64) : (Float32,) +const RANDN_LENGTHS = (0, 1, 2, 31, 32, 33, 257, 1024) + + +_all_finite(v) = all(isfinite, v) +_randn_reference_atol(::Type{Float16}) = 16 * eps(Float16) +_randn_reference_atol(::Type{Float32}) = 64 * eps(Float32) +_randn_reference_atol(::Type{Float64}) = 64 * eps(Float64) + + +function _randn_fill_reference!( + rng, + x::AbstractArray{T}; + counter_offset::UInt64=UInt64(0), +) where {T <: AK.ALLOWED_RANDN_SCALARS} + @inbounds for i in eachindex(x) + x[i] = AK.randn_scalar(rng.seed, rng.alg, counter_offset + UInt64(i - one(i)), T) + end + return x +end + + +function _assert_randn_matches_reference!(rng, x; kwargs...) + counter_offset = rng.offset + AK.randn!(rng, x; kwargs...) + ref = zeros(eltype(x), size(x)) + _randn_fill_reference!(rng, ref; counter_offset) + xa = Array(x) + + if IS_CPU_BACKEND + @test xa == ref + else + # randn uses Box-Muller (`log`, `sqrt`, `sincos`), and GPU libm implementations are not + # bit-identical to CPU scalar libm. Stream/counter mapping is still deterministic, but the + # final Float32 values can differ by a few ULP, so we use a tight absolute tolerance here. + atol = _randn_reference_atol(eltype(xa)) + @test all(isapprox.(xa, ref; rtol=zero(atol), atol)) + end + + return x +end + + +@testset "randn" begin + @testset "scalar helpers" begin + @test 0.0f0 < AK._uint32_to_open_unit_float32_midpoint(UInt32(0)) < 1.0f0 + @test 0.0f0 < AK._uint32_to_open_unit_float32_midpoint(typemax(UInt32)) < 1.0f0 + + if IS_CPU_BACKEND + @test 0.0 < AK._uint64_to_open_unit_float64_midpoint(UInt64(0)) < 1.0 + @test 0.0 < AK._uint64_to_open_unit_float64_midpoint(typemax(UInt64)) < 1.0 + end + + seed = UInt64(0x123456789abcdef) + for alg in RANDN_ALGS + for counter in (UInt64(0), UInt64(1), UInt64(17), UInt64(1023)) + u32 = AK.rand_float_open01(seed, alg, counter, Float32) + @test 0.0f0 < u32 < 1.0f0 + + if IS_CPU_BACKEND + u64 = AK.rand_float_open01(seed, alg, counter, Float64) + @test 0.0 < u64 < 1.0 + end + end + + for T in RANDN_FLOAT_TYPES_BACKEND + s0 = AK.randn_scalar(seed, alg, UInt64(42), T) + s1 = AK.randn_scalar(seed, alg, UInt64(43), T) + + @test s0 isa T + @test s1 isa T + @test isfinite(s0) + @test isfinite(s1) + @test s0 == AK.randn_scalar(seed, alg, UInt64(42), T) + @test s1 == AK.randn_scalar(seed, alg, UInt64(43), T) + + p0, p1 = AK.randn_pair(seed, alg, UInt64(21), T) + @test p0 == AK.randn_scalar(seed, alg, UInt64(42), T) + @test p1 == AK.randn_scalar(seed, alg, UInt64(43), T) + end + end + + @test_throws ArgumentError AK.rand_float_open01(seed, AK.Philox(), UInt64(0), UInt32) + @test_throws ArgumentError AK.randn_scalar(seed, AK.Philox(), UInt64(0), UInt32) + end + + + @testset "randn! explicit rng" begin + for alg in RANDN_ALGS + rng = AK.CounterRNG(0x123456789abcdef; alg) + + for T in RANDN_FLOAT_TYPES_BACKEND + x1 = array_from_host(zeros(T, 2048)) + x2 = array_from_host(zeros(T, 2048)) + rng1 = AK.CounterRNG(rng.seed; alg=rng.alg) + rng2 = AK.CounterRNG(rng.seed; alg=rng.alg) + + AK.randn!(rng1, x1; prefer_threads, block_size=64) + AK.randn!(rng2, x2; prefer_threads, block_size=257) + @test Array(x1) == Array(x2) + end + + for T in RANDN_FLOAT_TYPES_BACKEND + x1 = array_from_host(zeros(T, 2048)) + x2 = array_from_host(zeros(T, 2048)) + rng1 = AK.CounterRNG(rng.seed; alg=rng.alg) + rng2 = AK.CounterRNG(rng.seed + UInt64(1); alg=rng.alg) + + AK.randn!(rng1, x1; prefer_threads, block_size=64) + AK.randn!(rng2, x2; prefer_threads, block_size=64) + @test Array(x1) != Array(x2) + end + end + end + + + @testset "offset and reset semantics" begin + rng_stream = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox(), offset=UInt64(17)) + s1 = array_from_host(zeros(Float32, 99)) + s2 = array_from_host(zeros(Float32, 101)) + s12 = array_from_host(zeros(Float32, 200)) + + AK.randn!(rng_stream, s1; prefer_threads, block_size=64) + @test rng_stream.offset == UInt64(116) + AK.randn!(rng_stream, s2; prefer_threads, block_size=64) + @test rng_stream.offset == UInt64(217) + + rng_once = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox(), offset=UInt64(17)) + AK.randn!(rng_once, s12; prefer_threads, block_size=64) + @test vcat(Array(s1), Array(s2)) == Array(s12) + @test rng_once.offset == UInt64(217) + + empty = array_from_host(zeros(Float32, 0)) + stream_offset = rng_stream.offset + AK.randn!(rng_stream, empty; prefer_threads, block_size=64) + @test rng_stream.offset == stream_offset + + @test AK.reset!(rng_stream) === rng_stream + @test rng_stream.offset == UInt64(0) + + y1 = array_from_host(zeros(Float32, 64)) + y2 = array_from_host(zeros(Float32, 64)) + AK.randn!(rng_stream, y1; prefer_threads, block_size=64) + AK.randn!(AK.CounterRNG(UInt64(0x1234); alg=AK.Philox()), y2; prefer_threads, block_size=64) + @test Array(y1) == Array(y2) + end + + + @testset "shapes and views" begin + rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox()) + + for T in RANDN_FLOAT_TYPES_BACKEND + xnd = array_from_host(zeros(T, 7, 11, 5)) + _assert_randn_matches_reference!(rng, xnd; prefer_threads, block_size=128) + end + + if IS_CPU_BACKEND + for T in RANDN_FLOAT_TYPES_BACKEND + base = zeros(T, 64) + view_x = @view base[2:2:end] + + AK.randn!( + rng, view_x; + max_tasks=Threads.nthreads(), + min_elems=1, + prefer_threads=true, + ) + + ref_view = zeros(T, length(view_x)) + _randn_fill_reference!( + rng, ref_view; + counter_offset=rng.offset - UInt64(length(view_x)), + ) + @test collect(view_x) == ref_view + end + end + end + + + @testset "randn! convenience" begin + ref1 = array_from_host(zeros(Float32, 1024)) + ref2 = array_from_host(zeros(Float32, 1024)) + x1 = array_from_host(zeros(Float32, 1024)) + x2 = array_from_host(zeros(Float32, 1024)) + + Random.seed!(0xabcdef) + seed1 = Random.rand(Random.default_rng(), UInt64) + AK.randn!(AK.CounterRNG(seed1; alg=AK.Philox()), ref1; prefer_threads, block_size=64) + seed2 = Random.rand(Random.default_rng(), UInt64) + AK.randn!(AK.CounterRNG(seed2; alg=AK.Philox()), ref2; prefer_threads, block_size=64) + + Random.seed!(0xabcdef) + AK.randn!(x1; prefer_threads, block_size=64) + AK.randn!(x2; prefer_threads, block_size=64) + @test Array(x1) == Array(ref1) + @test Array(x2) == Array(ref2) + + x_bad = zeros(UInt32, 16) + @test_throws ArgumentError AK.randn!(x_bad; prefer_threads) + @test_throws ArgumentError AK.randn!(AK.CounterRNG(0x1), x_bad; prefer_threads) + end + + + @testset "randn allocation convenience" begin + default_alloc_type = IS_CPU_BACKEND ? Float64 : Float32 + + rng = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox()) + y = AK.randn(rng, BACKEND, Float32, Int32(6), UInt16(7); prefer_threads, block_size=64) + @test size(y) == (6, 7) + @test eltype(y) === Float32 + @test _all_finite(Array(y)) + @test rng.offset == UInt64(length(y)) + + rng_default = AK.CounterRNG(UInt64(0x99); alg=AK.Philox()) + rng_default_ref = AK.CounterRNG(UInt64(0x99); alg=AK.Philox()) + y_default = AK.randn(rng_default, BACKEND, 128; prefer_threads, block_size=64) + y_default_ref = AK.randn( + rng_default_ref, BACKEND, default_alloc_type, 128; prefer_threads, block_size=64 + ) + @test eltype(y_default) === default_alloc_type + @test Array(y_default) == Array(y_default_ref) + @test rng_default.offset == rng_default_ref.offset == UInt64(128) + + rng_alloc = AK.CounterRNG(UInt64(0x55); alg=AK.Philox()) + rng_fill = AK.CounterRNG(UInt64(0x55); alg=AK.Philox()) + y_alloc = AK.randn(rng_alloc, BACKEND, Float32, 128; prefer_threads, block_size=64) + y_fill = array_from_host(zeros(Float32, 128)) + AK.randn!(rng_fill, y_fill; prefer_threads, block_size=64) + @test Array(y_alloc) == Array(y_fill) + @test rng_alloc.offset == rng_fill.offset == UInt64(128) + + rng_cpu_default = AK.CounterRNG(UInt64(0x66); alg=AK.Philox()) + rng_cpu_default_ref = AK.CounterRNG(UInt64(0x66); alg=AK.Philox()) + y_cpu_default = AK.randn(rng_cpu_default, 128; prefer_threads, block_size=64) + y_cpu_default_ref = AK.randn( + rng_cpu_default_ref, AK.get_backend([]), 128; prefer_threads, block_size=64 + ) + @test eltype(y_cpu_default) === Float64 + @test Array(y_cpu_default) == Array(y_cpu_default_ref) + @test rng_cpu_default.offset == rng_cpu_default_ref.offset == UInt64(128) + + rng_cpu_typed = AK.CounterRNG(UInt64(0x77); alg=AK.Philox()) + rng_cpu_typed_ref = AK.CounterRNG(UInt64(0x77); alg=AK.Philox()) + y_cpu_typed = AK.randn(rng_cpu_typed, Float32, 128; prefer_threads, block_size=64) + y_cpu_typed_ref = AK.randn( + rng_cpu_typed_ref, AK.get_backend([]), Float32, 128; prefer_threads, block_size=64 + ) + @test eltype(y_cpu_typed) === Float32 + @test Array(y_cpu_typed) == Array(y_cpu_typed_ref) + @test rng_cpu_typed.offset == rng_cpu_typed_ref.offset == UInt64(128) + + # Warm-up first call path so one-time compilation/backend init does not perturb RNG checks. + AK.randn(BACKEND, Float32, 1; prefer_threads, block_size=64) + + # Auto-seeded constructor should match explicit seed capture from default RNG. + Random.seed!(0x9abc) + seed = Random.rand(Random.default_rng(), UInt64) + ref = AK.randn(AK.CounterRNG( + seed; alg=AK.Philox()), BACKEND, Float32, 64; prefer_threads, block_size=64 + ) + Random.seed!(0x9abc) + x = AK.randn(BACKEND, Float32, 64; prefer_threads, block_size=64) + @test Array(x) == Array(ref) + + # Auto-seeded convenience without explicit type should use backend-dependent default type. + Random.seed!(0x4242) + seed_default = Random.rand(Random.default_rng(), UInt64) + ref_default = AK.randn( + AK.CounterRNG(seed_default; alg=AK.Philox()), + BACKEND, + default_alloc_type, + 64; + prefer_threads, + block_size=64, + ) + Random.seed!(0x4242) + x_default = AK.randn(BACKEND, 64; prefer_threads, block_size=64) + @test eltype(x_default) === default_alloc_type + @test Array(x_default) == Array(ref_default) + + # Convenience without backend should default to CPU backend and Float64. + Random.seed!(0x4545) + seed_cpu_default = Random.rand(Random.default_rng(), UInt64) + ref_cpu_default = AK.randn( + AK.CounterRNG(seed_cpu_default; alg=AK.Philox()), + AK.get_backend([]), + Float64, + 64; + prefer_threads, + block_size=64, + ) + Random.seed!(0x4545) + x_cpu_default = AK.randn(64; prefer_threads, block_size=64) + @test eltype(x_cpu_default) === Float64 + @test Array(x_cpu_default) == Array(ref_cpu_default) + + # Type-only convenience should default to CPU backend. + Random.seed!(0x5656) + seed_cpu_typed = Random.rand(Random.default_rng(), UInt64) + ref_cpu_typed = AK.randn( + AK.CounterRNG(seed_cpu_typed; alg=AK.Philox()), + AK.get_backend([]), + Float32, + 64; + prefer_threads, + block_size=64, + ) + Random.seed!(0x5656) + x_cpu_typed_no_rng = AK.randn(Float32, 64; prefer_threads, block_size=64) + @test eltype(x_cpu_typed_no_rng) === Float32 + @test Array(x_cpu_typed_no_rng) == Array(ref_cpu_typed) + + # Reseeding should reproduce the same auto-seeded draw. + Random.seed!(0x7777) + x1 = AK.randn(BACKEND, Float32, 64; prefer_threads, block_size=64) + Random.seed!(0x7777) + x2 = AK.randn(BACKEND, Float32, 64; prefer_threads, block_size=64) + @test Array(x1) == Array(x2) + + @test_throws ArgumentError AK.randn(AK.CounterRNG(0x1), BACKEND, UInt32, 16; prefer_threads) + @test_throws MethodError AK.randn( + AK.CounterRNG(0x1), BACKEND, Float32, 16; prefer_threads, bad=:kwarg + ) + @test_throws MethodError AK.randn(BACKEND, Float32, 16; prefer_threads, bad=:kwarg) + @test_throws MethodError AK.randn(BACKEND, 16; prefer_threads, bad=:kwarg) + @test_throws MethodError AK.randn(16; prefer_threads, bad=:kwarg) + @test_throws ArgumentError AK.randn() + end + + + @testset "moments sanity" begin + n = 200_000 + rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox()) + + for T in RANDN_FLOAT_TYPES_BACKEND + x = array_from_host(zeros(T, n)) + AK.randn!(rng, x; prefer_threads, block_size=128) + xa = Float64.(Array(x)) + + m = sum(xa) / length(xa) + v = sum((xi - m)^2 for xi in xa) / length(xa) + + if T === Float16 + @test abs(m) < 0.1 + @test abs(v - one(v)) < 0.15 + else + @test abs(m) < 0.01 + @test abs(v - one(v)) < 0.03 + end + end + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 716fd8e..707afee 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -69,6 +69,8 @@ end include("partition.jl") include("looping.jl") include("map.jl") +include("rand.jl") +include("randn.jl") include("sort.jl") include("reduce.jl") include("accumulate.jl")