Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
20eea2c
on-device rand! with three options for stateless counter-based RNG
fjbarter Mar 16, 2026
3c3e81a
add convenience for AK.rand!(X), and single UInt64 seed construction …
fjbarter Mar 16, 2026
f590d5e
just use foreachindex for the kernel now that it is simple enough
fjbarter Mar 16, 2026
9e471d6
finalise tests and docs
fjbarter Mar 16, 2026
65caf10
ensure deteminism in convenience tests by avoiding race causing inter…
fjbarter Mar 17, 2026
3d1e0c0
fix Random dep for use with julia v1.10
fjbarter Mar 17, 2026
48cd247
add Bool scalar support with isodd() on a UInt32
fjbarter Mar 17, 2026
68e03c3
add support for Float16, UInt8, UInt16, Int8, Int16 (was bored)
fjbarter Mar 17, 2026
8677d84
Add more faithful seed-key mapping for Philox, and improve thoroughne…
fjbarter Mar 18, 2026
d687e2a
streaming rng by now including an offset, incremented by length(x) wh…
fjbarter Mar 23, 2026
71a62d4
style update + remove AbstractCounterRNG type in favour of purely all…
fjbarter Mar 24, 2026
42a434e
update tests and docs to match new CounterRNG interface
fjbarter Mar 24, 2026
a9582a0
initial randn!
fjbarter Mar 25, 2026
61aa11d
ensure compile-time initial index bias, now beating CUDA for both odd…
fjbarter Mar 26, 2026
9abe388
attempt to fix oneAPI test precompilation hang with Julia v1.10 by di…
fjbarter Mar 26, 2026
841bafb
fix Threefry UInt32 device arithmetic to avoid breaking on Metal
fjbarter Mar 26, 2026
39e089f
expand rand/randn convenience APIs and align docs/tests. Move randn o…
fjbarter Mar 27, 2026
9595086
document rand_uint and CounterRNGAlgorithm properly, improve RNG docs…
fjbarter Mar 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 6 additions & 7 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -88,14 +88,13 @@ steps:
- JuliaCI/julia#v1:
version: "1.10"
command: |
julia -e 'using Pkg
julia --pkgimages=no -e 'using Pkg

println("--- :julia: Instantiating environment")
Pkg.add("oneAPI")
Pkg.develop(path=".")

println("+++ :julia: Running tests")
Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])'
println("--- :julia: Instantiating environment")
Pkg.add("oneAPI")
Pkg.develop(path=".")
println("+++ :julia: Running tests")
Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])'
agents:
queue: "juliagpu"
intel: "*"
Expand Down
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197"
GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f"

[weakdeps]
Expand All @@ -21,6 +22,7 @@ ArgCheck = "2"
GPUArraysCore = "0.2.0"
KernelAbstractions = "0.9.34"
Markdown = "1"
Random = "1"
UnsafeAtomics = "0.3.0"
julia = "1.10"
oneAPI = "1, 2"
1 change: 1 addition & 0 deletions docs/make.jl
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ makedocs(;
"Using Different Backends" => "api/using_backends.md",
"General Loops" => "api/foreachindex.md",
"Map" => "api/map.md",
"Random Number Generation" => "api/rand.md",
"Sorting" => "api/sort.md",
"Reduce" => "api/reduce.md",
"MapReduce" => "api/mapreduce.md",
Expand Down
180 changes: 180 additions & 0 deletions docs/src/api/rand.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
### Random Number Generation

Counter-based random generation for CPU and GPU backends with deterministic stream behaviour for a
fixed `seed`, algorithm, and call sequence.

Both in-place and allocation forms are supported:
- Uniform: `AK.rand!`, `AK.rand`
- Standard normal: `AK.randn!`, `AK.randn`

`CounterRNG` stores:
- `seed::UInt64`
- algorithm `alg`
- stream `offset::UInt64`

The offset starts at `0` and advances by the number of generated values after each call. For
`AK.rand!(rng, v)` and `AK.randn!(rng, v)`, element `v[i]` is generated from logical counter
`rng.offset + UInt64(i - 1)` in linear indexing order.

This gives stream-consistent chunking:
- filling `100` then `100` elements yields the same `200` values as one `200`-element fill.
- `AK.reset!(rng)` rewinds `rng.offset` to `0x0`.

Calls that share the same `CounterRNG` instance concurrently are not thread-safe and may race on
`offset`.

`AK.rand!` and `AK.randn!` accept `rng::AK.CounterRNG`. Passing other RNG container types is not
supported and will throw a `MethodError`.

#### Auto-seeded convenience behaviour

Use an explicit `CounterRNG` when reproducibility is required.

For convenience, calls without an explicit `rng` construct a fresh `CounterRNG()` on each call,
seeded by a single `Base.rand(UInt64)` draw from Julia's global RNG. Repeated bare calls therefore
intentionally produce different outputs, unless the global RNG is reseeded with the same value
(e.g. `Random.seed!(1234)`) before each call.

Examples:
- `AK.rand!(v)`
- `AK.randn!(v)`
- `AK.rand(backend, args...)`
- `AK.randn(backend, args...)`

These do **not** continue a shared stream across calls unless you pass the same explicit
`CounterRNG`.

#### Allocation forms

Canonical forms:
- `AK.rand(rng, backend, T, dims...)`
- `AK.randn(rng, backend, T, dims...)`

Shared defaults:
- omit `rng` -> fresh `CounterRNG()`
- omit `backend` -> CPU backend
- omit `T` -> `Float64` on CPU backend, `Float32` otherwise

Common shorthands include:
- `AK.rand(dims...)`
- `AK.rand(T, dims...)`
- `AK.rand(backend, dims...)`
- and the corresponding `AK.randn(...)` variants

For explicit `rng`, both `AK.rand` and `AK.randn` advance `rng.offset` by the number of generated
elements, i.e. `prod(dims)`.

#### Supported element types

`AK.rand!` / `AK.rand` support:
- `UInt8`, `UInt16`, `UInt32`, `UInt64`
- `Int8`, `Int16`, `Int32`, `Int64`
- `Float16`, `Float32`, `Float64`
- `Bool`

`AK.randn!` / `AK.randn` currently support:
- `Float16`, `Float32`, `Float64`

#### Value generation semantics

The core generator produces either a `UInt32` or `UInt64`, depending on the requested output type.
That raw unsigned value is then mapped as follows:
- Unsigned integers: returned directly, or truncated if narrower
- Signed integers: the corresponding unsigned bit pattern reinterpreted as signed, then truncated if narrower
- Floats: mantissa construction onto a uniform grid in `[0, 1)` ([read more](https://lomont.org/posts/2017/unit-random/))
- Bool: `true` if the raw `UInt` draw is odd (`isodd(u)`), otherwise `false`

`AK.randn!` uses Box-Muller with midpoint-mapped open-interval uniforms in `(0, 1)`.

#### Algorithms currently available

- `SplitMix64` ([read more](https://rosettacode.org/wiki/Pseudo-random_numbers/Splitmix64))
- `Philox` ([read more](https://www.thesalmons.org/john/random123/papers/random123sc11.pdf))
- `Threefry` ([read more](https://www.thesalmons.org/john/random123/papers/random123sc11.pdf))

`Philox` is the default algorithm for `CounterRNG()`.

#### Statistical testing and security

- In this repository, `SplitMix64`, `Philox`, and `Threefry` have passed TestU01 BigCrush
- These generators are not intended to be cryptographically secure

#### Philox keying note

AK uses `Philox2x32` internally, which has a single 32-bit Philox key word.

Users may pass any non-negative `Integer` seed with `seed <= typemax(UInt64)`; AK converts it to
`UInt64` and derives the 32-bit Philox key using SplitMix. This wrapper choice is deliberate for
ease of use and deterministic streams, not a change to the Philox round function itself.

Therefore, AK Philox streams are deterministic and high-quality, but are not guaranteed to be
bit-for-bit identical to a raw Random123 Philox stream unless the same seed-to-key mapping and
counter convention are used.

#### Custom algorithms

To define a custom counter RNG:
- define an algorithm type `MyAlg <: AK.CounterRNGAlgorithm`
- implement:
  - `AK.rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt32})::UInt32`
  - `AK.rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt64})::UInt64`

Then use it via:
- `AK.CounterRNG(seed; alg=MyAlg(), offset=...)`

Both widths should be implemented so `AK.rand!` supports all integer and floating-point output
types without fallback or error.

#### Performance note

`Philox` is the default because it is high-quality and very fast. `AK.rand!` has been measured at
roughly memory-bound throughput (~390 GB/s) on an Nvidia GeForce RTX 5060, including slightly better
performance than CURAND for large `CuArray{Float32}` fills and substantially faster `CuArray{Int32}`
filling than native `CUDA.rand!` in the benchmarks used for this repository.

Examples:
```julia
import AcceleratedKernels as AK
using oneAPI
using AMDGPU

# Reproducible
rng = AK.CounterRNG(0x12345678; alg=AK.Philox())
v = oneArray{Float32}(undef, 1024)
AK.rand!(rng, v)

# Stream-consistent chunking
rng = AK.CounterRNG(0x12345678; alg=AK.Philox())
v1 = oneArray{Float32}(undef, 100)
v2 = oneArray{Float32}(undef, 100)
AK.rand!(rng, v1)
AK.rand!(rng, v2)

# Convenience: fresh auto-seeded RNG on each call
y = oneArray{Float32}(undef, 1024)
AK.rand!(y)

# Allocation form
y_cpu_auto = AK.rand(1024) # CPU, Vector{Float64}
y_one = AK.rand(oneAPIBackend(), Float32, 1024) # fresh RNG, allocate + fill oneArray
y_cpu_typed = AK.rand(rng, Float16, 1024) # CPU backend, explicit type, explicit RNG

# Standard normal filling
z = ROCArray{Float32}(undef, 1024)
AK.randn!(rng, z)

# Standard normal allocation form
z_cpu_auto = AK.randn(1024) # CPU, Vector{Float64}
z_roc = AK.randn(ROCBackend(), 1024) # fresh RNG, allocate + fill ROCArray{Float32}
z_cpu_typed = AK.randn(rng, Float16, 1024) # CPU backend, explicit type, explicit RNG
```

```@docs
AcceleratedKernels.CounterRNG
AcceleratedKernels.CounterRNGAlgorithm
AcceleratedKernels.rand_uint
AcceleratedKernels.reset!
AcceleratedKernels.rand!
AcceleratedKernels.rand
AcceleratedKernels.randn!
AcceleratedKernels.randn
```
3 changes: 3 additions & 0 deletions prototype/RNGTest/Project.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[deps]
AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
RNGTest = "97cc5700-e6cb-5ca1-8fb2-7f6b45264ecd"
33 changes: 33 additions & 0 deletions prototype/RNGTest/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# AK + RNGTest SmallCrush / BigCrush Prototype

This folder provides a chunked random stream generator based on `AcceleratedKernels.jl` that can be fed into `RNGTest.jl`.

The stream is deterministic and effectively unbounded:
- each refill generates `chunk` random `UInt64` values with `AK.rand!`
- each refill advances one persistent `CounterRNG` stream offset
- this is a practical chunked stream for RNGTest callback mode

`RNGTest.jl` (in this local checkout) expects a callback returning `Float64` in `[0,1]`, so `UInt64` words are mapped to `Float64` via top-53-bit scaling.

Current status in this harness: `SplitMix64`, `Philox`, and `Threefry` all pass BigCrush using `run_bigcrush.jl`.

## Run SmallCrush

From this directory:

```powershell
julia --project=. run_smallcrush.jl
```

## Run BigCrush

```powershell
julia --project=. run_bigcrush.jl
```

Notes:
- Configure `ALG`, `SEED`, and `CHUNK` at the top of
`run_smallcrush.jl` / `run_bigcrush.jl`.
- The stream refills directly into host scratch using `AK.rand!` on CPU.
- `chunk` controls refill amortization and memory usage.
- `chunk=100000000` means ~800 MB host scratch (`UInt64`).
26 changes: 26 additions & 0 deletions prototype/RNGTest/run_bigcrush.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
using RNGTest

include("stream.jl")


# Harness configuration for the BigCrush run.
# ALG is the algorithm symbol understood by `make_rng` in stream.jl
# (:philox, :threefry or :splitmix64); SEED seeds the CounterRNG; CHUNK is the
# number of UInt64 values produced per refill, i.e. the scratch buffer length.
const ALG = :philox
const SEED = 0x1234
const CHUNK = 10_000_000
# Host buffer that every refill overwrites; allocated once and reused.
const HOST_SCRATCH = Vector{UInt64}(undef, CHUNK)


# Build the chunked stream with its RNG offset starting at zero.
stream = AKUInt64Stream(
HOST_SCRATCH;
seed=SEED,
alg=ALG,
start_counter=UInt64(0),
)
# Zero-argument closure yielding one Float64 per call; the first chunk is
# filled inside make_rngtest_generator! before the closure is returned.
gen = make_rngtest_generator!(stream)
# Label handed to RNGTest together with the generator.
genname = "AK_Vector_$(ALG)_seed$(SEED)"

println("Beginning BigCrush. This may take hours...")

RNGTest.bigcrushTestU01(gen, genname)

println("refills: ", stream.refill_count)
# All chunks but the current one are fully consumed; `idx - 1` values have been
# read from the current chunk.
println("numbers consumed (approx): ", (stream.refill_count - 1) * stream.chunk + (stream.idx - 1))
23 changes: 23 additions & 0 deletions prototype/RNGTest/run_smallcrush.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
using RNGTest

include("stream.jl")


# Harness configuration for the SmallCrush run.
# ALG is the algorithm symbol understood by `make_rng` in stream.jl
# (:philox, :threefry or :splitmix64); SEED seeds the CounterRNG; CHUNK is the
# number of UInt64 values produced per refill, i.e. the scratch buffer length
# (8 bytes per element of host memory).
const ALG = :philox
const SEED = 0x1234
const CHUNK = 100_000_000
# Host buffer that every refill overwrites; allocated once and reused.
const HOST_SCRATCH = Vector{UInt64}(undef, CHUNK)


# Build the chunked stream with its RNG offset starting at zero.
stream = AKUInt64Stream(
HOST_SCRATCH;
seed=SEED,
alg=ALG,
start_counter=UInt64(0),
)
# Zero-argument closure yielding one Float64 per call; the first chunk is
# filled inside make_rngtest_generator! before the closure is returned.
gen = make_rngtest_generator!(stream)
# Label handed to RNGTest together with the generator.
genname = "AK_Vector_$(ALG)_seed$(SEED)"

println("Beginning SmallCrush...")

RNGTest.smallcrushTestU01(gen, genname)
80 changes: 80 additions & 0 deletions prototype/RNGTest/stream.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import AcceleratedKernels as AK


"""
    make_rng(seed::Integer, alg::Symbol; offset::Integer=0)

Construct an `AK.CounterRNG` from `seed`, using the algorithm selected by the
symbol `alg` (`:philox`, `:threefry` or `:splitmix64`) and the given stream
`offset`.

Throws an `ArgumentError` when `alg` is any other symbol.
"""
function make_rng(seed::Integer, alg::Symbol; offset::Integer=0)
    # Resolve the symbol to an algorithm instance first, then construct once.
    algorithm = if alg === :philox
        AK.Philox()
    elseif alg === :threefry
        AK.Threefry()
    elseif alg === :splitmix64
        AK.SplitMix64()
    else
        throw(ArgumentError("alg must be :philox, :threefry, or :splitmix64; got $alg"))
    end
    return AK.CounterRNG(seed; alg=algorithm, offset=offset)
end


"""
    AKUInt64Stream{R}

Mutable cursor over a reusable buffer of random `UInt64` words. The buffer is
regenerated from `rng` each time it has been read to the end.
"""
mutable struct AKUInt64Stream{R}
# Generator handed to `AK.rand!` on every refill.
rng::R
# Number of values per refill; the constructor sets it to `length(host_scratch)`.
chunk::Int
# Index in `host_scratch` of the next value to hand out; `idx > chunk` means
# the buffer is exhausted and must be refilled before the next read.
idx::Int
# Buffer overwritten in place by each refill.
host_scratch::Vector{UInt64}
# Number of refills performed so far.
refill_count::Int
end


"""
    AKUInt64Stream(host_scratch; seed=0x1234, alg=:philox, start_counter=0x0)

Create a stream that reads random words out of `host_scratch`, with an RNG built
by `make_rng(seed, alg; offset=start_counter)`.

The buffer is not filled here: the read index starts one past the end, so the
first read (or `make_rngtest_generator!`) triggers the initial refill.

Throws an `ArgumentError` if `host_scratch` is empty.
"""
function AKUInt64Stream(
    host_scratch::Vector{UInt64};
    seed::Integer=0x1234,
    alg::Symbol=:philox,
    start_counter::UInt64=0x0000000000000000,
)
    n = length(host_scratch)
    if n <= 0
        throw(ArgumentError("host_scratch must be non-empty"))
    end

    counter_rng = make_rng(seed, alg; offset=start_counter)
    # One past the end marks the buffer as exhausted.
    exhausted = n + 1
    return AKUInt64Stream(counter_rng, n, exhausted, host_scratch, 0)
end


"""
    _u01_from_u64(u::UInt64) -> Float64

Map a 64-bit word to a `Float64` in `[0, 1)` by keeping its top 53 bits and
scaling them by `2^-53`.
"""
@inline function _u01_from_u64(u::UInt64)::Float64
    top53 = u >>> 11
    # Values below 2^53 convert to Float64 exactly, and ldexp by a power of two
    # is exact, so this equals multiplying by 0x1.0p-53.
    return ldexp(Float64(top53), -53)
end


"""
    _fill_chunk!(s::AKUInt64Stream)

Overwrite `s.host_scratch` by calling `AK.rand!` with the stream's RNG.
Returns `nothing`.
"""
function _fill_chunk!(s::AKUInt64Stream)
    buffer = s.host_scratch
    AK.rand!(s.rng, buffer)
    return nothing
end


"""
    refill!(s::AKUInt64Stream)

Regenerate the scratch buffer, rewind the read index to the first element and
count the refill. Returns `s`.
"""
function refill!(s::AKUInt64Stream)
    # Generate first, so the bookkeeping below only changes after a successful fill.
    _fill_chunk!(s)
    s.refill_count = s.refill_count + 1
    s.idx = 1
    return s
end


"""
    next_u64!(s::AKUInt64Stream) -> UInt64

Return the next word from the stream and advance the read index, refilling the
buffer first when it has been read to the end.
"""
function next_u64!(s::AKUInt64Stream)::UInt64
    s.idx <= s.chunk || refill!(s)
    pos = s.idx
    # After the guard above, `pos` lies in 1:s.chunk.
    word = @inbounds s.host_scratch[pos]
    s.idx = pos + 1
    return word
end


"""
    next_float64!(s::AKUInt64Stream) -> Float64

Draw the next 64-bit word from `s` and convert it with `_u01_from_u64`.
"""
@inline function next_float64!(s::AKUInt64Stream)::Float64
    word = next_u64!(s)
    return _u01_from_u64(word)
end


"""
    make_rngtest_generator!(s::AKUInt64Stream)

Return a zero-argument closure that yields `next_float64!(s)` on each call.
If the buffer of `s` is currently exhausted, it is refilled before the closure
is returned.
"""
function make_rngtest_generator!(s::AKUInt64Stream)
    buffer_exhausted = s.idx > s.chunk
    if buffer_exhausted
        refill!(s)
    end
    return () -> next_float64!(s)
end
Loading