diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 43cfba3..c0e5830 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -88,14 +88,13 @@ steps:
       - JuliaCI/julia#v1:
           version: "1.10"
     command: |
-      julia -e 'using Pkg
+      julia --pkgimages=no -e 'using Pkg
 
-                println("--- :julia: Instantiating environment")
-                Pkg.add("oneAPI")
-                Pkg.develop(path=".")
-
-                println("+++ :julia: Running tests")
-                Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])'
+                               println("--- :julia: Instantiating environment")
+                               Pkg.add("oneAPI")
+                               Pkg.develop(path=".") 
+                               println("+++ :julia: Running tests")
+                               Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])'
     agents:
       queue: "juliagpu"
       intel: "*"
diff --git a/Project.toml b/Project.toml
index 2fccea8..b0c601d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -8,6 +8,7 @@ ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f"
 
 [weakdeps]
@@ -21,6 +22,7 @@ ArgCheck = "2"
 GPUArraysCore = "0.2.0"
 KernelAbstractions = "0.9.34"
 Markdown = "1"
+Random = "1"
 UnsafeAtomics = "0.3.0"
 julia = "1.10"
 oneAPI = "1, 2"
diff --git a/docs/make.jl b/docs/make.jl
index 58b4632..5e794a2 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -23,6 +23,7 @@ makedocs(;
             "Using Different Backends" => "api/using_backends.md",
             "General Loops" => "api/foreachindex.md",
             "Map" => "api/map.md",
+            "Random Number Generation" => "api/rand.md",
             "Sorting" => "api/sort.md",
             "Reduce" => "api/reduce.md",
             "MapReduce" => "api/mapreduce.md",
diff --git a/docs/src/api/rand.md b/docs/src/api/rand.md
new file mode 100644
index 0000000..381a797
--- /dev/null
+++ b/docs/src/api/rand.md
@@ -0,0 +1,180 @@
+### Random Number Generation
+
+Counter-based random generation for CPU and GPU backends with deterministic stream behaviour for a
+fixed `seed`, algorithm, and call sequence.
+
+Both in-place and allocation forms are supported:
+- Uniform: `AK.rand!`, `AK.rand`
+- Standard normal: `AK.randn!`, `AK.randn`
+
+`CounterRNG` stores:
+- `seed::UInt64`
+- algorithm `alg`
+- stream `offset::UInt64`
+
+The offset starts at `0` and advances by the number of generated values after each call. For
+`AK.rand!(rng, v)` and `AK.randn!(rng, v)`, element `v[i]` is generated from logical counter
+`rng.offset + UInt64(i - 1)` in linear indexing order.
+
+This gives stream-consistent chunking:
+- filling `100` then `100` elements yields the same `200` values as one `200`-element fill.
+- `AK.reset!(rng)` rewinds `rng.offset` to `0x0`.
+
+Calls that share the same `CounterRNG` instance concurrently are not thread-safe and may race on
+`offset`.
+
+`AK.rand!` and `AK.randn!` accept `rng::AK.CounterRNG`. Passing other RNG container types is not
+supported and will throw a `MethodError`.
+
+#### Auto-seeded convenience behaviour
+
+Use an explicit `CounterRNG` when reproducibility is required.
+
+For convenience, calls without an explicit `rng` construct a fresh `CounterRNG()` on each call,
+using one auto-seeded `Base.rand(UInt64)` draw. Therefore repeated bare calls intentionally produce
+different outputs unless `Random.seed!()` is used first.
+
+Examples:
+- `AK.rand!(v)`
+- `AK.randn!(v)`
+- `AK.rand(backend, args...)`
+- `AK.randn(backend, args...)`
+
+These do **not** continue a shared stream across calls unless you pass the same explicit
+`CounterRNG`.
+
+#### Allocation forms
+
+Canonical forms:
+- `AK.rand(rng, backend, T, dims...)`
+- `AK.randn(rng, backend, T, dims...)`
+
+Shared defaults:
+- omit `rng` -> fresh `CounterRNG()`
+- omit `backend` -> CPU backend
+- omit `T` -> `Float64` on CPU backend, `Float32` otherwise
+
+Common shorthands include:
+- `AK.rand(dims...)`
+- `AK.rand(T, dims...)`
+- `AK.rand(backend, dims...)`
+- and the corresponding `AK.randn(...)` variants
+
+For explicit `rng`, both `AK.rand` and `AK.randn` advance `rng.offset` by the number of generated
+elements, i.e. `prod(dims)`.
+
+#### Supported element types
+
+`AK.rand!` / `AK.rand` support:
+- `UInt8`, `UInt16`, `UInt32`, `UInt64`
+- `Int8`, `Int16`, `Int32`, `Int64`
+- `Float16`, `Float32`, `Float64`
+- `Bool`
+
+`AK.randn!` / `AK.randn` currently support:
+- `Float16`, `Float32`, `Float64`
+
+#### Value generation semantics
+
+The core generator produces either a `UInt32` or `UInt64`, depending on the requested output type.
+That raw unsigned value is then mapped as follows:
+- Unsigned integers: returned directly, or truncated if narrower
+- Signed integers: the corresponding unsigned bit pattern reinterpreted as signed, then truncated if narrower
+- Floats: mantissa construction onto a uniform grid in `[0, 1)` ([read more](https://lomont.org/posts/2017/unit-random/))
+- Bool: `true` if the raw `UInt` draw is odd (`isodd(u)`), otherwise `false`
+
+`AK.randn!` uses Box-Muller with midpoint-mapped open-interval uniforms in `(0, 1)`.
+
+#### Algorithms currently available
+
+- `SplitMix64` ([read more](https://rosettacode.org/wiki/Pseudo-random_numbers/Splitmix64))
+- `Philox` ([read more](https://www.thesalmons.org/john/random123/papers/random123sc11.pdf))
+- `Threefry` ([read more](https://www.thesalmons.org/john/random123/papers/random123sc11.pdf))
+
+`Philox` is the default algorithm for `CounterRNG()`.
+
+#### Statistical testing and security
+
+- In this repository, `SplitMix64`, `Philox`, and `Threefry` have passed TestU01 BigCrush
+- These generators are not intended to be cryptographically secure
+
+#### Philox keying note
+
+AK uses `Philox2x32` internally, which has a single 32-bit Philox key word.
+
+Users may pass any non-negative `Integer` seed with `seed <= typemax(UInt64)`; AK converts it to
+`UInt64` and derives the 32-bit Philox key using SplitMix. This wrapper choice is deliberate for
+ease of use and deterministic streams, not a change to the Philox round function itself.
+
+Therefore, AK Philox streams are deterministic and high-quality, but are not guaranteed to be
+bit-for-bit identical to a raw Random123 Philox stream unless the same seed-to-key mapping and
+counter convention are used.
+
+#### Custom algorithms
+
+To define a custom counter RNG:
+- define an algorithm type `MyAlg <: AK.CounterRNGAlgorithm`
+- implement:
+  - `AK.rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt32})::UInt32`
+  - `AK.rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt64})::UInt64`
+
+Then use it via:
+- `AK.CounterRNG(seed; alg=MyAlg(), offset=...)`
+
+Both widths should be implemented so `AK.rand!` supports all integer and floating-point output
+types without fallback or error.
+
+#### Performance note
+
+`Philox` is the default because it is high-quality and very fast. `AK.rand!` has been measured at
+roughly memory-bound throughput (~390 GB/s) on an Nvidia GeForce RTX 5060, including slightly better
+performance than CURAND for large `CuArray{Float32}` fills and substantially faster `CuArray{Int32}`
+filling than native `CUDA.rand!` in the benchmarks used for this repository.
+
+Examples:
+```julia
+import AcceleratedKernels as AK
+using oneAPI
+using AMDGPU
+
+# Reproducible
+rng = AK.CounterRNG(0x12345678; alg=AK.Philox())
+v = oneArray{Float32}(undef, 1024)
+AK.rand!(rng, v)
+
+# Stream-consistent chunking
+rng = AK.CounterRNG(0x12345678; alg=AK.Philox())
+v1 = oneArray{Float32}(undef, 100)
+v2 = oneArray{Float32}(undef, 100)
+AK.rand!(rng, v1)
+AK.rand!(rng, v2)
+
+# Convenience: fresh auto-seeded RNG on each call
+y = oneArray{Float32}(undef, 1024)
+AK.rand!(y)
+
+# Allocation form
+y_cpu_auto = AK.rand(1024)                            # CPU, Vector{Float64}
+y_one = AK.rand(oneAPIBackend(), Float32, 1024)       # fresh RNG, allocate + fill oneArray
+y_cpu_typed = AK.rand(rng, Float16, 1024)             # CPU backend, explicit type, explicit RNG
+
+# Standard normal filling
+z = ROCArray{Float32}(undef, 1024)
+AK.randn!(rng, z)
+
+# Standard normal allocation form
+z_cpu_auto = AK.randn(1024)                           # CPU, Vector{Float64}
+z_roc = AK.randn(ROCBackend(), 1024)                  # fresh RNG, allocate + fill ROCArray{Float32}
+z_cpu_typed = AK.randn(rng, Float16, 1024)            # CPU backend, explicit type, explicit RNG
+```
+
+```@docs
+AcceleratedKernels.CounterRNG
+AcceleratedKernels.CounterRNGAlgorithm
+AcceleratedKernels.rand_uint
+AcceleratedKernels.reset!
+AcceleratedKernels.rand!
+AcceleratedKernels.rand
+AcceleratedKernels.randn!
+AcceleratedKernels.randn
+```
\ No newline at end of file
diff --git a/prototype/RNGTest/Project.toml b/prototype/RNGTest/Project.toml
new file mode 100644
index 0000000..7536db2
--- /dev/null
+++ b/prototype/RNGTest/Project.toml
@@ -0,0 +1,3 @@
+[deps]
+AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
+RNGTest = "97cc5700-e6cb-5ca1-8fb2-7f6b45264ecd"
diff --git a/prototype/RNGTest/README.md b/prototype/RNGTest/README.md
new file mode 100644
index 0000000..2bc6a7b
--- /dev/null
+++ b/prototype/RNGTest/README.md
@@ -0,0 +1,33 @@
+# AK + RNGTest SmallCrush Prototype
+
+This folder provides a chunked random stream generator based on `AcceleratedKernels.jl` that can be fed into `RNGTest.jl`.
+
+The stream is deterministic and effectively unbounded:
+- each refill generates `chunk` random `UInt64` values with `AK.rand!`
+- each refill advances one persistent `CounterRNG` stream offset
+- this is a practical chunked stream for RNGTest callback mode
+
+`RNGTest.jl` (in this local checkout) expects a callback returning `Float64` in `[0,1]`, so `UInt64` words are mapped to `Float64` via top-53-bit scaling.
+
+Current status in this harness: `SplitMix64`, `Philox`, and `Threefry` all pass BigCrush using `run_bigcrush.jl`.
+
+## Run SmallCrush
+
+From this directory:
+
+```powershell
+julia --project=. run_smallcrush.jl
+```
+
+## Run BigCrush
+
+```powershell
+julia --project=. run_bigcrush.jl
+```
+
+Notes:
+- Configure `ALG`, `SEED`, and `CHUNK` at the top of
+  `run_smallcrush.jl` / `run_bigcrush.jl`.
+- The stream refills directly into host scratch using `AK.rand!` on CPU.
+- `chunk` controls refill amortization and memory usage.
+- `chunk=100000000` means ~800 MB host scratch (`UInt64`).
diff --git a/prototype/RNGTest/run_bigcrush.jl b/prototype/RNGTest/run_bigcrush.jl
new file mode 100644
index 0000000..a1298a0
--- /dev/null
+++ b/prototype/RNGTest/run_bigcrush.jl
@@ -0,0 +1,26 @@
+using RNGTest
+
+include("stream.jl")
+
+
+const ALG = :philox
+const SEED = 0x1234
+const CHUNK = 10_000_000
+const HOST_SCRATCH = Vector{UInt64}(undef, CHUNK)
+
+
+stream = AKUInt64Stream(
+    HOST_SCRATCH;
+    seed=SEED,
+    alg=ALG,
+    start_counter=UInt64(0),
+)
+gen = make_rngtest_generator!(stream)
+genname = "AK_Vector_$(ALG)_seed$(SEED)"
+
+println("Beginning BigCrush. This may take hours...")
+
+RNGTest.bigcrushTestU01(gen, genname)
+
+println("refills: ", stream.refill_count)
+println("numbers consumed (approx): ", (stream.refill_count - 1) * stream.chunk + (stream.idx - 1))
diff --git a/prototype/RNGTest/run_smallcrush.jl b/prototype/RNGTest/run_smallcrush.jl
new file mode 100644
index 0000000..db1251c
--- /dev/null
+++ b/prototype/RNGTest/run_smallcrush.jl
@@ -0,0 +1,23 @@
+using RNGTest
+
+include("stream.jl")
+
+
+const ALG = :philox
+const SEED = 0x1234
+const CHUNK = 100_000_000
+const HOST_SCRATCH = Vector{UInt64}(undef, CHUNK)
+
+
+stream = AKUInt64Stream(
+    HOST_SCRATCH;
+    seed=SEED,
+    alg=ALG,
+    start_counter=UInt64(0),
+)
+gen = make_rngtest_generator!(stream)
+genname = "AK_Vector_$(ALG)_seed$(SEED)"
+
+println("Beginning SmallCrush...")
+
+RNGTest.smallcrushTestU01(gen, genname)
diff --git a/prototype/RNGTest/stream.jl b/prototype/RNGTest/stream.jl
new file mode 100644
index 0000000..a383645
--- /dev/null
+++ b/prototype/RNGTest/stream.jl
@@ -0,0 +1,80 @@
+import AcceleratedKernels as AK
+
+
+function make_rng(seed::Integer, alg::Symbol; offset::Integer=0)
+    if alg === :philox
+        return AK.CounterRNG(seed; alg=AK.Philox(), offset=offset)
+    elseif alg === :threefry
+        return AK.CounterRNG(seed; alg=AK.Threefry(), offset=offset)
+    elseif alg === :splitmix64
+        return AK.CounterRNG(seed; alg=AK.SplitMix64(), offset=offset)
+    end
+    throw(ArgumentError("alg must be :philox, :threefry, or :splitmix64; got $alg"))
+end
+
+
+mutable struct AKUInt64Stream{R}
+    rng::R
+    chunk::Int
+    idx::Int
+    host_scratch::Vector{UInt64}
+    refill_count::Int
+end
+
+
+function AKUInt64Stream(
+    host_scratch::Vector{UInt64};
+    seed::Integer=0x1234,
+    alg::Symbol=:philox,
+    start_counter::UInt64=0x0000000000000000,
+)
+    chunk = length(host_scratch)
+    chunk > 0 || throw(ArgumentError("host_scratch must be non-empty"))
+    rng = make_rng(seed, alg; offset=start_counter)
+
+    return AKUInt64Stream(
+        rng,
+        chunk,
+        chunk + 1,
+        host_scratch,
+        0,
+    )
+end
+
+
+@inline _u01_from_u64(u::UInt64)::Float64 = Float64(u >>> 11) * 0x1.0p-53
+
+
+function _fill_chunk!(s::AKUInt64Stream)
+    AK.rand!(s.rng, s.host_scratch)
+    return nothing
+end
+
+
+function refill!(s::AKUInt64Stream)
+    _fill_chunk!(s)
+    s.idx = 1
+    s.refill_count += 1
+    return s
+end
+
+
+function next_u64!(s::AKUInt64Stream)::UInt64
+    if s.idx > s.chunk
+        refill!(s)
+    end
+    @inbounds u = s.host_scratch[s.idx]
+    s.idx += 1
+    return u
+end
+
+
+@inline next_float64!(s::AKUInt64Stream)::Float64 = _u01_from_u64(next_u64!(s))
+
+
+function make_rngtest_generator!(s::AKUInt64Stream)
+    if s.idx > s.chunk
+        refill!(s)
+    end
+    return () -> next_float64!(s)
+end
diff --git a/prototype/rand/Project.toml b/prototype/rand/Project.toml
new file mode 100644
index 0000000..d1926b1
--- /dev/null
+++ b/prototype/rand/Project.toml
@@ -0,0 +1,7 @@
+[deps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
diff --git a/prototype/rand/randn.jl b/prototype/rand/randn.jl
new file mode 100644
index 0000000..d552b5f
--- /dev/null
+++ b/prototype/rand/randn.jl
@@ -0,0 +1,51 @@
+using BenchmarkTools
+using CUDA
+
+import AcceleratedKernels as AK
+
+
+const N = 100_000_000
+const GPU_BLOCK_SIZE = 256
+
+const RNG_PHILOX = AK.CounterRNG(0x12345678; alg=AK.Philox(), offset=0x0)
+
+TestType = Float32
+
+x_cuda = CuArray{TestType}(undef, N)
+x_philox = CuArray{TestType}(undef, N)
+x_cpu = Vector{TestType}(undef, N)
+
+
+function run_cuda_randn!(x)
+    CUDA.randn!(x)
+    CUDA.synchronize()
+    return x
+end
+
+
+function run_ak_randn_gpu!(rng, x)
+    AK.randn!(rng, x; block_size=GPU_BLOCK_SIZE)
+    AK.synchronize(AK.get_backend(x))
+    return x
+end
+
+
+function run_ak_randn_cpu!(rng, x)
+    AK.randn!(rng, x)
+    return x
+end
+
+# warmup compile
+run_cuda_randn!(x_cuda)
+run_ak_randn_gpu!(RNG_PHILOX, x_philox)
+
+println("N = ", N)
+println("CPU threads: ", Threads.nthreads())
+
+println("\nCUDA.randn! benchmark (CuArray{$TestType}, in-place)")
+display(@benchmark run_cuda_randn!($x_cuda))
+
+println("\nAK.randn! Philox benchmark (GPU, CuArray{$TestType})")
+display(@benchmark run_ak_randn_gpu!($RNG_PHILOX, $x_philox))
+
+
diff --git a/prototype/rand/test_rand.jl b/prototype/rand/test_rand.jl
new file mode 100644
index 0000000..0d916f2
--- /dev/null
+++ b/prototype/rand/test_rand.jl
@@ -0,0 +1,57 @@
+using BenchmarkTools
+using CUDA
+
+import AcceleratedKernels as AK
+
+
+const N = 100_000_000
+const GPU_BLOCK_SIZE = 256
+
+
+const RNG_PHILOX = AK.CounterRNG(0x12345678; alg=AK.Philox())
+
+
+TestType = Float32
+x_cuda = CuArray{TestType}(undef, N)
+x_philox = CuArray{TestType}(undef, N)
+x_cpu = Vector{TestType}(undef, N)
+
+
+function run_cuda_rand!(x)
+    CUDA.rand!(x)
+    CUDA.synchronize()
+    return x
+end
+
+
+function run_ak_rand_gpu!(rng, x)
+    AK.rand!(rng, x; block_size=GPU_BLOCK_SIZE)
+    AK.synchronize(AK.get_backend(x))
+    return x
+end
+
+
+function run_ak_rand_cpu!(rng, x)
+    AK.rand!(rng, x)
+    return x
+end
+
+
+# warmup
+run_cuda_rand!(x_cuda)
+run_ak_rand_gpu!(RNG_PHILOX, x_philox)
+run_ak_rand_cpu!(RNG_PHILOX, x_cpu)
+
+
+println("N = ", N)
+println("CPU threads: ", Threads.nthreads())
+
+println("\nCUDA.rand! benchmark (CuArray{$TestType}, in-place)")
+display(@benchmark run_cuda_rand!($x_cuda))
+
+println("\nAK.rand! Philox benchmark (GPU, CuArray{$TestType})")
+display(@benchmark run_ak_rand_gpu!($RNG_PHILOX, $x_philox))
+
+println("\nAK.rand! benchmark (CPU, Vector{$TestType}, Philox)")
+display(@benchmark run_ak_rand_cpu!($RNG_PHILOX, $x_cpu))
+
diff --git a/src/AcceleratedKernels.jl b/src/AcceleratedKernels.jl
index d662c2a..7262cbe 100644
--- a/src/AcceleratedKernels.jl
+++ b/src/AcceleratedKernels.jl
@@ -15,6 +15,7 @@ using ArgCheck: @argcheck
 using GPUArraysCore: AnyGPUArray, @allowscalar
 using KernelAbstractions
 using KernelAbstractions: @context
+using Random
 import UnsafeAtomics
 
 
@@ -31,6 +32,7 @@ include("map.jl")
 include("sort/sort.jl")
 include("reduce/reduce.jl")
 include("accumulate/accumulate.jl")
+include("rand/rand.jl")
 include("searchsorted.jl")
 include("predicates.jl")
 include("arithmetics.jl")
diff --git a/src/rand/philox.jl b/src/rand/philox.jl
new file mode 100644
index 0000000..0de9ff4
--- /dev/null
+++ b/src/rand/philox.jl
@@ -0,0 +1,59 @@
+struct Philox <: CounterRNGAlgorithm end
+
+
+# Philox magic numbers
+const PHILOX_M0 = UInt32(0xD256D193)
+const PHILOX_W0 = UInt32(0x9E3779B9)
+const PHILOX_ROUNDS = 7
+
+
+@inline function _philox2x32_round(x0::UInt32, x1::UInt32, k0::UInt32)
+    lo = PHILOX_M0 * x0
+    hi = _mulhi_u32(PHILOX_M0, x0)
+    y0 = xor(xor(hi, k0), x1)
+    y1 = lo
+    return y0, y1
+end
+
+
+# Evaluate one Philox block at `counter`, returning two 32-bit lanes `(x0, x1)`
+@inline function _philox2x32_block(
+    seed::UInt64,
+    counter::UInt64,
+)::Tuple{UInt32, UInt32}
+    x0 = _u32_lo(counter)
+    x1 = _u32_hi(counter)
+
+    k0 = splitmix32_from_u64(seed)
+
+    @inbounds for _ in 1:PHILOX_ROUNDS
+        x0, x1 = _philox2x32_round(x0, x1, k0)
+        k0 += PHILOX_W0
+    end
+
+    return x0, x1
+end
+
+
+# Return lane 0 from the single Philox block at `counter`
+@inline function rand_uint(
+    seed::UInt64,
+    alg::Philox,
+    counter::UInt64,
+    ::Type{UInt32},
+)::UInt32
+    x0, _ = _philox2x32_block(seed, counter)
+    return x0
+end
+
+
+# Build UInt64 from the two lanes `(x0, x1)` of the same Philox block at `counter`
+@inline function rand_uint(
+    seed::UInt64,
+    alg::Philox,
+    counter::UInt64,
+    ::Type{UInt64},
+)::UInt64
+    x0, x1 = _philox2x32_block(seed, counter)
+    return _u64_from_u32s(x0, x1)
+end
diff --git a/src/rand/rand.jl b/src/rand/rand.jl
new file mode 100644
index 0000000..b2b14bd
--- /dev/null
+++ b/src/rand/rand.jl
@@ -0,0 +1,221 @@
+const ALLOWED_RAND_SCALARS = Union{
+    UInt8, UInt16, UInt32, UInt64,
+    Int8, Int16, Int32, Int64,
+    Float16, Float32, Float64,
+    Bool
+}
+
+
+"""
+    CounterRNGAlgorithm
+
+Abstract supertype for algorithms used by [`CounterRNG`](@ref).
+
+To define a custom counter-based RNG algorithm, subtype `CounterRNGAlgorithm` and implement
+[`rand_uint`](@ref) for both `UInt32` and `UInt64` outputs.
+"""
+abstract type CounterRNGAlgorithm end
+
+
+"""
+    CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox())
+
+Counter-based RNG for [`rand!`](@ref).
+
+`CounterRNG` stores:
+- `seed`
+- algorithm (`alg`)
+- stream `offset`
+
+The default algorithm is `Philox()`.
+
+`seed` may be any non-negative `Integer`. It is normalised to `UInt64` internally.
+`offset` is initialised to `0` by default and advances by `length(v)` after each [`rand!`](@ref)
+call.
+
+Constructors:
+- `CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox(), offset::Integer=0)`
+  Uses an explicit non-negative seed and offset.
+- `CounterRNG(; alg::CounterRNGAlgorithm=Philox(), offset::Integer=0)`
+  Auto-seeds once using `Base.rand(UInt64)`, with default `offset == 0`.
+"""
+mutable struct CounterRNG{A <: CounterRNGAlgorithm}
+    const seed::UInt64
+    const alg::A
+    offset::UInt64
+end
+
+
+function CounterRNG(seed::Integer; alg::CounterRNGAlgorithm=Philox(), offset::Integer=0)
+    @argcheck seed >= 0 "Seed must be a non-negative integer"
+    @argcheck offset >= 0 "Offset must be a non-negative integer"
+    CounterRNG(UInt64(seed), alg, UInt64(offset))
+end
+
+
+function CounterRNG(; alg::CounterRNGAlgorithm=Philox(), offset::Integer=0)
+    CounterRNG(Base.rand(UInt64); alg, offset)
+end
+
+
+CounterRNG(seed::Integer, alg::CounterRNGAlgorithm) = CounterRNG(seed; alg)
+
+
+"""
+    reset!(rng::CounterRNG)
+
+Reset `rng.offset` to `0x0`.
+"""
+@inline function reset!(rng::CounterRNG)
+    rng.offset = UInt64(0)
+    return rng
+end
+
+
+
+
+# Shared helpers
+include("utilities.jl")
+
+# Algorithm-specific integer generators
+include("splitmix.jl")
+include("philox.jl")
+include("threefry.jl")
+
+# Normally distributed scalar generators and randn!
+include("randn.jl")
+
+
+
+
+"""
+    rand!(
+        rng::CounterRNG,
+        v::AbstractArray{T},
+        backend::Backend=get_backend(v);
+
+        # CPU settings
+        max_tasks::Int=Threads.nthreads(),
+        min_elems::Int=1,
+
+        # Implementation choice
+        prefer_threads::Bool=true,
+
+        # GPU settings
+        block_size::Int=256,
+    )
+
+Fill `v` in-place with pseudo-random values using a counter-based RNG stream. For `v[i]`, the
+counter is `rng.offset + UInt64(i - 1)` in linear indexing order.
+
+After filling `v`, `rng.offset` advances by `length(v)`. It can be called without `rng`, in which
+case the default `CounterRNG` is used.
+
+Supported scalar element types are:
+- `UInt8`, `UInt16`, `UInt32`, `UInt64`
+- `Int8`, `Int16`, `Int32`, `Int64`
+- `Float16`, `Float32`, `Float64`
+- `Bool`
+
+Semantics:
+- Unsigned integers: raw random bit patterns of requested width.
+- Signed integers: corresponding unsigned patterns reinterpreted as signed.
+- Floats: mantissa-based conversion from `UInt32`/`UInt64` into `[0, 1)`, uniform over the
+  produced mantissa grid (not over all representable floats).
+- Bool: `true` if the raw `UInt` draw is odd (`isodd(u)`), otherwise `false`.
+
+"""
+function rand!(
+    rng::CounterRNG,
+    v::AbstractArray{T},
+    backend::Backend=get_backend(v);
+
+    # CPU settings
+    max_tasks::Int=Threads.nthreads(),
+    min_elems::Int=1,
+    prefer_threads::Bool=true,
+
+    # GPU settings
+    block_size::Int=256,
+) where T
+
+    @argcheck T <: ALLOWED_RAND_SCALARS "Unsupported eltype $T. Supported: $(ALLOWED_RAND_SCALARS)"
+
+    # Local isbits captures from mutable rng object
+    seed, alg, initial_offset = rng.seed, rng.alg, rng.offset
+    
+    foreachindex(
+        v, backend;
+        max_tasks, min_elems, prefer_threads, block_size,
+    ) do i
+        @inbounds v[i] = rand_scalar(seed, alg, initial_offset + _counter_from_index(i), T)
+    end
+
+    rng.offset += UInt64(length(v))
+    
+    v
+end
+
+
+rand!(v::AbstractArray, args...; kwargs...) = rand!(CounterRNG(), v, args...; kwargs...)
+
+
+"""
+    rand(
+        rng::CounterRNG,
+        backend::Backend,
+        ::Type{T},
+        dims::Integer...;
+
+        # CPU settings
+        max_tasks::Int=Threads.nthreads(),
+        min_elems::Int=1,
+        prefer_threads::Bool=true,
+
+        # GPU settings
+        block_size::Int=256,
+    ) where T
+
+Allocate an array of element type `T` on `backend` with shape `dims`, fill it in-place via
+[`rand!`](@ref), and return it.
+
+Convenience overloads:
+- `rng` omitted: uses a fresh `CounterRNG()`.
+- `backend` omitted: defaults to `CPU_BACKEND`.
+- `T` omitted: defaults by backend (`Float64` on CPU backend, `Float32` otherwise).
+"""
+function rand(
+    rng::CounterRNG,
+    backend::Backend,
+    ::Type{T},
+    dims::Integer...;
+
+    # CPU settings
+    max_tasks::Int=Threads.nthreads(),
+    min_elems::Int=1,
+    prefer_threads::Bool=true,
+
+    # GPU settings
+    block_size::Int=256,
+) where T
+    @argcheck T <: ALLOWED_RAND_SCALARS "Unsupported eltype $T. Supported: $(ALLOWED_RAND_SCALARS)"
+    return _allocate_and_fill_rand(
+        rand!, rng, backend, T, dims...;
+        max_tasks, min_elems, prefer_threads, block_size,
+    )
+end
+
+
+function rand(rng::CounterRNG, backend::Backend, dims::Integer...; kwargs...)
+    DefaultScalarType = (backend == CPU_BACKEND) ? Float64 : Float32
+    rand(rng, backend, DefaultScalarType, dims...; kwargs...)
+end
+
+
+rand(rng::CounterRNG, args...; kwargs...) = rand(rng, CPU_BACKEND, args...; kwargs...)
+rand(backend::Backend, args...; kwargs...) = rand(CounterRNG(), backend, args...; kwargs...)
+rand(::Type{T}, dims::Integer...; kwargs...) where {T} = rand(CPU_BACKEND, T, dims...; kwargs...)
+rand(dims::Integer...; kwargs...) = rand(CPU_BACKEND, dims...; kwargs...)
+rand(; kwargs...) = throw(ArgumentError("rand requires at least one dimension"))
+
+
diff --git a/src/rand/randn.jl b/src/rand/randn.jl
new file mode 100644
index 0000000..b2b5dbb
--- /dev/null
+++ b/src/rand/randn.jl
@@ -0,0 +1,248 @@
+const ALLOWED_RANDN_SCALARS = Union{
+    Float16, Float32, Float64
+}
+
+@inline function randn_pair(
+    seed::UInt64,
+    alg::CounterRNGAlgorithm,
+    pair_counter::UInt64,
+    ::Type{Float16},
+)::Tuple{Float16, Float16}
+    z0, z1 = randn_pair(seed, alg, pair_counter, Float32)
+    return Float16(z0), Float16(z1)
+end
+
+
+@inline function randn_pair(
+    seed::UInt64,
+    alg::CounterRNGAlgorithm,
+    pair_counter::UInt64,
+    ::Type{Float32},
+)::Tuple{Float32, Float32}
+    u = rand_uint(seed, alg, pair_counter, UInt64)
+    u1 = _uint32_to_open_unit_float32_midpoint(_u32_lo(u))
+    u2 = _uint32_to_open_unit_float32_midpoint(_u32_hi(u))
+    radius = sqrt(-2.0f0 * log(u1))
+    theta = Float32(2pi) * u2
+    stheta, ctheta = sincos(theta)
+    return radius * ctheta, radius * stheta
+end
+
+
+@inline function randn_pair(
+    seed::UInt64,
+    alg::CounterRNGAlgorithm,
+    pair_counter::UInt64,
+    ::Type{Float64},
+)::Tuple{Float64, Float64}
+    c0 = pair_counter << 1
+    u1 = rand_float_open01(seed, alg, c0, Float64)
+    u2 = rand_float_open01(seed, alg, c0 + UInt64(1), Float64)
+    radius = sqrt(-2.0 * log(u1))
+    theta = Float64(2pi) * u2
+    stheta, ctheta = sincos(theta)
+    return radius * ctheta, radius * stheta
+end
+
+
+@inline function randn_pair(::UInt64, ::CounterRNGAlgorithm, ::UInt64, ::Type{T}) where {T}
+    throw(ArgumentError(
+        "Unsupported normal random type $(T). Supported: $(ALLOWED_RANDN_SCALARS)"
+    ))
+end
+
+
+@inline function randn_scalar(
+    seed::UInt64,
+    alg::CounterRNGAlgorithm,
+    normal_counter::UInt64,
+    ::Type{T},
+)::T where {T <: ALLOWED_RANDN_SCALARS}
+    pair_counter = normal_counter >> 1
+    z0, z1 = randn_pair(seed, alg, pair_counter, T)
+    return iszero(normal_counter & UInt64(0x1)) ? z0 : z1
+end
+
+
+@inline function randn_scalar(::UInt64, ::CounterRNGAlgorithm, ::UInt64, ::Type{T}) where {T}
+    throw(ArgumentError(
+        "Unsupported normal random scalar type $(T). Supported: $(ALLOWED_RANDN_SCALARS)"
+    ))
+end
+
+
+# `Val{ODD}` keeps parity in the type domain so each specialization (`ODD==0` / `ODD==1`)
+# can fold index bias at compile time.
+# - `Val{0}` => even-offset pair writes at indices `(2i-1, 2i)` so bias is `-1`
+# - `Val{1}` => odd-offset pair writes at indices `(2i, 2i+1)` after prefix handling so bias is `0`
+@inline _randn_i0_bias(::Val{0}) = -1
+@inline _randn_i0_bias(::Val{1}) = 0
+
+
+@inline function _randn_core!(
+    v::AbstractArray{T}, seed, alg, initial_offset,
+    backend, max_tasks, min_elems, prefer_threads, block_size,
+    ::Val{ODD},
+) where {T, ODD}
+
+    len = length(v)
+    prefix_len = ODD
+
+    # If offset is odd, need to individually handle the first element.
+    prefix_len == 1 && @allowscalar @inbounds v[1] = randn_scalar(seed, alg, initial_offset, T)
+
+    # Stream is now even-aligned, so can foreachindex through the pairs.
+    pair_start = (initial_offset + UInt64(prefix_len)) >> 1
+
+    # Capture `Val(ODD)` into the closure so bias stays a compile-time constant inside the loop.
+    odd_val = Val(ODD)
+    i0_bias = _randn_i0_bias(odd_val)
+    remaining_len = len - prefix_len
+    pair_count = remaining_len >> 1
+
+    if pair_count > 0
+        foreachindex(
+            Base.OneTo(pair_count), backend;
+            max_tasks, min_elems, prefer_threads, block_size,
+        ) do i
+            pair_counter = pair_start + _counter_from_index(i)
+            z0, z1 = randn_pair(seed, alg, pair_counter, T)
+            i0 = (i << 1) + _randn_i0_bias(odd_val)
+            @inbounds v[i0] = z0
+            @inbounds v[i0 + 1] = z1
+        end
+    end
+
+    # If an extra element remains after pair writing, fill it individually.
+    tail_index = (pair_count << 1) + i0_bias + 2
+    if tail_index <= len
+        tail_counter = initial_offset + UInt64(tail_index - 1)
+        @allowscalar @inbounds v[tail_index] = randn_scalar(seed, alg, tail_counter, T)
+    end
+
+    return v
+end
+
+
+"""
+    randn!(
+        rng::CounterRNG,
+        v::AbstractArray{T},
+        backend::Backend=get_backend(v);
+
+        # CPU settings
+        max_tasks::Int=Threads.nthreads(),
+        min_elems::Int=1,
+        prefer_threads::Bool=true,
+
+        # GPU settings
+        block_size::Int=256,
+    ) where {T <: AbstractFloat}
+
+Fill `v` in-place with pseudo-random samples from a standard normal distribution.
+
+For `v[i]`, the normal stream counter is `rng.offset + UInt64(i - 1)` in linear indexing order.
+Values are generated using Box-Muller from midpoint-open uniforms in `(0, 1)`.
+
+After filling `v`, `rng.offset` advances by `length(v)`.
+
+It can be called without an `rng`, in which case the default `CounterRNG` will be used.
+"""
+function randn!(
+    rng::CounterRNG,
+    v::AbstractArray{T},
+    backend::Backend=get_backend(v);
+
+    # CPU settings
+    max_tasks::Int=Threads.nthreads(),
+    min_elems::Int=1,
+    prefer_threads::Bool=true,
+
+    # GPU settings
+    block_size::Int=256,
+) where T
+
+    @argcheck T <: ALLOWED_RANDN_SCALARS "Unsupported eltype $T. Supported: $(ALLOWED_RANDN_SCALARS)"
+
+    isempty(v) && return v
+
+    # Local isbits captures from mutable rng object.
+    seed, alg, initial_offset = rng.seed, rng.alg, rng.offset
+
+    core_args = (
+        v, seed, alg, initial_offset, backend, max_tasks, min_elems, prefer_threads, block_size
+    )
+
+    # Dispatch depending on required initial index bias
+    if iseven(initial_offset)
+        _randn_core!(core_args..., Val(0))
+    else
+        _randn_core!(core_args..., Val(1))
+    end
+
+    rng.offset += UInt64(length(v))
+
+    v
+end
+
+
+randn!(v::AbstractArray, args...; kwargs...) = randn!(CounterRNG(), v, args...; kwargs...)
+
+
+"""
+    randn(
+        rng::CounterRNG,
+        backend::Backend,
+        ::Type{T},
+        dims::Integer...;
+
+        # CPU settings
+        max_tasks::Int=Threads.nthreads(),
+        min_elems::Int=1,
+        prefer_threads::Bool=true,
+
+        # GPU settings
+        block_size::Int=256,
+    ) where T
+
+Allocate an array of element type `T` on `backend` with shape `dims`, fill it in-place via
+[`randn!`](@ref), and return it.
+
+Convenience overloads:
+- `rng` omitted: uses a fresh `CounterRNG()`.
+- `backend` omitted: defaults to `CPU_BACKEND`.
+- `T` omitted: defaults by backend (`Float64` on CPU backend, `Float32` otherwise).
+"""
+function randn(
+    rng::CounterRNG,
+    backend::Backend,
+    ::Type{T},
+    dims::Integer...;
+
+    # CPU settings
+    max_tasks::Int=Threads.nthreads(),
+    min_elems::Int=1,
+    prefer_threads::Bool=true,
+
+    # GPU settings
+    block_size::Int=256,
+) where T
+    @argcheck T <: ALLOWED_RANDN_SCALARS "Unsupported eltype $T. Supported: $(ALLOWED_RANDN_SCALARS)"
+    return _allocate_and_fill_rand(
+        randn!, rng, backend, T, dims...;
+        max_tasks, min_elems, prefer_threads, block_size,
+    )
+end
+
+
+function randn(rng::CounterRNG, backend::Backend, dims::Integer...; kwargs...)
+    DefaultScalarType = (backend == CPU_BACKEND) ? Float64 : Float32
+    randn(rng, backend, DefaultScalarType, dims...; kwargs...)
+end
+
+
+randn(rng::CounterRNG, args...; kwargs...) = randn(rng, CPU_BACKEND, args...; kwargs...)
+randn(backend::Backend, args...; kwargs...) = randn(CounterRNG(), backend, args...; kwargs...)
+randn(::Type{T}, dims::Integer...; kwargs...) where {T} = randn(CPU_BACKEND, T, dims...; kwargs...)
+randn(dims::Integer...; kwargs...) = randn(CPU_BACKEND, dims...; kwargs...)
+randn(; kwargs...) = throw(ArgumentError("randn requires at least one dimension"))
diff --git a/src/rand/splitmix.jl b/src/rand/splitmix.jl
new file mode 100644
index 0000000..cc474be
--- /dev/null
+++ b/src/rand/splitmix.jl
@@ -0,0 +1,44 @@
+struct SplitMix64 <: CounterRNGAlgorithm end
+
+# SplitMix64 magic numbers
+const SPLITMIX64_INCREMENT = UInt64(0x9e3779b97f4a7c15)
+const SPLITMIX64_MIX_A = UInt64(0xbf58476d1ce4e5b9)
+const SPLITMIX64_MIX_B = UInt64(0x94d049bb133111eb)
+
+
+@inline function _splitmix64_mix(x::UInt64)::UInt64
+    x = xor(x, x >> 30)
+    x *= SPLITMIX64_MIX_A
+    x = xor(x, x >> 27)
+    x *= SPLITMIX64_MIX_B
+    x = xor(x, x >> 31)
+    return x
+end
+
+
+# Derive a 32-bit seed word from a 64-bit seed using SplitMix64 mixing.
+@inline function splitmix32_from_u64(seed::UInt64)::UInt32
+    return _u32_hi(_splitmix64_mix(seed + SPLITMIX64_INCREMENT))
+end
+
+
+# Natural SplitMix64 output path: compute 64 random bits directly from one counter
+@inline function rand_uint(
+    seed::UInt64,
+    alg::SplitMix64,
+    counter::UInt64,
+    ::Type{UInt64},
+)::UInt64
+    return _splitmix64_mix(counter + seed + SPLITMIX64_INCREMENT)
+end
+
+
+# UInt32 path is derived from the high 32 bits of the UInt64 SplitMix output
+@inline function rand_uint(
+    seed::UInt64,
+    alg::SplitMix64,
+    counter::UInt64,
+    ::Type{UInt32},
+)::UInt32
+    return _u32_hi(rand_uint(seed, alg, counter, UInt64))
+end
diff --git a/src/rand/threefry.jl b/src/rand/threefry.jl
new file mode 100644
index 0000000..f6ef632
--- /dev/null
+++ b/src/rand/threefry.jl
@@ -0,0 +1,75 @@
+struct Threefry <: CounterRNGAlgorithm end
+
+# Threefry magic numbers
+const THREEFRY_PARITY = UInt32(0x1BD11BDA)
+const THREEFRY_ROTATIONS = (
+    UInt32(13), UInt32(15), UInt32(26), UInt32(6),
+    UInt32(17), UInt32(29), UInt32(16), UInt32(24),
+)
+const THREEFRY_ROUNDS = 20
+
+
+@inline function _threefry_key_word(k0::UInt32, k1::UInt32, k2::UInt32, idx::UInt32)::UInt32
+    idx == UInt32(0) && return k0
+    idx == UInt32(1) && return k1
+    return k2
+end
+
+
+# Evaluate one Threefry block at `counter`, returning two 32-bit lanes `(x0, x1)`
+@inline function _threefry2x32_block(
+    seed::UInt64,
+    counter::UInt64,
+)::Tuple{UInt32, UInt32}
+
+    x0 = _u32_lo(counter)
+    x1 = _u32_hi(counter)
+
+    k0 = _u32_lo(seed)
+    k1 = _u32_hi(seed)
+    k2 = xor(THREEFRY_PARITY, xor(k0, k1))
+
+    x0 += k0
+    x1 += k1
+
+    @inbounds for round in 0:(THREEFRY_ROUNDS - 1)
+        round_u32 = UInt32(round)
+        rot = THREEFRY_ROTATIONS[Int((round_u32 & UInt32(0x7)) + UInt32(1))]
+        x0 += x1
+        x1 = xor(_rotl32(x1, rot), x0)
+
+        if (round_u32 & UInt32(0x3)) == UInt32(0x3)
+            s = (round_u32 >>> 2) + UInt32(1)
+            i0 = s % UInt32(3)
+            i1 = (s + UInt32(1)) % UInt32(3)
+            x0 += _threefry_key_word(k0, k1, k2, i0)
+            x1 += _threefry_key_word(k0, k1, k2, i1) + s
+        end
+    end
+
+    return x0, x1
+end
+
+
+# Return lane 0 from the single Threefry block at `counter`
+@inline function rand_uint(
+    seed::UInt64,
+    alg::Threefry,
+    counter::UInt64,
+    ::Type{UInt32},
+)::UInt32
+    x0, _ = _threefry2x32_block(seed, counter)
+    return x0
+end
+
+
+# Build UInt64 from the two lanes `(x0, x1)` of the same Threefry block at `counter`
+@inline function rand_uint(
+    seed::UInt64,
+    alg::Threefry,
+    counter::UInt64,
+    ::Type{UInt64},
+)::UInt64
+    x0, x1 = _threefry2x32_block(seed, counter)
+    return _u64_from_u32s(x0, x1)
+end
diff --git a/src/rand/utilities.jl b/src/rand/utilities.jl
new file mode 100644
index 0000000..35d9084
--- /dev/null
+++ b/src/rand/utilities.jl
@@ -0,0 +1,240 @@
+# lo: rightmost 32 bits, hi: leftmost 32 bits
+@inline _u32_lo(x::UInt64)::UInt32 = UInt32(x & UInt64(0xffffffff))
+@inline _u32_hi(x::UInt64)::UInt32 = UInt32(x >> 32)
+
+# Construct UInt64 by bit concatenation of two UInt32s
+@inline _u64_from_u32s(lo::UInt32, hi::UInt32)::UInt64 = (UInt64(hi) << 32) | UInt64(lo)
+
+# Leftmost 32 bits of a*b cast to UInt64s
+@inline _mulhi_u32(a::UInt32, b::UInt32)::UInt32 = UInt32((UInt64(a) * UInt64(b)) >> 32)
+
+# 32-bit rotate left by r positions
+@inline _rotl32(x::UInt32, r::UInt32)::UInt32 = bitrotate(x, Int32(r))
+
+# Get counter used for CounterRNG from element index
+@inline _counter_from_index(i)::UInt64 = UInt64(i - one(i))
+
+
+# Shared allocation + fill helper for rand/randn convenience constructors.
+@inline function _allocate_and_fill_rand(
+    fill!,
+    rng::CounterRNG,
+    backend::Backend,
+    ::Type{T},
+    dims::Integer...;
+
+    # CPU settings
+    max_tasks::Int=Threads.nthreads(),
+    min_elems::Int=1,
+    prefer_threads::Bool=true,
+
+    # GPU settings
+    block_size::Int=256,
+) where {T}
+    dims_int = Base.map(Int, dims)
+    v = KernelAbstractions.allocate(backend, T, dims_int)
+    fill!(rng, v, backend; max_tasks, min_elems, prefer_threads, block_size)
+    return v
+end
+
+
+
+
+@inline _rand_scalar_uint_type(::Type{UInt8}) = UInt32
+@inline _rand_scalar_uint_type(::Type{UInt16}) = UInt32
+@inline _rand_scalar_uint_type(::Type{UInt32}) = UInt32
+@inline _rand_scalar_uint_type(::Type{Int8}) = UInt32
+@inline _rand_scalar_uint_type(::Type{Int16}) = UInt32
+@inline _rand_scalar_uint_type(::Type{Int32}) = UInt32
+@inline _rand_scalar_uint_type(::Type{Float16}) = UInt32
+@inline _rand_scalar_uint_type(::Type{Float32}) = UInt32
+@inline _rand_scalar_uint_type(::Type{UInt64}) = UInt64
+@inline _rand_scalar_uint_type(::Type{Int64}) = UInt64
+@inline _rand_scalar_uint_type(::Type{Float64}) = UInt64
+@inline _rand_scalar_uint_type(::Type{Bool}) = UInt32
+
+
+@inline _rand_scalar_from_uint(::Type{UInt8}, u::UInt32)::UInt8 = trunc(UInt8, u >> 24)
+@inline _rand_scalar_from_uint(::Type{UInt16}, u::UInt32)::UInt16 = trunc(UInt16, u >> 16)
+@inline _rand_scalar_from_uint(::Type{UInt32}, u::UInt32)::UInt32 = u
+@inline _rand_scalar_from_uint(::Type{UInt64}, u::UInt64)::UInt64 = u
+@inline _rand_scalar_from_uint(::Type{Int8}, u::UInt32)::Int8 = reinterpret(Int8, trunc(UInt8, u >> 24))
+@inline _rand_scalar_from_uint(::Type{Int16}, u::UInt32)::Int16 = reinterpret(Int16, trunc(UInt16, u >> 16))
+@inline _rand_scalar_from_uint(::Type{Int32}, u::UInt32)::Int32 = reinterpret(Int32, u)
+@inline _rand_scalar_from_uint(::Type{Int64}, u::UInt64)::Int64 = reinterpret(Int64, u)
+@inline _rand_scalar_from_uint(::Type{Float16}, u::UInt32)::Float16 = uint32_to_unit_float16(u)
+@inline _rand_scalar_from_uint(::Type{Float32}, u::UInt32)::Float32 = uint32_to_unit_float32(u)
+@inline _rand_scalar_from_uint(::Type{Float64}, u::UInt64)::Float64 = uint64_to_unit_float64(u)
+@inline _rand_scalar_from_uint(::Type{Bool}, u::UInt32)::Bool = isodd(u)
+
+
+#=
+Every RNG algorithm implements rand_uint(seed, alg, counter, UInt32/UInt64).
+This is the fallback for unsupported RNG algorithms.
+=#
+"""
+    rand_uint(seed::UInt64, alg::CounterRNGAlgorithm, counter::UInt64, ::Type{UIntType}) -> UIntType
+    where {UIntType <: Union{UInt32, UInt64}}
+
+Low-level extension point for counter-based RNG algorithms used by [`CounterRNG`](@ref).
+
+`rand_uint` must deterministically map `(seed, alg, counter)` to a raw unsigned integer of the
+requested width. Custom algorithms should implement methods for both:
+
+- `rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt32})::UInt32`
+- `rand_uint(seed::UInt64, alg::MyAlg, counter::UInt64, ::Type{UInt64})::UInt64`
+
+These methods are used internally by [`rand!`](@ref), [`rand`](@ref), [`randn!`](@ref), and
+[`randn`](@ref) to generate integers, floats, and normal samples.
+
+# Requirements
+- The mapping must be deterministic for fixed `seed`, `alg`, and `counter`.
+- Implement both `UInt32` and `UInt64` widths.
+- The method should return raw random bits; higher-level type conversion is handled by AK separately.
+
+# Notes
+- `counter` is the logical stream position (typically the array index).
+- For block-based algorithms such as Philox or Threefry, the `UInt32` and `UInt64` methods may
+  share an internal block computation.
+- The fallback method throws an `ArgumentError` for algorithms that do not implement `rand_uint`.
+
+See also: [`CounterRNGAlgorithm`](@ref), [`CounterRNG`](@ref).
+"""
+@inline function rand_uint(
+    ::UInt64,
+    alg::CounterRNGAlgorithm,
+    ::UInt64,
+    ::Type{UIntType}
+)::UIntType where {UIntType <: Union{UInt32, UInt64}}
+    throw(ArgumentError("No rand_uint implementation for RNG algorithm: $(typeof(alg))"))
+end
+
+
+#=
+Shared scalar generation:
+1) map requested scalar type to corresponding raw UInt width
+2) fill the UInt with random bits
+3) convert bits into requested scalar representation
+=#
+@inline function rand_scalar(
+    seed::UInt64,
+    alg::CounterRNGAlgorithm,
+    counter::UInt64,
+    ::Type{T}
+)::T where {T <: ALLOWED_RAND_SCALARS}
+
+    UIntType = _rand_scalar_uint_type(T)
+    u = rand_uint(seed, alg, counter, UIntType)
+
+    return _rand_scalar_from_uint(T, u)
+end
+
+
+@inline function rand_scalar(::UInt64, ::CounterRNGAlgorithm, ::UInt64, ::Type{T}) where {T}
+    throw(ArgumentError(
+        "Unsupported random scalar type $(T). Supported: $(ALLOWED_RAND_SCALARS)"
+    ))
+end
+
+
+# Convert random UInt32 bits to Float16 in [0, 1) by mantissa construction.
+@inline function uint32_to_unit_float16(u::UInt32)::Float16
+
+    # Keep 10 random bits for the mantissa (drop 22 rightmost bits from the UInt32)
+    # and combine with the bit pattern of Float16(1.0) (sign=0, exponent=15).
+    bits = UInt16(0x3c00) | UInt16(u >> 22)
+
+    # Interpret as 1.mantissa, then subtract 1 for [0, 1)
+    reinterpret(Float16, bits) - Float16(1)
+end
+
+
+# Convert random UInt32 bits to Float32 in [0, 1) by mantissa construction.
+@inline function uint32_to_unit_float32(u::UInt32)::Float32
+
+    # Keep 23 random bits for the mantissa (drop 9 rightmost bits from the UInt32)
+    # and combine with the bit pattern of 1.0f0 (sign=0, exponent=127).
+    bits = UInt32(0x3f800000) | (u >> 9)
+
+    # Interpret as 1.mantissa, then subtract 1 for [0, 1)
+    reinterpret(Float32, bits) - 1.0f0
+end
+
+
+# Convert random UInt64 bits to Float64 in [0, 1) by mantissa construction.
+@inline function uint64_to_unit_float64(u::UInt64)::Float64
+
+    # Keep 52 random bits for the mantissa (drop 12 rightmost bits from the UInt64)
+    # and combine with the bit pattern of 1.0 (sign=0, exponent=1023).
+    bits = UInt64(0x3ff0000000000000) | (u >> 12)
+
+    # Interpret as 1.mantissa, then subtract 1 for [0, 1)
+    reinterpret(Float64, bits) - 1.0
+end
+
+
+
+
+
+### Helpers for randn ###
+
+
+# Midpoint-mapped open-interval Float sampling in (0, 1), used for Box-Muller
+const OPEN01_MAX_MIDPOINT_INDEX_F32 = UInt32(0x00fffffe)
+const OPEN01_MAX_MIDPOINT_INDEX_F64 = UInt64(0x001ffffffffffffe)
+const OPEN01_MIDPOINT_SCALE_F32 = ldexp(Float32(1), -24)
+const OPEN01_MIDPOINT_SCALE_F64 = ldexp(Float64(1), -53)
+
+
+# Convert random UInt32 bits to Float32 in (0, 1) using midpoint mapping on a 24-bit grid.
+@inline function _uint32_to_open_unit_float32_midpoint(u::UInt32)::Float32
+    # `min` keeps the top midpoint below one after Float32 rounding.
+    k = min(u >> 8, OPEN01_MAX_MIDPOINT_INDEX_F32)
+    return (Float32(k) + 0.5f0) * OPEN01_MIDPOINT_SCALE_F32
+end
+
+
+# Convert random UInt64 bits to Float64 in (0, 1) using midpoint mapping on a 53-bit grid.
+@inline function _uint64_to_open_unit_float64_midpoint(u::UInt64)::Float64
+    # `min` keeps the top midpoint below one after Float64 rounding.
+    k = min(u >> 11, OPEN01_MAX_MIDPOINT_INDEX_F64)
+    return (Float64(k) + 0.5) * OPEN01_MIDPOINT_SCALE_F64
+end
+
+
+# Float16 path reuses Float32 midpoint sampling for robust math in Box-Muller.
+@inline function rand_float_open01(
+    seed::UInt64,
+    alg::CounterRNGAlgorithm,
+    counter::UInt64,
+    ::Type{Float16},
+)::Float16
+    return Float16(rand_float_open01(seed, alg, counter, Float32))
+end
+
+
+@inline function rand_float_open01(
+    seed::UInt64,
+    alg::CounterRNGAlgorithm,
+    counter::UInt64,
+    ::Type{Float32},
+)::Float32
+    return _uint32_to_open_unit_float32_midpoint(rand_uint(seed, alg, counter, UInt32))
+end
+
+
+@inline function rand_float_open01(
+    seed::UInt64,
+    alg::CounterRNGAlgorithm,
+    counter::UInt64,
+    ::Type{Float64},
+)::Float64
+    return _uint64_to_open_unit_float64_midpoint(rand_uint(seed, alg, counter, UInt64))
+end
+
+
+@inline function rand_float_open01(::UInt64, ::CounterRNGAlgorithm, ::UInt64, ::Type{T}) where {T}
+    throw(ArgumentError(
+        "Unsupported open-interval random type $(T). Supported: Union{Float16, Float32, Float64}"
+    ))
+end
diff --git a/test/rand.jl b/test/rand.jl
new file mode 100644
index 0000000..a6b62c6
--- /dev/null
+++ b/test/rand.jl
@@ -0,0 +1,488 @@
+const RAND_ALGS = (AK.SplitMix64(), AK.Philox(), AK.Threefry())
+const RAND_SCALAR_TYPES_ALL = (
+    UInt8, UInt16, UInt32, UInt64,
+    Int8, Int16, Int32, Int64,
+    Float16, Float32, Float64,
+    Bool,
+)
+const RAND_SCALAR_TYPES_BACKEND = IS_CPU_BACKEND ?
+                                  RAND_SCALAR_TYPES_ALL :
+                                  (UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Bool)
+const RUN_FLOAT16_RAND_TESTS = IS_CPU_BACKEND
+const RUN_FLOAT64_RAND_TESTS = IS_CPU_BACKEND
+
+
+_is_unit_interval(v) = all(x -> !isnan(x) && zero(x) <= x < one(x), v)
+
+
+function _rand_fill_reference!(
+    rng,
+    x::AbstractArray{T};
+    counter_offset::UInt64=UInt64(0),
+) where {T <: AK.ALLOWED_RAND_SCALARS}
+    @inbounds for i in eachindex(x)
+        x[i] = AK.rand_scalar(rng.seed, rng.alg, counter_offset + UInt64(i - one(i)), T)
+    end
+    return x
+end
+
+
+function _assert_rand_matches_reference!(rng, x; kwargs...)
+    counter_offset = rng.offset
+    AK.rand!(rng, x; kwargs...)
+    ref = zeros(eltype(x), size(x))
+    _rand_fill_reference!(rng, ref; counter_offset)
+    @test Array(x) == ref
+    return x
+end
+
+
+@testset "rand" begin
+    @testset "constructors" begin
+        @test AK.CounterRNG(0x1; alg=AK.SplitMix64()) isa AK.CounterRNG{AK.SplitMix64}
+        @test AK.CounterRNG(UInt32(0x1); alg=AK.Philox()) isa AK.CounterRNG{AK.Philox}
+        @test AK.CounterRNG(UInt16(123); alg=AK.Threefry()) isa AK.CounterRNG{AK.Threefry}
+        @test AK.CounterRNG(UInt32(300)).seed == UInt64(300)
+        @test AK.CounterRNG(UInt32(300)).offset == UInt64(0)
+        @test AK.CounterRNG(0x1; offset=17).offset == UInt64(17)
+        @test AK.CounterRNG(0x1, AK.Philox()).offset == UInt64(0)
+        @test_throws ArgumentError AK.CounterRNG(-1)
+        @test_throws ArgumentError AK.CounterRNG(1; offset=-1)
+
+        Random.seed!(0x1234)
+        expected_seed = Random.rand(Random.default_rng(), UInt64)
+        Random.seed!(0x1234)
+        rng_auto = AK.CounterRNG()
+        @test rng_auto.seed == expected_seed
+        @test rng_auto.alg isa AK.Philox
+        @test rng_auto.offset == UInt64(0)
+
+        rng_auto_off = AK.CounterRNG(; offset=42)
+        @test rng_auto_off.offset == UInt64(42)
+
+        x1 = array_from_host(zeros(Float32, 1024))
+        x2 = array_from_host(zeros(Float32, 1024))
+        AK.rand!(rng_auto, x1; prefer_threads, block_size=64)
+        AK.rand!(rng_auto, x2; prefer_threads, block_size=257)
+        @test rng_auto.offset == UInt64(2048)
+        @test Array(x1) != Array(x2)
+    end
+
+
+    @testset "counter rng offset behavior" begin
+        rng_stream = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox(), offset=UInt64(17))
+        s1 = array_from_host(zeros(Float32, 100))
+        s2 = array_from_host(zeros(Float32, 100))
+        s12 = array_from_host(zeros(Float32, 200))
+        AK.rand!(rng_stream, s1; prefer_threads, block_size=64)
+        @test rng_stream.offset == UInt64(117)
+        AK.rand!(rng_stream, s2; prefer_threads, block_size=64)
+        @test rng_stream.offset == UInt64(217)
+
+        rng_once = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox(), offset=UInt64(17))
+        AK.rand!(rng_once, s12; prefer_threads, block_size=64)
+        @test vcat(Array(s1), Array(s2)) == Array(s12)
+        @test rng_once.offset == UInt64(217)
+
+        empty = array_from_host(zeros(Float32, 0))
+        stream_offset = rng_stream.offset
+        AK.rand!(rng_stream, empty; prefer_threads, block_size=64)
+        @test rng_stream.offset == stream_offset
+
+        @test AK.reset!(rng_stream) === rng_stream
+        @test rng_stream.offset == UInt64(0)
+
+        y1 = array_from_host(zeros(Float32, 64))
+        y2 = array_from_host(zeros(Float32, 64))
+        AK.rand!(rng_stream, y1; prefer_threads, block_size=64)
+        AK.rand!(AK.CounterRNG(UInt64(0x1234); alg=AK.Philox()), y2; prefer_threads, block_size=64)
+        @test Array(y1) == Array(y2)
+
+        mutable struct DummyRNG
+            seed::UInt64
+            alg::AK.Philox
+            offset::UInt64
+        end
+
+        rng_dummy = DummyRNG(UInt64(0x1234), AK.Philox(), UInt64(0))
+        x = array_from_host(zeros(Float32, 16))
+        @test_throws MethodError AK.rand!(rng_dummy, x; prefer_threads, block_size=64)
+        @test_throws MethodError AK.reset!(rng_dummy)
+    end
+
+
+    @testset "reset!" begin
+        rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox())
+        x1 = array_from_host(zeros(Float32, 512))
+        x2 = array_from_host(zeros(Float32, 512))
+
+        AK.rand!(rng, x1; prefer_threads, block_size=64)
+        @test rng.offset == UInt64(512)
+        @test AK.reset!(rng) === rng
+        @test rng.offset == UInt64(0)
+        AK.rand!(rng, x2; prefer_threads, block_size=64)
+
+        @test Array(x1) == Array(x2)
+    end
+
+
+    @testset "bit helpers" begin
+        hi = UInt32(0b10101010101010101010101010101010)
+        lo = UInt32(0b01010101010101010101010101010101)
+        word = UInt64(hi) << 32 | UInt64(lo)
+
+        @test AK._u32_hi(word) == hi
+        @test AK._u32_lo(word) == lo
+        @test AK._u64_from_u32s(lo, hi) == word
+        @test AK._mulhi_u32(0xffffffff % UInt32, 0xffffffff % UInt32) == 0xfffffffe % UInt32
+        @test AK._rotl32(0b10000000000000000000000000000001 % UInt32, UInt32(1)) == 0b11 % UInt32
+        @test AK._counter_from_index(1) == UInt64(0)
+        @test AK._counter_from_index(17) == UInt64(16)
+
+        @test AK._rand_scalar_uint_type(UInt8) === UInt32
+        @test AK._rand_scalar_uint_type(UInt16) === UInt32
+        @test AK._rand_scalar_uint_type(UInt32) === UInt32
+        @test AK._rand_scalar_uint_type(Int8) === UInt32
+        @test AK._rand_scalar_uint_type(Int16) === UInt32
+        @test AK._rand_scalar_uint_type(Int32) === UInt32
+        if RUN_FLOAT16_RAND_TESTS
+            @test AK._rand_scalar_uint_type(Float16) === UInt32
+        end
+        @test AK._rand_scalar_uint_type(Float32) === UInt32
+        @test AK._rand_scalar_uint_type(UInt64) === UInt64
+        @test AK._rand_scalar_uint_type(Int64) === UInt64
+        @test AK._rand_scalar_uint_type(Bool) === UInt32
+        if RUN_FLOAT64_RAND_TESTS
+            @test AK._rand_scalar_uint_type(Float64) === UInt64
+        end
+
+        @test AK._rand_scalar_from_uint(UInt8, UInt32(0xabcdef01)) == UInt8(0xab)
+        @test AK._rand_scalar_from_uint(UInt16, UInt32(0xabcdef01)) == UInt16(0xabcd)
+        @test AK._rand_scalar_from_uint(UInt32, 0b1010 % UInt32) == 0b1010 % UInt32
+        @test AK._rand_scalar_from_uint(UInt64, 0b1010 % UInt64) == 0b1010 % UInt64
+        @test AK._rand_scalar_from_uint(Int8, UInt32(0xff000000)) == Int8(-1)
+        @test AK._rand_scalar_from_uint(Int16, UInt32(0xffff0000)) == Int16(-1)
+        @test AK._rand_scalar_from_uint(Int32, 0b11111111111111111111111111111111 % UInt32) == Int32(-1)
+        @test AK._rand_scalar_from_uint(
+            Int64, 0b1111111111111111111111111111111111111111111111111111111111111111 % UInt64
+        ) == Int64(-1)
+        if RUN_FLOAT16_RAND_TESTS
+            @test AK._rand_scalar_from_uint(Float16, UInt32(0)) == Float16(0)
+        end
+        @test AK._rand_scalar_from_uint(Bool, UInt32(0)) == false
+        @test AK._rand_scalar_from_uint(Bool, UInt32(1)) == true
+
+        if RUN_FLOAT16_RAND_TESTS
+            @test AK.uint32_to_unit_float16(UInt32(0)) == Float16(0)
+            @test Float16(0) <= AK.uint32_to_unit_float16(typemax(UInt32)) < Float16(1)
+        end
+        @test AK.uint32_to_unit_float32(UInt32(0)) == 0.0f0
+        @test 0.0f0 <= AK.uint32_to_unit_float32(typemax(UInt32)) < 1.0f0
+        if RUN_FLOAT64_RAND_TESTS
+            @test AK.uint64_to_unit_float64(UInt64(0)) == 0.0
+            @test 0.0 <= AK.uint64_to_unit_float64(typemax(UInt64)) < 1.0
+        end
+    end
+
+
+    @testset "rand_uint" begin
+        for alg in RAND_ALGS
+            rng = AK.CounterRNG(0x123456789abcdef; alg)
+            for U in (UInt32, UInt64)
+                @test AK.rand_uint(rng.seed, rng.alg, UInt64(0), U) == AK.rand_uint(rng.seed, rng.alg, UInt64(0), U)
+                @test AK.rand_uint(rng.seed, rng.alg, UInt64(1), U) != AK.rand_uint(rng.seed, rng.alg, UInt64(0), U)
+
+                vals = [AK.rand_uint(rng.seed, rng.alg, UInt64(i), U) for i in 0:511]
+                @test length(unique(vals)) > 460
+            end
+        end
+
+        rng_splitmix = AK.CounterRNG(0x31415926; alg=AK.SplitMix64())
+        for c in (UInt64(0), UInt64(1), UInt64(17), UInt64(1023))
+            @test AK.rand_uint(rng_splitmix.seed, rng_splitmix.alg, c, UInt32) == AK._u32_hi(
+                AK.rand_uint(rng_splitmix.seed, rng_splitmix.alg, c, UInt64)
+            )
+        end
+
+        for alg in (AK.Philox(), AK.Threefry())
+            rng = AK.CounterRNG(0xabcdef1234567890; alg)
+            for c in (UInt64(0), UInt64(1), UInt64(17), UInt64(1023))
+                @test AK._u32_lo(AK.rand_uint(rng.seed, rng.alg, c, UInt64)) ==
+                      AK.rand_uint(rng.seed, rng.alg, c, UInt32)
+            end
+        end
+    end
+
+
+    @testset "rand_scalar" begin
+        rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox())
+
+        for T in RAND_SCALAR_TYPES_BACKEND
+            s0 = AK.rand_scalar(rng.seed, rng.alg, UInt64(0), T)
+            s1 = AK.rand_scalar(rng.seed, rng.alg, UInt64(1), T)
+            @test s0 isa T
+            @test s1 isa T
+            @test s0 == AK.rand_scalar(rng.seed, rng.alg, UInt64(0), T)
+            if !(T in (Bool, Float16, UInt8, UInt16, Int8, Int16))
+                @test s0 != s1
+            end
+            if T <: AbstractFloat
+                @test zero(T) <= s0 < one(T)
+                @test zero(T) <= s1 < one(T)
+            end
+        end
+
+        c = UInt64(42)
+        @test AK.rand_scalar(rng.seed, rng.alg, c, UInt8) ==
+              trunc(UInt8, AK.rand_uint(rng.seed, rng.alg, c, UInt32) >> 24)
+        @test AK.rand_scalar(rng.seed, rng.alg, c, UInt16) ==
+              trunc(UInt16, AK.rand_uint(rng.seed, rng.alg, c, UInt32) >> 16)
+        @test AK.rand_scalar(rng.seed, rng.alg, c, Int8) ==
+              reinterpret(Int8, trunc(UInt8, AK.rand_uint(rng.seed, rng.alg, c, UInt32) >> 24))
+        @test AK.rand_scalar(rng.seed, rng.alg, c, Int16) ==
+              reinterpret(Int16, trunc(UInt16, AK.rand_uint(rng.seed, rng.alg, c, UInt32) >> 16))
+        @test AK.rand_scalar(rng.seed, rng.alg, c, Int32) ==
+              reinterpret(Int32, AK.rand_uint(rng.seed, rng.alg, c, UInt32))
+        @test AK.rand_scalar(rng.seed, rng.alg, c, Int64) ==
+              reinterpret(Int64, AK.rand_uint(rng.seed, rng.alg, c, UInt64))
+        if RUN_FLOAT16_RAND_TESTS
+            @test AK.rand_scalar(rng.seed, rng.alg, c, Float16) == AK.uint32_to_unit_float16(
+                AK.rand_uint(rng.seed, rng.alg, c, UInt32)
+            )
+        end
+        @test AK.rand_scalar(rng.seed, rng.alg, c, Float32) == AK.uint32_to_unit_float32(
+            AK.rand_uint(rng.seed, rng.alg, c, UInt32)
+        )
+        @test AK.rand_scalar(rng.seed, rng.alg, c, Bool) ==
+              isodd(AK.rand_uint(rng.seed, rng.alg, c, UInt32))
+        if RUN_FLOAT64_RAND_TESTS
+            @test AK.rand_scalar(rng.seed, rng.alg, c, Float64) == AK.uint64_to_unit_float64(
+                AK.rand_uint(rng.seed, rng.alg, c, UInt64)
+            )
+        end
+        bools = [AK.rand_scalar(rng.seed, rng.alg, UInt64(i), Bool) for i in 0:511]
+        @test any(identity, bools)
+        @test any(!, bools)
+        @test_throws ArgumentError AK.rand_scalar(rng.seed, rng.alg, UInt64(0), UInt128)
+    end
+
+
+    @testset "rand! explicit rng" begin
+        lengths = (0, 1, 31, 32, 33, 257, 1024)
+        rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox())
+
+        for T in RAND_SCALAR_TYPES_BACKEND
+            for len in lengths
+                x = array_from_host(zeros(T, len))
+                _assert_rand_matches_reference!(rng, x; prefer_threads, block_size=64)
+                if T <: AbstractFloat
+                    @test _is_unit_interval(Array(x))
+                end
+            end
+        end
+
+        for T in RAND_SCALAR_TYPES_BACKEND
+            x1 = array_from_host(zeros(T, 2048))
+            x2 = array_from_host(zeros(T, 2048))
+            rng1 = AK.CounterRNG(rng.seed; alg=rng.alg)
+            rng2 = AK.CounterRNG(rng.seed; alg=rng.alg)
+            AK.rand!(rng1, x1; prefer_threads, block_size=64)
+            AK.rand!(rng2, x2; prefer_threads, block_size=257)
+            @test Array(x1) == Array(x2)
+        end
+
+        for T in RAND_SCALAR_TYPES_BACKEND
+            rng1 = AK.CounterRNG(rng.seed; alg=rng.alg)
+            rng2 = AK.CounterRNG(rng.seed + UInt64(1); alg=rng.alg)
+            x1 = array_from_host(zeros(T, 2048))
+            x2 = array_from_host(zeros(T, 2048))
+            AK.rand!(rng1, x1; prefer_threads, block_size=64)
+            AK.rand!(rng2, x2; prefer_threads, block_size=64)
+            @test Array(x1) != Array(x2)
+        end
+
+        begin
+            rng_stream = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox())
+            rng_once = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox())
+            x1 = array_from_host(zeros(Float32, 100))
+            x2 = array_from_host(zeros(Float32, 100))
+            x12 = array_from_host(zeros(Float32, 200))
+            AK.rand!(rng_stream, x1; prefer_threads, block_size=64)
+            AK.rand!(rng_stream, x2; prefer_threads, block_size=64)
+            AK.rand!(rng_once, x12; prefer_threads, block_size=64)
+            @test vcat(Array(x1), Array(x2)) == Array(x12)
+            @test rng_stream.offset == UInt64(200)
+        end
+
+        for T in (RUN_FLOAT16_RAND_TESTS ? (Float16, Float32, UInt64, Bool) : (Float32, UInt64, Bool))
+            xnd = array_from_host(zeros(T, 7, 11, 5))
+            _assert_rand_matches_reference!(rng, xnd; prefer_threads, block_size=128)
+        end
+
+        if IS_CPU_BACKEND
+            for T in RAND_SCALAR_TYPES_BACKEND
+                base = zeros(T, 64)
+                view_x = @view base[2:2:end]
+                AK.rand!(
+                    rng, view_x;
+                    max_tasks=Threads.nthreads(),
+                    min_elems=1,
+                    prefer_threads=true
+                )
+                ref_view = zeros(T, length(view_x))
+                _rand_fill_reference!(
+                    rng, ref_view;
+                    counter_offset=rng.offset - UInt64(length(view_x)),
+                )
+                @test collect(view_x) == ref_view
+            end
+        end
+    end
+
+
+    @testset "rand! convenience" begin
+        ref1 = array_from_host(zeros(Float32, 1024))
+        ref2 = array_from_host(zeros(Float32, 1024))
+        x1 = array_from_host(zeros(Float32, 1024))
+        x2 = array_from_host(zeros(Float32, 1024))
+
+        Random.seed!(0xabcdef)
+        seed1 = Random.rand(Random.default_rng(), UInt64)
+        AK.rand!(AK.CounterRNG(seed1; alg=AK.Philox()), ref1; prefer_threads, block_size=64)
+        seed2 = Random.rand(Random.default_rng(), UInt64)
+        AK.rand!(AK.CounterRNG(seed2; alg=AK.Philox()), ref2; prefer_threads, block_size=64)
+
+        Random.seed!(0xabcdef)
+        AK.rand!(x1; prefer_threads, block_size=64)
+        AK.rand!(x2; prefer_threads, block_size=64)
+        @test Array(x1) == Array(ref1)
+        @test Array(x2) == Array(ref2)
+
+        x_bad = zeros(UInt128, 16)
+        @test_throws ArgumentError AK.rand!(x_bad; prefer_threads)
+        @test_throws ArgumentError AK.rand!(AK.CounterRNG(0x1), x_bad; prefer_threads)
+    end
+
+
+    @testset "rand allocation convenience" begin
+        default_alloc_type = IS_CPU_BACKEND ? Float64 : Float32
+
+        rng = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox())
+        y = AK.rand(rng, BACKEND, Float32, Int32(6), UInt16(7); prefer_threads, block_size=64)
+        @test size(y) == (6, 7)
+        @test eltype(y) === Float32
+        @test _is_unit_interval(Array(y))
+        @test rng.offset == UInt64(length(y))
+
+        rng_default = AK.CounterRNG(UInt64(0x99); alg=AK.Philox())
+        rng_default_ref = AK.CounterRNG(UInt64(0x99); alg=AK.Philox())
+        y_default = AK.rand(rng_default, BACKEND, 128; prefer_threads, block_size=64)
+        y_default_ref = AK.rand(
+            rng_default_ref, BACKEND, default_alloc_type, 128; prefer_threads, block_size=64
+        )
+        @test eltype(y_default) === default_alloc_type
+        @test Array(y_default) == Array(y_default_ref)
+        @test rng_default.offset == rng_default_ref.offset == UInt64(128)
+
+        rng_alloc = AK.CounterRNG(UInt64(0x55); alg=AK.Philox())
+        rng_fill = AK.CounterRNG(UInt64(0x55); alg=AK.Philox())
+        y_alloc = AK.rand(rng_alloc, BACKEND, Float32, 128; prefer_threads, block_size=64)
+        y_fill = array_from_host(zeros(Float32, 128))
+        AK.rand!(rng_fill, y_fill; prefer_threads, block_size=64)
+        @test Array(y_alloc) == Array(y_fill)
+        @test rng_alloc.offset == rng_fill.offset == UInt64(128)
+
+        rng_cpu_default = AK.CounterRNG(UInt64(0x66); alg=AK.Philox())
+        rng_cpu_default_ref = AK.CounterRNG(UInt64(0x66); alg=AK.Philox())
+        y_cpu_default = AK.rand(rng_cpu_default, 128; prefer_threads, block_size=64)
+        y_cpu_default_ref = AK.rand(
+            rng_cpu_default_ref, AK.get_backend([]), 128; prefer_threads, block_size=64
+        )
+        @test eltype(y_cpu_default) === Float64
+        @test Array(y_cpu_default) == Array(y_cpu_default_ref)
+        @test rng_cpu_default.offset == rng_cpu_default_ref.offset == UInt64(128)
+
+        rng_cpu_typed = AK.CounterRNG(UInt64(0x77); alg=AK.Philox())
+        rng_cpu_typed_ref = AK.CounterRNG(UInt64(0x77); alg=AK.Philox())
+        y_cpu_typed = AK.rand(rng_cpu_typed, Float32, 128; prefer_threads, block_size=64)
+        y_cpu_typed_ref = AK.rand(
+            rng_cpu_typed_ref, AK.get_backend([]), Float32, 128; prefer_threads, block_size=64
+        )
+        @test eltype(y_cpu_typed) === Float32
+        @test Array(y_cpu_typed) == Array(y_cpu_typed_ref)
+        @test rng_cpu_typed.offset == rng_cpu_typed_ref.offset == UInt64(128)
+
+        # Warm-up first call path so one-time compilation/backend init does not perturb RNG checks.
+        AK.rand(BACKEND, Float32, 1; prefer_threads, block_size=64)
+
+        # Auto-seeded constructor should match explicit seed capture from default RNG.
+        Random.seed!(0x9abc)
+        seed = Random.rand(Random.default_rng(), UInt64)
+        ref = AK.rand(AK.CounterRNG(seed; alg=AK.Philox()), BACKEND, Float32, 64; prefer_threads, block_size=64)
+        Random.seed!(0x9abc)
+        x = AK.rand(BACKEND, Float32, 64; prefer_threads, block_size=64)
+        @test Array(x) == Array(ref)
+
+        # Auto-seeded convenience without explicit type should use backend-dependent default type.
+        Random.seed!(0x4242)
+        seed_default = Random.rand(Random.default_rng(), UInt64)
+        ref_default = AK.rand(
+            AK.CounterRNG(seed_default; alg=AK.Philox()),
+            BACKEND,
+            default_alloc_type,
+            64;
+            prefer_threads,
+            block_size=64,
+        )
+        Random.seed!(0x4242)
+        x_default = AK.rand(BACKEND, 64; prefer_threads, block_size=64)
+        @test eltype(x_default) === default_alloc_type
+        @test Array(x_default) == Array(ref_default)
+
+        # Convenience without backend should default to CPU backend and Float64.
+        Random.seed!(0x4545)
+        seed_cpu_default = Random.rand(Random.default_rng(), UInt64)
+        ref_cpu_default = AK.rand(
+            AK.CounterRNG(seed_cpu_default; alg=AK.Philox()),
+            AK.get_backend([]),
+            Float64,
+            64;
+            prefer_threads,
+            block_size=64,
+        )
+        Random.seed!(0x4545)
+        x_cpu_default = AK.rand(64; prefer_threads, block_size=64)
+        @test eltype(x_cpu_default) === Float64
+        @test Array(x_cpu_default) == Array(ref_cpu_default)
+
+        # Type-only convenience should default to CPU backend.
+        Random.seed!(0x5656)
+        seed_cpu_typed = Random.rand(Random.default_rng(), UInt64)
+        ref_cpu_typed = AK.rand(
+            AK.CounterRNG(seed_cpu_typed; alg=AK.Philox()),
+            AK.get_backend([]),
+            Float32,
+            64;
+            prefer_threads,
+            block_size=64,
+        )
+        Random.seed!(0x5656)
+        x_cpu_typed_no_rng = AK.rand(Float32, 64; prefer_threads, block_size=64)
+        @test eltype(x_cpu_typed_no_rng) === Float32
+        @test Array(x_cpu_typed_no_rng) == Array(ref_cpu_typed)
+
+        # Reseeding should reproduce the same auto-seeded draw.
+        Random.seed!(0x7777)
+        x1 = AK.rand(BACKEND, Float32, 64; prefer_threads, block_size=64)
+        Random.seed!(0x7777)
+        x2 = AK.rand(BACKEND, Float32, 64; prefer_threads, block_size=64)
+        @test Array(x1) == Array(x2)
+
+        @test_throws ArgumentError AK.rand(AK.CounterRNG(0x1), BACKEND, UInt128, 16; prefer_threads)
+        @test_throws MethodError AK.rand(AK.CounterRNG(0x1), BACKEND, Float32, 16; prefer_threads, bad=:kwarg)
+        @test_throws MethodError AK.rand(BACKEND, Float32, 16; prefer_threads, bad=:kwarg)
+        @test_throws MethodError AK.rand(BACKEND, 16; prefer_threads, bad=:kwarg)
+        @test_throws MethodError AK.rand(16; prefer_threads, bad=:kwarg)
+        @test_throws ArgumentError AK.rand()
+    end
+end
diff --git a/test/randn.jl b/test/randn.jl
new file mode 100644
index 0000000..2d7b5ff
--- /dev/null
+++ b/test/randn.jl
@@ -0,0 +1,353 @@
+const RANDN_ALGS = (AK.SplitMix64(), AK.Philox(), AK.Threefry())
+const RANDN_FLOAT_TYPES_BACKEND = IS_CPU_BACKEND ? (Float16, Float32, Float64) : (Float32,)
+const RANDN_LENGTHS = (0, 1, 2, 31, 32, 33, 257, 1024)
+
+
+_all_finite(v) = all(isfinite, v)
+_randn_reference_atol(::Type{Float16}) = 16 * eps(Float16)
+_randn_reference_atol(::Type{Float32}) = 64 * eps(Float32)
+_randn_reference_atol(::Type{Float64}) = 64 * eps(Float64)
+
+
+function _randn_fill_reference!(
+    rng,
+    x::AbstractArray{T};
+    counter_offset::UInt64=UInt64(0),
+) where {T <: AK.ALLOWED_RANDN_SCALARS}
+    @inbounds for i in eachindex(x)
+        x[i] = AK.randn_scalar(rng.seed, rng.alg, counter_offset + UInt64(i - one(i)), T)
+    end
+    return x
+end
+
+
+function _assert_randn_matches_reference!(rng, x; kwargs...)
+    counter_offset = rng.offset
+    AK.randn!(rng, x; kwargs...)
+    ref = zeros(eltype(x), size(x))
+    _randn_fill_reference!(rng, ref; counter_offset)
+    xa = Array(x)
+
+    if IS_CPU_BACKEND
+        @test xa == ref
+    else
+        # randn uses Box-Muller (`log`, `sqrt`, `sincos`), and GPU libm implementations are not
+        # bit-identical to CPU scalar libm. Stream/counter mapping is still deterministic, but the
+        # final Float32 values can differ by a few ULP, so we use a tight absolute tolerance here.
+        atol = _randn_reference_atol(eltype(xa))
+        @test all(isapprox.(xa, ref; rtol=zero(atol), atol))
+    end
+
+    return x
+end
+
+
+@testset "randn" begin
+    @testset "scalar helpers" begin
+        @test 0.0f0 < AK._uint32_to_open_unit_float32_midpoint(UInt32(0)) < 1.0f0
+        @test 0.0f0 < AK._uint32_to_open_unit_float32_midpoint(typemax(UInt32)) < 1.0f0
+
+        if IS_CPU_BACKEND
+            @test 0.0 < AK._uint64_to_open_unit_float64_midpoint(UInt64(0)) < 1.0
+            @test 0.0 < AK._uint64_to_open_unit_float64_midpoint(typemax(UInt64)) < 1.0
+        end
+
+        seed = UInt64(0x123456789abcdef)
+        for alg in RANDN_ALGS
+            for counter in (UInt64(0), UInt64(1), UInt64(17), UInt64(1023))
+                u32 = AK.rand_float_open01(seed, alg, counter, Float32)
+                @test 0.0f0 < u32 < 1.0f0
+
+                if IS_CPU_BACKEND
+                    u64 = AK.rand_float_open01(seed, alg, counter, Float64)
+                    @test 0.0 < u64 < 1.0
+                end
+            end
+
+            for T in RANDN_FLOAT_TYPES_BACKEND
+                s0 = AK.randn_scalar(seed, alg, UInt64(42), T)
+                s1 = AK.randn_scalar(seed, alg, UInt64(43), T)
+
+                @test s0 isa T
+                @test s1 isa T
+                @test isfinite(s0)
+                @test isfinite(s1)
+                @test s0 == AK.randn_scalar(seed, alg, UInt64(42), T)
+                @test s1 == AK.randn_scalar(seed, alg, UInt64(43), T)
+
+                p0, p1 = AK.randn_pair(seed, alg, UInt64(21), T)
+                @test p0 == AK.randn_scalar(seed, alg, UInt64(42), T)
+                @test p1 == AK.randn_scalar(seed, alg, UInt64(43), T)
+            end
+        end
+
+        @test_throws ArgumentError AK.rand_float_open01(seed, AK.Philox(), UInt64(0), UInt32)
+        @test_throws ArgumentError AK.randn_scalar(seed, AK.Philox(), UInt64(0), UInt32)
+    end
+
+
+    @testset "randn! explicit rng" begin
+        for alg in RANDN_ALGS
+            rng = AK.CounterRNG(0x123456789abcdef; alg)
+
+            for T in RANDN_FLOAT_TYPES_BACKEND
+                x1 = array_from_host(zeros(T, 2048))
+                x2 = array_from_host(zeros(T, 2048))
+                rng1 = AK.CounterRNG(rng.seed; alg=rng.alg)
+                rng2 = AK.CounterRNG(rng.seed; alg=rng.alg)
+
+                AK.randn!(rng1, x1; prefer_threads, block_size=64)
+                AK.randn!(rng2, x2; prefer_threads, block_size=257)
+                @test Array(x1) == Array(x2)
+            end
+
+            for T in RANDN_FLOAT_TYPES_BACKEND
+                x1 = array_from_host(zeros(T, 2048))
+                x2 = array_from_host(zeros(T, 2048))
+                rng1 = AK.CounterRNG(rng.seed; alg=rng.alg)
+                rng2 = AK.CounterRNG(rng.seed + UInt64(1); alg=rng.alg)
+
+                AK.randn!(rng1, x1; prefer_threads, block_size=64)
+                AK.randn!(rng2, x2; prefer_threads, block_size=64)
+                @test Array(x1) != Array(x2)
+            end
+        end
+    end
+
+
+    @testset "offset and reset semantics" begin
+        rng_stream = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox(), offset=UInt64(17))
+        s1 = array_from_host(zeros(Float32, 99))
+        s2 = array_from_host(zeros(Float32, 101))
+        s12 = array_from_host(zeros(Float32, 200))
+
+        AK.randn!(rng_stream, s1; prefer_threads, block_size=64)
+        @test rng_stream.offset == UInt64(116)
+        AK.randn!(rng_stream, s2; prefer_threads, block_size=64)
+        @test rng_stream.offset == UInt64(217)
+
+        rng_once = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox(), offset=UInt64(17))
+        AK.randn!(rng_once, s12; prefer_threads, block_size=64)
+        @test vcat(Array(s1), Array(s2)) == Array(s12)
+        @test rng_once.offset == UInt64(217)
+
+        empty = array_from_host(zeros(Float32, 0))
+        stream_offset = rng_stream.offset
+        AK.randn!(rng_stream, empty; prefer_threads, block_size=64)
+        @test rng_stream.offset == stream_offset
+
+        @test AK.reset!(rng_stream) === rng_stream
+        @test rng_stream.offset == UInt64(0)
+
+        y1 = array_from_host(zeros(Float32, 64))
+        y2 = array_from_host(zeros(Float32, 64))
+        AK.randn!(rng_stream, y1; prefer_threads, block_size=64)
+        AK.randn!(AK.CounterRNG(UInt64(0x1234); alg=AK.Philox()), y2; prefer_threads, block_size=64)
+        @test Array(y1) == Array(y2)
+    end
+
+
+    @testset "shapes and views" begin
+        rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox())
+
+        for T in RANDN_FLOAT_TYPES_BACKEND
+            xnd = array_from_host(zeros(T, 7, 11, 5))
+            _assert_randn_matches_reference!(rng, xnd; prefer_threads, block_size=128)
+        end
+
+        if IS_CPU_BACKEND
+            for T in RANDN_FLOAT_TYPES_BACKEND
+                base = zeros(T, 64)
+                view_x = @view base[2:2:end]
+
+                AK.randn!(
+                    rng, view_x;
+                    max_tasks=Threads.nthreads(),
+                    min_elems=1,
+                    prefer_threads=true,
+                )
+
+                ref_view = zeros(T, length(view_x))
+                _randn_fill_reference!(
+                    rng, ref_view;
+                    counter_offset=rng.offset - UInt64(length(view_x)),
+                )
+                @test collect(view_x) == ref_view
+            end
+        end
+    end
+
+
+    @testset "randn! convenience" begin
+        ref1 = array_from_host(zeros(Float32, 1024))
+        ref2 = array_from_host(zeros(Float32, 1024))
+        x1 = array_from_host(zeros(Float32, 1024))
+        x2 = array_from_host(zeros(Float32, 1024))
+
+        Random.seed!(0xabcdef)
+        seed1 = Random.rand(Random.default_rng(), UInt64)
+        AK.randn!(AK.CounterRNG(seed1; alg=AK.Philox()), ref1; prefer_threads, block_size=64)
+        seed2 = Random.rand(Random.default_rng(), UInt64)
+        AK.randn!(AK.CounterRNG(seed2; alg=AK.Philox()), ref2; prefer_threads, block_size=64)
+
+        Random.seed!(0xabcdef)
+        AK.randn!(x1; prefer_threads, block_size=64)
+        AK.randn!(x2; prefer_threads, block_size=64)
+        @test Array(x1) == Array(ref1)
+        @test Array(x2) == Array(ref2)
+
+        x_bad = zeros(UInt32, 16)
+        @test_throws ArgumentError AK.randn!(x_bad; prefer_threads)
+        @test_throws ArgumentError AK.randn!(AK.CounterRNG(0x1), x_bad; prefer_threads)
+    end
+
+
+    @testset "randn allocation convenience" begin
+        default_alloc_type = IS_CPU_BACKEND ? Float64 : Float32
+
+        rng = AK.CounterRNG(UInt64(0x1234); alg=AK.Philox())
+        y = AK.randn(rng, BACKEND, Float32, Int32(6), UInt16(7); prefer_threads, block_size=64)
+        @test size(y) == (6, 7)
+        @test eltype(y) === Float32
+        @test _all_finite(Array(y))
+        @test rng.offset == UInt64(length(y))
+
+        rng_default = AK.CounterRNG(UInt64(0x99); alg=AK.Philox())
+        rng_default_ref = AK.CounterRNG(UInt64(0x99); alg=AK.Philox())
+        y_default = AK.randn(rng_default, BACKEND, 128; prefer_threads, block_size=64)
+        y_default_ref = AK.randn(
+            rng_default_ref, BACKEND, default_alloc_type, 128; prefer_threads, block_size=64
+        )
+        @test eltype(y_default) === default_alloc_type
+        @test Array(y_default) == Array(y_default_ref)
+        @test rng_default.offset == rng_default_ref.offset == UInt64(128)
+
+        rng_alloc = AK.CounterRNG(UInt64(0x55); alg=AK.Philox())
+        rng_fill = AK.CounterRNG(UInt64(0x55); alg=AK.Philox())
+        y_alloc = AK.randn(rng_alloc, BACKEND, Float32, 128; prefer_threads, block_size=64)
+        y_fill = array_from_host(zeros(Float32, 128))
+        AK.randn!(rng_fill, y_fill; prefer_threads, block_size=64)
+        @test Array(y_alloc) == Array(y_fill)
+        @test rng_alloc.offset == rng_fill.offset == UInt64(128)
+
+        rng_cpu_default = AK.CounterRNG(UInt64(0x66); alg=AK.Philox())
+        rng_cpu_default_ref = AK.CounterRNG(UInt64(0x66); alg=AK.Philox())
+        y_cpu_default = AK.randn(rng_cpu_default, 128; prefer_threads, block_size=64)
+        y_cpu_default_ref = AK.randn(
+            rng_cpu_default_ref, AK.get_backend([]), 128; prefer_threads, block_size=64
+        )
+        @test eltype(y_cpu_default) === Float64
+        @test Array(y_cpu_default) == Array(y_cpu_default_ref)
+        @test rng_cpu_default.offset == rng_cpu_default_ref.offset == UInt64(128)
+
+        rng_cpu_typed = AK.CounterRNG(UInt64(0x77); alg=AK.Philox())
+        rng_cpu_typed_ref = AK.CounterRNG(UInt64(0x77); alg=AK.Philox())
+        y_cpu_typed = AK.randn(rng_cpu_typed, Float32, 128; prefer_threads, block_size=64)
+        y_cpu_typed_ref = AK.randn(
+            rng_cpu_typed_ref, AK.get_backend([]), Float32, 128; prefer_threads, block_size=64
+        )
+        @test eltype(y_cpu_typed) === Float32
+        @test Array(y_cpu_typed) == Array(y_cpu_typed_ref)
+        @test rng_cpu_typed.offset == rng_cpu_typed_ref.offset == UInt64(128)
+
+        # Warm-up first call path so one-time compilation/backend init does not perturb RNG checks.
+        AK.randn(BACKEND, Float32, 1; prefer_threads, block_size=64)
+
+        # Auto-seeded constructor should match explicit seed capture from default RNG.
+        Random.seed!(0x9abc)
+        seed = Random.rand(Random.default_rng(), UInt64)
+        ref = AK.randn(AK.CounterRNG(
+            seed; alg=AK.Philox()), BACKEND, Float32, 64; prefer_threads, block_size=64
+        )
+        Random.seed!(0x9abc)
+        x = AK.randn(BACKEND, Float32, 64; prefer_threads, block_size=64)
+        @test Array(x) == Array(ref)
+
+        # Auto-seeded convenience without explicit type should use backend-dependent default type.
+        Random.seed!(0x4242)
+        seed_default = Random.rand(Random.default_rng(), UInt64)
+        ref_default = AK.randn(
+            AK.CounterRNG(seed_default; alg=AK.Philox()),
+            BACKEND,
+            default_alloc_type,
+            64;
+            prefer_threads,
+            block_size=64,
+        )
+        Random.seed!(0x4242)
+        x_default = AK.randn(BACKEND, 64; prefer_threads, block_size=64)
+        @test eltype(x_default) === default_alloc_type
+        @test Array(x_default) == Array(ref_default)
+
+        # Convenience without backend should default to CPU backend and Float64.
+        Random.seed!(0x4545)
+        seed_cpu_default = Random.rand(Random.default_rng(), UInt64)
+        ref_cpu_default = AK.randn(
+            AK.CounterRNG(seed_cpu_default; alg=AK.Philox()),
+            AK.get_backend([]),
+            Float64,
+            64;
+            prefer_threads,
+            block_size=64,
+        )
+        Random.seed!(0x4545)
+        x_cpu_default = AK.randn(64; prefer_threads, block_size=64)
+        @test eltype(x_cpu_default) === Float64
+        @test Array(x_cpu_default) == Array(ref_cpu_default)
+
+        # Type-only convenience should default to CPU backend.
+        Random.seed!(0x5656)
+        seed_cpu_typed = Random.rand(Random.default_rng(), UInt64)
+        ref_cpu_typed = AK.randn(
+            AK.CounterRNG(seed_cpu_typed; alg=AK.Philox()),
+            AK.get_backend([]),
+            Float32,
+            64;
+            prefer_threads,
+            block_size=64,
+        )
+        Random.seed!(0x5656)
+        x_cpu_typed_no_rng = AK.randn(Float32, 64; prefer_threads, block_size=64)
+        @test eltype(x_cpu_typed_no_rng) === Float32
+        @test Array(x_cpu_typed_no_rng) == Array(ref_cpu_typed)
+
+        # Reseeding should reproduce the same auto-seeded draw.
+        Random.seed!(0x7777)
+        x1 = AK.randn(BACKEND, Float32, 64; prefer_threads, block_size=64)
+        Random.seed!(0x7777)
+        x2 = AK.randn(BACKEND, Float32, 64; prefer_threads, block_size=64)
+        @test Array(x1) == Array(x2)
+
+        @test_throws ArgumentError AK.randn(AK.CounterRNG(0x1), BACKEND, UInt32, 16; prefer_threads)
+        @test_throws MethodError AK.randn(
+            AK.CounterRNG(0x1), BACKEND, Float32, 16; prefer_threads, bad=:kwarg
+        )
+        @test_throws MethodError AK.randn(BACKEND, Float32, 16; prefer_threads, bad=:kwarg)
+        @test_throws MethodError AK.randn(BACKEND, 16; prefer_threads, bad=:kwarg)
+        @test_throws MethodError AK.randn(16; prefer_threads, bad=:kwarg)
+        @test_throws ArgumentError AK.randn()
+    end
+
+
+    @testset "moments sanity" begin
+        n = 200_000
+        rng = AK.CounterRNG(0x123456789abcdef; alg=AK.Philox())
+
+        for T in RANDN_FLOAT_TYPES_BACKEND
+            x = array_from_host(zeros(T, n))
+            AK.randn!(rng, x; prefer_threads, block_size=128)
+            xa = Float64.(Array(x))
+
+            m = sum(xa) / length(xa)
+            v = sum((xi - m)^2 for xi in xa) / length(xa)
+
+            if T === Float16
+                @test abs(m) < 0.1
+                @test abs(v - one(v)) < 0.15
+            else
+                @test abs(m) < 0.01
+                @test abs(v - one(v)) < 0.03
+            end
+        end
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 716fd8e..707afee 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -69,6 +69,8 @@ end
 include("partition.jl")
 include("looping.jl")
 include("map.jl")
+include("rand.jl")
+include("randn.jl")
 include("sort.jl")
 include("reduce.jl")
 include("accumulate.jl")