From 828e0e152869d36332cf7f9681e090516896d8bf Mon Sep 17 00:00:00 2001
From: Davide Angelocola <davide.angelocola@gmail.com>
Date: Mon, 15 Jun 2026 22:54:00 +0200
Subject: [PATCH 1/2] feat(reader): lazy FastLanes RLE decode via top-level
 records
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the eager primitive RLE expansion with four top-level record
types in reader.array: LazyRleLongArray, LazyRleIntArray, LazyRleShortArray,
LazyRleByteArray. Each holds (dtype, length, values, indices,
valuesIdxOffsets, firstOffset, valuesLen, numChunks, offset) and resolves
getXxx(i) via chunkIdx = absRow >> 10 + table lookups instead of allocating
an n-sized output buffer. forEachXxx walks chunks one at a time; the
constant-run fast path (numChunkValues <= 1) emits the single value with
a tight inner loop.

RleArrays (package-private) holds FL_CHUNK_SIZE=1024, FL_LOG2, FL_MASK
constants plus the chunkValueCount helper used by all four records.

Bool and VarBin RLE paths stay eager (RleEncodingDecoder doesn't decode
them in the first place; accepts only !isFloating primitive ptypes —
unchanged). Floating-point also rejected per existing accepts().

Drive-by fix in SparseEncodingDecoder: when numPatches == 0, decode
zero-length children instead of null so the record's getXxx /
patchValues.length() path doesn't NPE. Round-trip tests previously caught
this (decode_noPatches_returnsFillValue) — they now pass.

Drive-by fix: drop dead java.nio.ByteOrder import from
SparseEncodingDecoder (leftover from the lazy-Sparse PR).

Test helper sweep: convert ArraySegments.of(result).get(LE_XXX, off) calls
in RleEncodingEncoderTest and SparseEncodingEncoderTest to typed
LongArray.getLong / IntArray.getInt / DoubleArray.getDouble / etc.
accessors. The segment-extract path doesn't work on lazy outputs and the
typed accessor is the right API anyway.

6 new unit tests in LazyRleArrayTest cover the constant-run fast path,
per-row lookup with explicit indices, multi-chunk boundary, offset
slicing, fold reduction, and int-typed dispatch. ./mvnw verify green
(13 modules, integration suite 45s).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../vortex/reader/array/LazyRleByteArray.java |  88 ++++++++++++
 .../vortex/reader/array/LazyRleIntArray.java  |  80 +++++++++++
 .../vortex/reader/array/LazyRleLongArray.java |  90 ++++++++++++
 .../reader/array/LazyRleShortArray.java       |  88 ++++++++++++
 .../dfa1/vortex/reader/array/RleArrays.java   |  42 ++++++
 .../reader/decode/RleEncodingDecoder.java     | 132 +++++++++---------
 .../reader/decode/SparseEncodingDecoder.java  |  11 +-
 .../vortex/reader/array/LazyRleArrayTest.java | 123 ++++++++++++++++
 .../writer/encode/RleEncodingEncoderTest.java |  22 ++-
 .../encode/SparseEncodingEncoderTest.java     |  22 ++-
 10 files changed, 600 insertions(+), 98 deletions(-)
 create mode 100644 reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleByteArray.java
 create mode 100644 reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleIntArray.java
 create mode 100644 reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleLongArray.java
 create mode 100644 reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleShortArray.java
 create mode 100644 reader/src/main/java/io/github/dfa1/vortex/reader/array/RleArrays.java
 create mode 100644 reader/src/test/java/io/github/dfa1/vortex/reader/array/LazyRleArrayTest.java

diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleByteArray.java b/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleByteArray.java
new file mode 100644
index 00000000..b952e892
--- /dev/null
+++ b/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleByteArray.java
@@ -0,0 +1,88 @@
+package io.github.dfa1.vortex.reader.array;
+
+import io.github.dfa1.vortex.core.DType;
+
+import java.util.function.LongBinaryOperator;
+
+/// Lazy FastLanes-RLE-encoded {@link ByteArray}. See {@link LazyRleLongArray} for semantics.
+///
+/// @param dtype             logical element type
+/// @param length            total logical row count
+/// @param values            concatenated distinct values per chunk
+/// @param indices           per-row local index table
+/// @param valuesIdxOffsets  per-chunk values-pool start offsets
+/// @param firstOffset       absolute origin of the values pool
+/// @param valuesLen         total values pool length
+/// @param numChunks         number of FastLanes chunks covered
+/// @param offset            starting absolute position
+/// @param unsigned          {@code true} when the dtype is U8 (affects {@link #getInt(long)} widening)
+public record LazyRleByteArray(
+        DType dtype, long length, byte[] values, int[] indices,
+        long[] valuesIdxOffsets, long firstOffset, long valuesLen,
+        int numChunks, int offset, boolean unsigned)
+        implements ByteArray {
+
+    private byte lookup(long i) {
+        int absRow = (int) (i + offset);
+        int chunkIdx = absRow >>> RleArrays.FL_LOG2;
+        int rowInChunk = absRow & RleArrays.FL_MASK;
+        long valueIdxOffset = valuesIdxOffsets[chunkIdx] - firstOffset;
+        int numChunkValues = RleArrays.chunkValueCount(chunkIdx, numChunks, valuesIdxOffsets, firstOffset, valuesLen);
+        if (numChunkValues <= 1) {
+            return numChunkValues == 1 ? values[(int) valueIdxOffset] : (byte) 0;
+        }
+        int localIdx = indices[chunkIdx * RleArrays.FL_CHUNK_SIZE + rowInChunk];
+        if (localIdx >= numChunkValues) {
+            localIdx = numChunkValues - 1;
+        }
+        return values[(int) valueIdxOffset + localIdx];
+    }
+
+    @Override
+    public byte getByte(long i) {
+        return lookup(i);
+    }
+
+    @Override
+    public int getInt(long i) {
+        byte v = lookup(i);
+        return unsigned ? Byte.toUnsignedInt(v) : v;
+    }
+
+    @Override
+    public long fold(long identity, LongBinaryOperator op) {
+        long acc = identity;
+        long n = length;
+        long emitted = 0;
+        int absRow = offset;
+        int startChunk = absRow >>> RleArrays.FL_LOG2;
+        for (int chunkIdx = startChunk; chunkIdx < numChunks && emitted < n; chunkIdx++) {
+            int chunkBase = chunkIdx * RleArrays.FL_CHUNK_SIZE;
+            int rowInChunk = absRow - chunkBase;
+            long valueIdxOffset = valuesIdxOffsets[chunkIdx] - firstOffset;
+            int numChunkValues = RleArrays.chunkValueCount(chunkIdx, numChunks, valuesIdxOffsets, firstOffset, valuesLen);
+            int end = Math.min(RleArrays.FL_CHUNK_SIZE, rowInChunk + (int) (n - emitted));
+            if (numChunkValues <= 1) {
+                byte v = numChunkValues == 1 ? values[(int) valueIdxOffset] : (byte) 0;
+                long widened = unsigned ? Byte.toUnsignedInt(v) : v;
+                for (int r = rowInChunk; r < end; r++) {
+                    acc = op.applyAsLong(acc, widened);
+                }
+            } else {
+                for (int r = rowInChunk; r < end; r++) {
+                    int localIdx = indices[chunkBase + r];
+                    if (localIdx >= numChunkValues) {
+                        localIdx = numChunkValues - 1;
+                    }
+                    byte v = values[(int) valueIdxOffset + localIdx];
+                    long widened = unsigned ? Byte.toUnsignedInt(v) : v;
+                    acc = op.applyAsLong(acc, widened);
+                }
+            }
+            int count = end - rowInChunk;
+            emitted += count;
+            absRow += count;
+        }
+        return acc;
+    }
+}
diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleIntArray.java b/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleIntArray.java
new file mode 100644
index 00000000..b43dc8b9
--- /dev/null
+++ b/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleIntArray.java
@@ -0,0 +1,80 @@
+package io.github.dfa1.vortex.reader.array;
+
+import io.github.dfa1.vortex.core.DType;
+
+import java.util.function.IntBinaryOperator;
+import java.util.function.IntConsumer;
+
+/// Lazy FastLanes-RLE-encoded {@link IntArray}. See {@link LazyRleLongArray} for semantics.
+///
+/// @param dtype             logical element type
+/// @param length            total logical row count
+/// @param values            concatenated distinct values per chunk
+/// @param indices           per-row local index table
+/// @param valuesIdxOffsets  per-chunk values-pool start offsets
+/// @param firstOffset       absolute origin of the values pool
+/// @param valuesLen         total values pool length
+/// @param numChunks         number of FastLanes chunks covered
+/// @param offset            starting absolute position
+public record LazyRleIntArray(
+        DType dtype, long length, int[] values, int[] indices,
+        long[] valuesIdxOffsets, long firstOffset, long valuesLen,
+        int numChunks, int offset)
+        implements IntArray {
+
+    @Override
+    public int getInt(long i) {
+        int absRow = (int) (i + offset);
+        int chunkIdx = absRow >>> RleArrays.FL_LOG2;
+        int rowInChunk = absRow & RleArrays.FL_MASK;
+        long valueIdxOffset = valuesIdxOffsets[chunkIdx] - firstOffset;
+        int numChunkValues = RleArrays.chunkValueCount(chunkIdx, numChunks, valuesIdxOffsets, firstOffset, valuesLen);
+        if (numChunkValues <= 1) {
+            return numChunkValues == 1 ? values[(int) valueIdxOffset] : 0;
+        }
+        int localIdx = indices[chunkIdx * RleArrays.FL_CHUNK_SIZE + rowInChunk];
+        if (localIdx >= numChunkValues) {
+            localIdx = numChunkValues - 1;
+        }
+        return values[(int) valueIdxOffset + localIdx];
+    }
+
+    @Override
+    public void forEachInt(IntConsumer c) {
+        long n = length;
+        long emitted = 0;
+        int absRow = offset;
+        int startChunk = absRow >>> RleArrays.FL_LOG2;
+        for (int chunkIdx = startChunk; chunkIdx < numChunks && emitted < n; chunkIdx++) {
+            int chunkBase = chunkIdx * RleArrays.FL_CHUNK_SIZE;
+            int rowInChunk = absRow - chunkBase;
+            long valueIdxOffset = valuesIdxOffsets[chunkIdx] - firstOffset;
+            int numChunkValues = RleArrays.chunkValueCount(chunkIdx, numChunks, valuesIdxOffsets, firstOffset, valuesLen);
+            int end = Math.min(RleArrays.FL_CHUNK_SIZE, rowInChunk + (int) (n - emitted));
+            if (numChunkValues <= 1) {
+                int v = numChunkValues == 1 ? values[(int) valueIdxOffset] : 0;
+                for (int r = rowInChunk; r < end; r++) {
+                    c.accept(v);
+                }
+            } else {
+                for (int r = rowInChunk; r < end; r++) {
+                    int localIdx = indices[chunkBase + r];
+                    if (localIdx >= numChunkValues) {
+                        localIdx = numChunkValues - 1;
+                    }
+                    c.accept(values[(int) valueIdxOffset + localIdx]);
+                }
+            }
+            int count = end - rowInChunk;
+            emitted += count;
+            absRow += count;
+        }
+    }
+
+    @Override
+    public int fold(int identity, IntBinaryOperator op) {
+        int[] acc = {identity};
+        forEachInt(v -> acc[0] = op.applyAsInt(acc[0], v));
+        return acc[0];
+    }
+}
diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleLongArray.java b/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleLongArray.java
new file mode 100644
index 00000000..d08f1106
--- /dev/null
+++ b/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleLongArray.java
@@ -0,0 +1,90 @@
+package io.github.dfa1.vortex.reader.array;
+
+import io.github.dfa1.vortex.core.DType;
+
+import java.util.function.LongBinaryOperator;
+import java.util.function.LongConsumer;
+
+/// Lazy FastLanes-RLE-encoded {@link LongArray}.
+///
+/// FastLanes RLE encodes 1024-row chunks of values + indices. For each chunk the
+/// distinct values are concatenated into {@code values}; the per-row local index
+/// into the chunk's value range lives in {@code indices}; {@code valuesIdxOffsets}
+/// gives the chunk-start offset into the global values pool. {@code getLong(i)}
+/// resolves {@code values[valuesIdxOffsets[chunkIdx(i)] + clampedLocalIdx(...)]}.
+///
+/// {@code forEachLong} / {@code fold} iterate chunk-by-chunk so the constant-run
+/// fast path (`numChunkValues <= 1`) emits each value once with a tight inner loop.
+///
+/// @param dtype             logical element type
+/// @param length            total logical row count
+/// @param values            concatenated distinct values per chunk
+/// @param indices           per-row local index table; length {@code numChunks * 1024}
+/// @param valuesIdxOffsets  per-chunk values-pool start offsets; length {@code numChunks}
+/// @param firstOffset       absolute origin of the values pool
+/// @param valuesLen         total values pool length
+/// @param numChunks         number of FastLanes chunks covered
+/// @param offset            starting absolute position; logical row {@code i} maps to
+///                          absolute {@code i + offset}
+public record LazyRleLongArray(
+        DType dtype, long length, long[] values, int[] indices,
+        long[] valuesIdxOffsets, long firstOffset, long valuesLen,
+        int numChunks, int offset)
+        implements LongArray {
+
+    @Override
+    public long getLong(long i) {
+        int absRow = (int) (i + offset);
+        int chunkIdx = absRow >>> RleArrays.FL_LOG2;
+        int rowInChunk = absRow & RleArrays.FL_MASK;
+        long valueIdxOffset = valuesIdxOffsets[chunkIdx] - firstOffset;
+        int numChunkValues = RleArrays.chunkValueCount(chunkIdx, numChunks, valuesIdxOffsets, firstOffset, valuesLen);
+        if (numChunkValues <= 1) {
+            return numChunkValues == 1 ? values[(int) valueIdxOffset] : 0L;
+        }
+        int localIdx = indices[chunkIdx * RleArrays.FL_CHUNK_SIZE + rowInChunk];
+        if (localIdx >= numChunkValues) {
+            localIdx = numChunkValues - 1;
+        }
+        return values[(int) valueIdxOffset + localIdx];
+    }
+
+    @Override
+    public void forEachLong(LongConsumer c) {
+        long n = length;
+        long emitted = 0;
+        int absRow = offset;
+        int startChunk = absRow >>> RleArrays.FL_LOG2;
+        for (int chunkIdx = startChunk; chunkIdx < numChunks && emitted < n; chunkIdx++) {
+            int chunkBase = chunkIdx * RleArrays.FL_CHUNK_SIZE;
+            int rowInChunk = absRow - chunkBase;
+            long valueIdxOffset = valuesIdxOffsets[chunkIdx] - firstOffset;
+            int numChunkValues = RleArrays.chunkValueCount(chunkIdx, numChunks, valuesIdxOffsets, firstOffset, valuesLen);
+            int end = Math.min(RleArrays.FL_CHUNK_SIZE, rowInChunk + (int) (n - emitted));
+            if (numChunkValues <= 1) {
+                long v = numChunkValues == 1 ? values[(int) valueIdxOffset] : 0L;
+                for (int r = rowInChunk; r < end; r++) {
+                    c.accept(v);
+                }
+            } else {
+                for (int r = rowInChunk; r < end; r++) {
+                    int localIdx = indices[chunkBase + r];
+                    if (localIdx >= numChunkValues) {
+                        localIdx = numChunkValues - 1;
+                    }
+                    c.accept(values[(int) valueIdxOffset + localIdx]);
+                }
+            }
+            int count = end - rowInChunk;
+            emitted += count;
+            absRow += count;
+        }
+    }
+
+    @Override
+    public long fold(long identity, LongBinaryOperator op) {
+        long[] acc = {identity};
+        forEachLong(v -> acc[0] = op.applyAsLong(acc[0], v));
+        return acc[0];
+    }
+}
diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleShortArray.java b/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleShortArray.java
new file mode 100644
index 00000000..5141be75
--- /dev/null
+++ b/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleShortArray.java
@@ -0,0 +1,88 @@
+package io.github.dfa1.vortex.reader.array;
+
+import io.github.dfa1.vortex.core.DType;
+
+import java.util.function.LongBinaryOperator;
+
+/// Lazy FastLanes-RLE-encoded {@link ShortArray}. See {@link LazyRleLongArray} for semantics.
+///
+/// @param dtype             logical element type
+/// @param length            total logical row count
+/// @param values            concatenated distinct values per chunk
+/// @param indices           per-row local index table
+/// @param valuesIdxOffsets  per-chunk values-pool start offsets
+/// @param firstOffset       absolute origin of the values pool
+/// @param valuesLen         total values pool length
+/// @param numChunks         number of FastLanes chunks covered
+/// @param offset            starting absolute position
+/// @param unsigned          {@code true} when the dtype is U16 (affects {@link #getInt(long)} widening)
+public record LazyRleShortArray(
+        DType dtype, long length, short[] values, int[] indices,
+        long[] valuesIdxOffsets, long firstOffset, long valuesLen,
+        int numChunks, int offset, boolean unsigned)
+        implements ShortArray {
+
+    private short lookup(long i) {
+        int absRow = (int) (i + offset);
+        int chunkIdx = absRow >>> RleArrays.FL_LOG2;
+        int rowInChunk = absRow & RleArrays.FL_MASK;
+        long valueIdxOffset = valuesIdxOffsets[chunkIdx] - firstOffset;
+        int numChunkValues = RleArrays.chunkValueCount(chunkIdx, numChunks, valuesIdxOffsets, firstOffset, valuesLen);
+        if (numChunkValues <= 1) {
+            return numChunkValues == 1 ? values[(int) valueIdxOffset] : (short) 0;
+        }
+        int localIdx = indices[chunkIdx * RleArrays.FL_CHUNK_SIZE + rowInChunk];
+        if (localIdx >= numChunkValues) {
+            localIdx = numChunkValues - 1;
+        }
+        return values[(int) valueIdxOffset + localIdx];
+    }
+
+    @Override
+    public short getShort(long i) {
+        return lookup(i);
+    }
+
+    @Override
+    public int getInt(long i) {
+        short v = lookup(i);
+        return unsigned ? Short.toUnsignedInt(v) : v;
+    }
+
+    @Override
+    public long fold(long identity, LongBinaryOperator op) {
+        long acc = identity;
+        long n = length;
+        long emitted = 0;
+        int absRow = offset;
+        int startChunk = absRow >>> RleArrays.FL_LOG2;
+        for (int chunkIdx = startChunk; chunkIdx < numChunks && emitted < n; chunkIdx++) {
+            int chunkBase = chunkIdx * RleArrays.FL_CHUNK_SIZE;
+            int rowInChunk = absRow - chunkBase;
+            long valueIdxOffset = valuesIdxOffsets[chunkIdx] - firstOffset;
+            int numChunkValues = RleArrays.chunkValueCount(chunkIdx, numChunks, valuesIdxOffsets, firstOffset, valuesLen);
+            int end = Math.min(RleArrays.FL_CHUNK_SIZE, rowInChunk + (int) (n - emitted));
+            if (numChunkValues <= 1) {
+                short v = numChunkValues == 1 ? values[(int) valueIdxOffset] : (short) 0;
+                long widened = unsigned ? Short.toUnsignedInt(v) : v;
+                for (int r = rowInChunk; r < end; r++) {
+                    acc = op.applyAsLong(acc, widened);
+                }
+            } else {
+                for (int r = rowInChunk; r < end; r++) {
+                    int localIdx = indices[chunkBase + r];
+                    if (localIdx >= numChunkValues) {
+                        localIdx = numChunkValues - 1;
+                    }
+                    short v = values[(int) valueIdxOffset + localIdx];
+                    long widened = unsigned ? Short.toUnsignedInt(v) : v;
+                    acc = op.applyAsLong(acc, widened);
+                }
+            }
+            int count = end - rowInChunk;
+            emitted += count;
+            absRow += count;
+        }
+        return acc;
+    }
+}
diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/array/RleArrays.java b/reader/src/main/java/io/github/dfa1/vortex/reader/array/RleArrays.java
new file mode 100644
index 00000000..a9105513
--- /dev/null
+++ b/reader/src/main/java/io/github/dfa1/vortex/reader/array/RleArrays.java
@@ -0,0 +1,42 @@
+package io.github.dfa1.vortex.reader.array;
+
+/// Package-private constants and helpers shared by the {@code LazyRleXxxArray} records.
+///
+/// FastLanes RLE works in fixed 1024-row chunks; per-row decode is
+/// {@code chunkIdx = absRow >> 10}, {@code rowInChunk = absRow & 1023}. The chunk's
+/// local value count comes from {@code valuesIdxOffsets[chunkIdx+1] - valuesIdxOffsets[chunkIdx]};
+/// the per-row local index sits in {@code indices[chunkIdx * 1024 + rowInChunk]} and
+/// must be clamped to {@code numChunkValues - 1} (the writer encodes the constant-run
+/// case with a single value and may leave the slot's bits as 0).
+final class RleArrays {
+
+    /// Fixed FastLanes chunk size in rows.
+    static final int FL_CHUNK_SIZE = 1024;
+
+    /// {@code log2(FL_CHUNK_SIZE)} — used for cheap shift-based chunk indexing.
+    static final int FL_LOG2 = 10;
+
+    /// Mask for {@code absRow % FL_CHUNK_SIZE}.
+    static final int FL_MASK = FL_CHUNK_SIZE - 1;
+
+    private RleArrays() {
+    }
+
+    /// Returns the number of distinct values in {@code chunkIdx}.
+    ///
+    /// @param chunkIdx         chunk index in {@code [0, numChunks)}
+    /// @param numChunks        total chunk count
+    /// @param valuesIdxOffsets per-chunk starting offsets into the global values pool
+    ///                         (length = {@code numChunks})
+    /// @param firstOffset      absolute origin of the pool (subtracted before lookup)
+    /// @param valuesLen        total length of the values pool
+    /// @return distinct value count for {@code chunkIdx}
+    static int chunkValueCount(int chunkIdx, int numChunks, long[] valuesIdxOffsets,
+            long firstOffset, long valuesLen) {
+        long start = valuesIdxOffsets[chunkIdx] - firstOffset;
+        long end = chunkIdx + 1 < numChunks
+                ? valuesIdxOffsets[chunkIdx + 1] - firstOffset
+                : valuesLen;
+        return (int) (end - start);
+    }
+}
diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/RleEncodingDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/RleEncodingDecoder.java
index bf0950bb..f2918ef7 100644
--- a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/RleEncodingDecoder.java
+++ b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/RleEncodingDecoder.java
@@ -9,19 +9,19 @@
 import io.github.dfa1.vortex.reader.array.Array;
 import io.github.dfa1.vortex.reader.array.ArraySegments;
 import io.github.dfa1.vortex.reader.array.BoolArray;
+import io.github.dfa1.vortex.reader.array.LazyRleByteArray;
+import io.github.dfa1.vortex.reader.array.LazyRleIntArray;
+import io.github.dfa1.vortex.reader.array.LazyRleLongArray;
+import io.github.dfa1.vortex.reader.array.LazyRleShortArray;
 import io.github.dfa1.vortex.reader.array.MaskedArray;
 import io.github.dfa1.vortex.reader.array.MaterializedBoolArray;
 import io.github.dfa1.vortex.reader.array.MaterializedByteArray;
-import io.github.dfa1.vortex.reader.array.MaterializedDoubleArray;
-import io.github.dfa1.vortex.reader.array.MaterializedFloat16Array;
-import io.github.dfa1.vortex.reader.array.MaterializedFloatArray;
 import io.github.dfa1.vortex.reader.array.MaterializedIntArray;
 import io.github.dfa1.vortex.reader.array.MaterializedLongArray;
 import io.github.dfa1.vortex.reader.array.MaterializedShortArray;
 
 import java.io.IOException;
 import java.lang.foreign.MemorySegment;
-import java.lang.foreign.SegmentAllocator;
 import java.lang.foreign.ValueLayout;
 import java.nio.ByteBuffer;
 
@@ -85,43 +85,31 @@ public Array decode(DecodeContext ctx) {
             indicesValidity = masked.validity();
         }
 
-        long[] values = readLongs(ctx.decodeChildSegment(0, valuesDtype, valuesLen), (int) valuesLen, ptype);
         int[] indices = readIndices(ArraySegments.of(indicesArr), (int) indicesLen, indicesPtype);
-        long[] valuesIdxOffsets = readUnsignedLongs(ctx.decodeChildSegment(2, offsetsDtype, offsetsLen), (int) offsetsLen, offsetsPtype);
-
+        long[] valuesIdxOffsets = readUnsignedLongs(
+                ctx.decodeChildSegment(2, offsetsDtype, offsetsLen), (int) offsetsLen, offsetsPtype);
+        long firstOffset = valuesLen > 0 && valuesIdxOffsets.length > 0 ? valuesIdxOffsets[0] : 0L;
         int numChunks = (int) (indicesLen / FL_CHUNK_SIZE);
-        int chunkEnd = (int) ((offset + rowCount + FL_CHUNK_SIZE - 1) / FL_CHUNK_SIZE);
-        chunkEnd = Math.min(chunkEnd, numChunks);
-
-        long[] decoded = new long[chunkEnd * FL_CHUNK_SIZE];
-        long firstOffset = valuesLen > 0 ? valuesIdxOffsets[0] : 0L;
-
-        for (int chunkIdx = 0; chunkIdx < chunkEnd; chunkIdx++) {
-            long valueIdxOffset = valuesIdxOffsets[chunkIdx] - firstOffset;
-            long nextValueIdxOffset = (chunkIdx + 1 < numChunks)
-                                              ? (valuesIdxOffsets[chunkIdx + 1] - firstOffset)
-                                              : valuesLen;
-            int numChunkValues = (int) (nextValueIdxOffset - valueIdxOffset);
-
-            int chunkBase = chunkIdx * FL_CHUNK_SIZE;
-            if (numChunkValues <= 1) {
-                long fillVal = numChunkValues == 1 ? values[(int) valueIdxOffset] : 0L;
-                for (int i = 0; i < FL_CHUNK_SIZE; i++) {
-                    decoded[chunkBase + i] = fillVal;
-                }
-            } else {
-                for (int i = 0; i < FL_CHUNK_SIZE; i++) {
-                    int idx = indices[chunkBase + i];
-                    if (idx >= numChunkValues) {
-                        idx = numChunkValues - 1;
-                    }
-                    decoded[chunkBase + i] = values[(int) valueIdxOffset + idx];
-                }
-            }
-        }
 
-        MemorySegment seg = fromLongs(decoded, offset, (int) rowCount, ptype, ctx.arena());
-        Array result = toArray(ctx.dtype(), rowCount, seg, ptype);
+        MemorySegment valuesSeg = ctx.decodeChildSegment(0, valuesDtype, valuesLen);
+        Array result = switch (ptype) {
+            case I64, U64 -> new LazyRleLongArray(ctx.dtype(), rowCount,
+                    readLongs(valuesSeg, (int) valuesLen, ptype),
+                    indices, valuesIdxOffsets, firstOffset, valuesLen, numChunks, offset);
+            case I32, U32 -> new LazyRleIntArray(ctx.dtype(), rowCount,
+                    readInts(valuesSeg, (int) valuesLen, ptype),
+                    indices, valuesIdxOffsets, firstOffset, valuesLen, numChunks, offset);
+            case I16, U16 -> new LazyRleShortArray(ctx.dtype(), rowCount,
+                    readShorts(valuesSeg, (int) valuesLen, ptype),
+                    indices, valuesIdxOffsets, firstOffset, valuesLen, numChunks, offset,
+                    ptype == PType.U16);
+            case I8, U8 -> new LazyRleByteArray(ctx.dtype(), rowCount,
+                    readBytes(valuesSeg, (int) valuesLen),
+                    indices, valuesIdxOffsets, firstOffset, valuesLen, numChunks, offset,
+                    ptype == PType.U8);
+            default -> throw new VortexException(EncodingId.FASTLANES_RLE, "unsupported ptype " + ptype);
+        };
+
         if (indicesValidity == null) {
             return result;
         }
@@ -142,18 +130,12 @@ private static Array emptyArray(DecodeContext ctx) {
         MemorySegment empty = ctx.arena().allocate(0);
         DType dt = ctx.dtype();
         PType ptype = ((DType.Primitive) dt).ptype();
-        return toArray(dt, 0L, empty, ptype);
-    }
-
-    private static Array toArray(DType dtype, long n, MemorySegment seg, PType ptype) {
         return switch (ptype) {
-            case I64, U64 -> new MaterializedLongArray(dtype, n, seg);
-            case I32, U32 -> new MaterializedIntArray(dtype, n, seg);
-            case I16, U16 -> new MaterializedShortArray(dtype, n, seg);
-            case I8, U8 -> new MaterializedByteArray(dtype, n, seg);
-            case F64 -> new MaterializedDoubleArray(dtype, n, seg);
-            case F32 -> new MaterializedFloatArray(dtype, n, seg);
-            case F16 -> new MaterializedFloat16Array(dtype, n, seg);
+            case I64, U64 -> new MaterializedLongArray(dt, 0L, empty);
+            case I32, U32 -> new MaterializedIntArray(dt, 0L, empty);
+            case I16, U16 -> new MaterializedShortArray(dt, 0L, empty);
+            case I8, U8 -> new MaterializedByteArray(dt, 0L, empty);
+            default -> throw new VortexException(EncodingId.FASTLANES_RLE, "unsupported ptype " + ptype);
         };
     }
 
@@ -164,20 +146,45 @@ private static long[] readLongs(MemorySegment buf, int count, PType ptype) {
         for (int i = 0; i < count; i++) {
             long off = (i % cap) * elemSize;
             out[i] = switch (ptype) {
-                case I8 -> buf.get(ValueLayout.JAVA_BYTE, off);
-                case U8 -> Byte.toUnsignedLong(buf.get(ValueLayout.JAVA_BYTE, off));
-                case I16 -> buf.get(PTypeIO.LE_SHORT, off);
-                case U16, F16 -> Short.toUnsignedLong(buf.get(PTypeIO.LE_SHORT, off));
-                case I32 -> buf.get(PTypeIO.LE_INT, off);
-                case U32 -> Integer.toUnsignedLong(buf.get(PTypeIO.LE_INT, off));
-                case I64, U64 -> buf.get(PTypeIO.LE_LONG, off);
-                case F32 -> Integer.toUnsignedLong(buf.get(PTypeIO.LE_INT, off));
-                case F64 -> buf.get(PTypeIO.LE_LONG, off);
+                case I64 -> buf.get(PTypeIO.LE_LONG, off);
+                case U64 -> buf.get(PTypeIO.LE_LONG, off);
+                default -> throw new VortexException(EncodingId.FASTLANES_RLE, "expected I64/U64, got " + ptype);
             };
         }
         return out;
     }
 
+    private static int[] readInts(MemorySegment buf, int count, PType ptype) {
+        int[] out = new int[count];
+        int elemSize = ptype.byteSize();
+        long cap = SegmentBroadcast.capacity(buf, elemSize);
+        for (int i = 0; i < count; i++) {
+            long off = (i % cap) * elemSize;
+            out[i] = buf.get(PTypeIO.LE_INT, off);
+        }
+        return out;
+    }
+
+    private static short[] readShorts(MemorySegment buf, int count, PType ptype) {
+        short[] out = new short[count];
+        int elemSize = ptype.byteSize();
+        long cap = SegmentBroadcast.capacity(buf, elemSize);
+        for (int i = 0; i < count; i++) {
+            long off = (i % cap) * elemSize;
+            out[i] = buf.get(PTypeIO.LE_SHORT, off);
+        }
+        return out;
+    }
+
+    private static byte[] readBytes(MemorySegment buf, int count) {
+        byte[] out = new byte[count];
+        long cap = SegmentBroadcast.capacity(buf, 1);
+        for (int i = 0; i < count; i++) {
+            out[i] = buf.get(ValueLayout.JAVA_BYTE, i % cap);
+        }
+        return out;
+    }
+
     private static int[] readIndices(MemorySegment buf, int count, PType indicesPtype) {
         int[] out = new int[count];
         int elemSize = indicesPtype.byteSize();
@@ -216,13 +223,4 @@ private static long[] readUnsignedLongs(MemorySegment buf, int count, PType ptyp
         }
         return out;
     }
-
-    private static MemorySegment fromLongs(long[] decoded, int offset, int count, PType ptype, SegmentAllocator arena) {
-        int elemSize = ptype.byteSize();
-        MemorySegment seg = arena.allocate((long) count * elemSize);
-        for (int i = 0; i < count; i++) {
-            PTypeIO.set(seg, (long) i * elemSize, ptype, decoded[offset + i]);
-        }
-        return seg;
-    }
 }
diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/SparseEncodingDecoder.java b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/SparseEncodingDecoder.java
index 01b32fed..bfce1e77 100644
--- a/reader/src/main/java/io/github/dfa1/vortex/reader/decode/SparseEncodingDecoder.java
+++ b/reader/src/main/java/io/github/dfa1/vortex/reader/decode/SparseEncodingDecoder.java
@@ -30,7 +30,6 @@
 import java.lang.foreign.MemorySegment;
 import java.lang.foreign.ValueLayout;
 import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
 
 /// Read-only decoder for {@code vortex.sparse}.
 public final class SparseEncodingDecoder implements EncodingDecoder {
@@ -93,13 +92,11 @@ public Array decode(DecodeContext ctx) {
         long fillBits = scalarToLong(fillScalar);
 
         // Lazy path: keep fill bits + decoded patches; no n-sized buffer allocated.
+        // When numPatches == 0 we still decode zero-length children so the record's
+        // patchValues.length() and findPatch can rely on real (empty) Array instances.
         DType indicesDtype = new DType.Primitive(indicesPtype, false);
-        Array patchIndices = numPatches > 0
-                ? ctx.decodeChild(0, indicesDtype, numPatches)
-                : null;
-        Array patchValues = numPatches > 0
-                ? ctx.decodeChild(1, ctx.dtype(), numPatches)
-                : null;
+        Array patchIndices = ctx.decodeChild(0, indicesDtype, numPatches);
+        Array patchValues = ctx.decodeChild(1, ctx.dtype(), numPatches);
         Array idxData = patchIndices instanceof MaskedArray m ? m.inner() : patchIndices;
         Array valData = patchValues instanceof MaskedArray m ? m.inner() : patchValues;
 
diff --git a/reader/src/test/java/io/github/dfa1/vortex/reader/array/LazyRleArrayTest.java b/reader/src/test/java/io/github/dfa1/vortex/reader/array/LazyRleArrayTest.java
new file mode 100644
index 00000000..2cf26320
--- /dev/null
+++ b/reader/src/test/java/io/github/dfa1/vortex/reader/array/LazyRleArrayTest.java
@@ -0,0 +1,123 @@
+package io.github.dfa1.vortex.reader.array;
+
+import io.github.dfa1.vortex.core.DType;
+import io.github.dfa1.vortex.core.PType;
+import org.junit.jupiter.api.Nested;
+import org.junit.jupiter.api.Test;
+
+import java.util.ArrayList;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/// Unit tests for the lazy FastLanes RLE records. Cover scalar lookup, constant-run
+/// fast path, multi-chunk forEach iteration, fold reduction, and offset slicing.
+///
+/// Note: FastLanes RLE works in 1024-row chunks. Tests construct the records directly
+/// so they exercise the lazy semantics without needing a full encoder round-trip.
+class LazyRleArrayTest {
+
+    private static final DType I64 = new DType.Primitive(PType.I64, false);
+    private static final DType I32 = new DType.Primitive(PType.I32, false);
+
+    @Nested
+    class LongDispatch {
+
+        @Test
+        void singleChunkAllSameValue_constantRunFastPath() {
+            // Given a single 1024-row chunk with 1 distinct value (42).
+            // valuesIdxOffsets[0] = 0, indices irrelevant (constant fast path).
+            long[] values = {42L};
+            int[] indices = new int[1024];
+            long[] valuesIdxOffsets = {0L};
+            var sut = new LazyRleLongArray(I64, 1024, values, indices,
+                    valuesIdxOffsets, 0L, 1L, 1, 0);
+
+            assertThat(sut.getLong(0)).isEqualTo(42L);
+            assertThat(sut.getLong(500)).isEqualTo(42L);
+            assertThat(sut.getLong(1023)).isEqualTo(42L);
+        }
+
+        @Test
+        void singleChunkWithIndices_perRowLookup() {
+            // Given one chunk with values [10, 20, 30], indices selecting them in
+            // pattern 0,1,2,0,1,2,... over the first 6 rows.
+            long[] values = {10L, 20L, 30L};
+            int[] indices = new int[1024];
+            for (int i = 0; i < 6; i++) {
+                indices[i] = i % 3;
+            }
+            long[] valuesIdxOffsets = {0L};
+            var sut = new LazyRleLongArray(I64, 6, values, indices,
+                    valuesIdxOffsets, 0L, 3L, 1, 0);
+
+            var seen = new ArrayList<java.lang.Long>();
+            sut.forEachLong(seen::add);
+
+            assertThat(seen).containsExactly(10L, 20L, 30L, 10L, 20L, 30L);
+        }
+
+        @Test
+        void multiChunkBoundary_walksAcrossChunks() {
+            // Given two chunks: chunk 0 = constant 1, chunk 1 = constant 2.
+            long[] values = {1L, 2L};
+            int[] indices = new int[2 * 1024];  // unused for constant runs
+            long[] valuesIdxOffsets = {0L, 1L};
+            // length = 1026 covers all of chunk 0 + first 2 rows of chunk 1
+            var sut = new LazyRleLongArray(I64, 1026, values, indices,
+                    valuesIdxOffsets, 0L, 2L, 2, 0);
+
+            assertThat(sut.getLong(0)).isEqualTo(1L);
+            assertThat(sut.getLong(1023)).isEqualTo(1L);
+            assertThat(sut.getLong(1024)).isEqualTo(2L);
+            assertThat(sut.getLong(1025)).isEqualTo(2L);
+        }
+
+        @Test
+        void offsetSkipsLeadingRows() {
+            // Given chunk 0 = constant 5; offset=100 means logical row 0 -> absolute 100.
+            long[] values = {5L};
+            int[] indices = new int[1024];
+            long[] valuesIdxOffsets = {0L};
+            var sut = new LazyRleLongArray(I64, 10, values, indices,
+                    valuesIdxOffsets, 0L, 1L, 1, 100);
+
+            assertThat(sut.getLong(0)).isEqualTo(5L);
+            assertThat(sut.getLong(9)).isEqualTo(5L);
+        }
+
+        @Test
+        void foldSumsCorrectly() {
+            // Two chunks both constant runs: chunk 0 = 7, chunk 1 = 11. length = 1026.
+            long[] values = {7L, 11L};
+            int[] indices = new int[2 * 1024];
+            long[] valuesIdxOffsets = {0L, 1L};
+            var sut = new LazyRleLongArray(I64, 1026, values, indices,
+                    valuesIdxOffsets, 0L, 2L, 2, 0);
+
+            long sum = sut.fold(0L, java.lang.Long::sum);
+
+            // 1024 * 7 + 2 * 11 = 7168 + 22 = 7190
+            assertThat(sum).isEqualTo(7190L);
+        }
+    }
+
+    @Nested
+    class IntDispatch {
+
+        @Test
+        void intLookupAndForEach() {
+            int[] values = {100, 200};
+            int[] indices = new int[1024];
+            indices[0] = 0;
+            indices[1] = 1;
+            indices[2] = 0;
+            long[] valuesIdxOffsets = {0L};
+            var sut = new LazyRleIntArray(I32, 3, values, indices,
+                    valuesIdxOffsets, 0L, 2L, 1, 0);
+
+            assertThat(sut.getInt(0)).isEqualTo(100);
+            assertThat(sut.getInt(1)).isEqualTo(200);
+            assertThat(sut.getInt(2)).isEqualTo(100);
+        }
+    }
+}
diff --git a/writer/src/test/java/io/github/dfa1/vortex/writer/encode/RleEncodingEncoderTest.java b/writer/src/test/java/io/github/dfa1/vortex/writer/encode/RleEncodingEncoderTest.java
index 22ade017..12d6d07f 100644
--- a/writer/src/test/java/io/github/dfa1/vortex/writer/encode/RleEncodingEncoderTest.java
+++ b/writer/src/test/java/io/github/dfa1/vortex/writer/encode/RleEncodingEncoderTest.java
@@ -3,7 +3,6 @@
 import io.github.dfa1.vortex.core.DType;
 import io.github.dfa1.vortex.core.PType;
 import io.github.dfa1.vortex.reader.array.Array;
-import io.github.dfa1.vortex.reader.array.ArraySegments;
 import io.github.dfa1.vortex.reader.array.IntArray;
 import io.github.dfa1.vortex.reader.array.MaskedArray;
 import io.github.dfa1.vortex.reader.decode.ArrayNode;
@@ -12,7 +11,6 @@
 
 import io.github.dfa1.vortex.encoding.EncodingId;
 import io.github.dfa1.vortex.reader.decode.KnownArrayNode;
-import io.github.dfa1.vortex.encoding.PTypeIO;
 import io.github.dfa1.vortex.reader.ReadRegistry;
 import io.github.dfa1.vortex.reader.decode.TestRegistry;
 import io.github.dfa1.vortex.proto.RLEMetadata;
@@ -65,7 +63,7 @@ void roundTrip_singleElement_i32() {
             DecodeContext ctx = DecodeTestHelper.toDecodeContext(encoded, data.length, dtype, REGISTRY);
             Array result = DECODER.decode(ctx);
             assertThat(result.length()).isEqualTo(1);
-            assertThat(ArraySegments.of(result).get(PTypeIO.LE_INT, 0)).isEqualTo(42);
+            assertThat(((io.github.dfa1.vortex.reader.array.IntArray) result).getInt(0)).isEqualTo(42);
         }
 
         @Test
@@ -81,7 +79,7 @@ void roundTrip_constantArray_i32() {
             Array result = DECODER.decode(ctx);
             assertThat(result.length()).isEqualTo(n);
             for (int i = 0; i < n; i++) {
-                assertThat(ArraySegments.of(result).get(PTypeIO.LE_INT, (long) i * 4)).as("index %d", i).isEqualTo(99);
+                assertThat(((io.github.dfa1.vortex.reader.array.IntArray) result).getInt(i)).as("index %d", i).isEqualTo(99);
             }
         }
 
@@ -93,7 +91,7 @@ void roundTrip_classicRunLengthData_i32() {
             DecodeContext ctx = DecodeTestHelper.toDecodeContext(encoded, data.length, dtype, REGISTRY);
             Array result = DECODER.decode(ctx);
             for (int i = 0; i < data.length; i++) {
-                assertThat(ArraySegments.of(result).get(PTypeIO.LE_INT, (long) i * 4)).as("index %d", i).isEqualTo(data[i]);
+                assertThat(((io.github.dfa1.vortex.reader.array.IntArray) result).getInt(i)).as("index %d", i).isEqualTo(data[i]);
             }
         }
 
@@ -110,7 +108,7 @@ void roundTrip_multipleChunks_i32() {
             Array result = DECODER.decode(ctx);
             assertThat(result.length()).isEqualTo(n);
             for (int i = 0; i < n; i++) {
-                assertThat(ArraySegments.of(result).get(PTypeIO.LE_INT, (long) i * 4)).as("index %d", i).isEqualTo(i / 100);
+                assertThat(((io.github.dfa1.vortex.reader.array.IntArray) result).getInt(i)).as("index %d", i).isEqualTo(i / 100);
             }
         }
 
@@ -122,7 +120,7 @@ void roundTrip_i64() {
             DecodeContext ctx = DecodeTestHelper.toDecodeContext(encoded, data.length, dtype, REGISTRY);
             Array result = DECODER.decode(ctx);
             for (int i = 0; i < data.length; i++) {
-                assertThat(ArraySegments.of(result).get(PTypeIO.LE_LONG, (long) i * 8)).as("index %d", i).isEqualTo(data[i]);
+                assertThat(((io.github.dfa1.vortex.reader.array.LongArray) result).getLong(i)).as("index %d", i).isEqualTo(data[i]);
             }
         }
 
@@ -139,7 +137,7 @@ void roundTrip_variousLengths_i32(int n) {
             Array result = DECODER.decode(ctx);
             assertThat(result.length()).isEqualTo(n);
             for (int i = 0; i < n; i++) {
-                assertThat(ArraySegments.of(result).get(PTypeIO.LE_INT, (long) i * 4)).as("index %d", i).isEqualTo(i / 50);
+                assertThat(((io.github.dfa1.vortex.reader.array.IntArray) result).getInt(i)).as("index %d", i).isEqualTo(i / 50);
             }
         }
 
@@ -154,7 +152,7 @@ void roundTrip_allDifferent_u16() {
             DecodeContext ctx = DecodeTestHelper.toDecodeContext(encoded, data.length, dtype, REGISTRY);
             Array result = DECODER.decode(ctx);
             for (int i = 0; i < data.length; i++) {
-                assertThat(Short.toUnsignedInt(ArraySegments.of(result).get(PTypeIO.LE_SHORT, (long) i * 2)))
+                assertThat(Short.toUnsignedInt(((io.github.dfa1.vortex.reader.array.ShortArray) result).getShort(i)))
                         .as("index %d", i).isEqualTo(i);
             }
         }
@@ -167,7 +165,7 @@ void roundTrip_negativeValues_i32() {
             DecodeContext ctx = DecodeTestHelper.toDecodeContext(encoded, data.length, dtype, REGISTRY);
             Array result = DECODER.decode(ctx);
             for (int i = 0; i < data.length; i++) {
-                assertThat(ArraySegments.of(result).get(PTypeIO.LE_INT, (long) i * 4)).as("index %d", i).isEqualTo(data[i]);
+                assertThat(((io.github.dfa1.vortex.reader.array.IntArray) result).getInt(i)).as("index %d", i).isEqualTo(data[i]);
             }
         }
     }
@@ -200,7 +198,7 @@ void decode_crossesChunkBoundary_correctValues() {
             DecodeContext ctx = DecodeTestHelper.toDecodeContext(encoded, n, dtype, REGISTRY);
             Array result = DECODER.decode(ctx);
             for (int i = 1000; i < 1048; i++) {
-                assertThat(ArraySegments.of(result).get(PTypeIO.LE_INT, (long) i * 4)).as("index %d", i).isEqualTo(i / 100);
+                assertThat(((io.github.dfa1.vortex.reader.array.IntArray) result).getInt(i)).as("index %d", i).isEqualTo(i / 100);
             }
         }
 
@@ -256,7 +254,7 @@ void decode_partialLastChunk_correctLength() {
             Array result = DECODER.decode(ctx);
             assertThat(result.length()).isEqualTo(n);
             for (int i = 0; i < n; i++) {
-                assertThat(ArraySegments.of(result).get(PTypeIO.LE_INT, (long) i * 4)).as("index %d", i).isEqualTo(i / 100);
+                assertThat(((io.github.dfa1.vortex.reader.array.IntArray) result).getInt(i)).as("index %d", i).isEqualTo(i / 100);
             }
         }
 
diff --git a/writer/src/test/java/io/github/dfa1/vortex/writer/encode/SparseEncodingEncoderTest.java b/writer/src/test/java/io/github/dfa1/vortex/writer/encode/SparseEncodingEncoderTest.java
index 723d0e01..607491ff 100644
--- a/writer/src/test/java/io/github/dfa1/vortex/writer/encode/SparseEncodingEncoderTest.java
+++ b/writer/src/test/java/io/github/dfa1/vortex/writer/encode/SparseEncodingEncoderTest.java
@@ -3,7 +3,6 @@
 import io.github.dfa1.vortex.core.DType;
 import io.github.dfa1.vortex.core.PType;
 import io.github.dfa1.vortex.reader.array.Array;
-import io.github.dfa1.vortex.reader.array.ArraySegments;
 import io.github.dfa1.vortex.reader.array.BoolArray;
 import io.github.dfa1.vortex.reader.array.VarBinArray;
 import io.github.dfa1.vortex.reader.decode.ArrayNode;
@@ -11,7 +10,6 @@
 import io.github.dfa1.vortex.reader.decode.DecodeContext;
 
 import io.github.dfa1.vortex.encoding.EncodingId;
-import io.github.dfa1.vortex.encoding.PTypeIO;
 import io.github.dfa1.vortex.reader.ReadRegistry;
 import io.github.dfa1.vortex.reader.decode.TestRegistry;
 import io.github.dfa1.vortex.proto.NullValue;
@@ -88,7 +86,7 @@ void encode_roundTrip_i64() {
             Array decoded = decodeResult(encoded, DTypes.I64, data.length);
 
             for (int i = 0; i < data.length; i++) {
-                assertThat(ArraySegments.of(decoded).get(PTypeIO.LE_LONG, (long) i * 8))
+                assertThat(((io.github.dfa1.vortex.reader.array.LongArray) decoded).getLong(i))
                         .as("index %d", i).isEqualTo(data[i]);
             }
         }
@@ -100,7 +98,7 @@ void encode_roundTrip_f64() {
             Array decoded = decodeResult(encoded, DTypes.F64, data.length);
 
             for (int i = 0; i < data.length; i++) {
-                assertThat(ArraySegments.of(decoded).get(PTypeIO.LE_DOUBLE, (long) i * 8))
+                assertThat(((io.github.dfa1.vortex.reader.array.DoubleArray) decoded).getDouble(i))
                         .as("index %d", i).isEqualTo(data[i]);
             }
         }
@@ -217,7 +215,7 @@ void decode_noPatches_returnsFillValue() {
 
             assertThat(result.length()).isEqualTo(5L);
             for (int i = 0; i < 5; i++) {
-                assertThat(ArraySegments.of(result).get(PTypeIO.LE_LONG, (long) i * 8))
+                assertThat(((io.github.dfa1.vortex.reader.array.LongArray) result).getLong(i))
                         .as("index %d", i).isEqualTo(fill);
             }
         }
@@ -232,7 +230,7 @@ void decode_withPatches_overwritesAtIndices() {
 
             long[] expected = {0, 10, 0, 0, 0, 50, 0, 0};
             for (int i = 0; i < expected.length; i++) {
-                assertThat(ArraySegments.of(result).get(PTypeIO.LE_LONG, (long) i * 8))
+                assertThat(((io.github.dfa1.vortex.reader.array.LongArray) result).getLong(i))
                         .as("index %d", i).isEqualTo(expected[i]);
             }
         }
@@ -244,10 +242,10 @@ void decode_f64_fillAndPatches() {
             DecodeContext ctx = buildSparseCtxF64(DTypes.F64, 4, fillVal, new long[]{2L}, new double[]{patchVal});
             Array result = DECODER.decode(ctx);
 
-            assertThat(ArraySegments.of(result).get(PTypeIO.LE_DOUBLE, 0L)).isNaN();
-            assertThat(ArraySegments.of(result).get(PTypeIO.LE_DOUBLE, 8L)).isNaN();
-            assertThat(ArraySegments.of(result).get(PTypeIO.LE_DOUBLE, 16L)).isEqualTo(3.14);
-            assertThat(ArraySegments.of(result).get(PTypeIO.LE_DOUBLE, 24L)).isNaN();
+            assertThat(((io.github.dfa1.vortex.reader.array.DoubleArray) result).getDouble(0)).isNaN();
+            assertThat(((io.github.dfa1.vortex.reader.array.DoubleArray) result).getDouble(1)).isNaN();
+            assertThat(((io.github.dfa1.vortex.reader.array.DoubleArray) result).getDouble(2)).isEqualTo(3.14);
+            assertThat(((io.github.dfa1.vortex.reader.array.DoubleArray) result).getDouble(3)).isNaN();
         }
 
         @Test
@@ -257,7 +255,7 @@ void decode_offsetSubtracted() {
             DecodeContext ctx = buildSparseCtxWithOffset(DTypes.I64, 5, 0L, PType.U32, patchIndices, patchValues, 10L);
             Array result = DECODER.decode(ctx);
 
-            assertThat(ArraySegments.of(result).get(PTypeIO.LE_LONG, 16L)).isEqualTo(777L);
+            assertThat(((io.github.dfa1.vortex.reader.array.LongArray) result).getLong(2)).isEqualTo(777L);
         }
 
         @Test
@@ -268,7 +266,7 @@ void decode_nullValueFill_treatedAsZero() {
             Array result = DECODER.decode(ctx);
 
             for (int i = 0; i < 4; i++) {
-                assertThat(ArraySegments.of(result).get(PTypeIO.LE_LONG, (long) i * 8)).as("index %d", i).isZero();
+                assertThat(((io.github.dfa1.vortex.reader.array.LongArray) result).getLong(i)).as("index %d", i).isZero();
             }
         }
 

From e2c757496d2d421001fe8ea18f7046d1302e2a1e Mon Sep 17 00:00:00 2001
From: Davide Angelocola <davide.angelocola@gmail.com>
Date: Mon, 15 Jun 2026 22:57:37 +0200
Subject: [PATCH 2/2] =?UTF-8?q?docs(adr):=20ADR=200010=20=E2=86=92=20Imple?=
 =?UTF-8?q?mented;=20Phase=203=20superseded=20by=20ADR=200013?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phases 0/1/2 (lazy decode for 1:1 transforms — ALP, FoR, ZigZag) shipped
over 2026-06-14 — 2026-06-15. Update status from Proposed to Implemented
and add an Outcome section noting:

- Phase 1 — Array hierarchy refactor to non-sealed interfaces.
- Phase 2 — LazyAlpDouble/Float, LazyForLong/Int, LazyZigZagLong/Int
  records; ArraySegments.of(arr, arena) materialise fallback.
- Pattern generalised beyond 1:1 transforms: ADR 0012 (lazy
  Chunked / Dict / VarBin) plus vortex.runend, vortex.sparse, and
  fastlanes.rle adopted the same top-level record + materialise-fallback
  shape (see LazyRunEnd*, LazySparse*, LazyRle* in reader.array).

Phase 3 (compute pushdown) is superseded by ADR 0013 — that ADR lands
the masks-and-kernels framework Phase 3 sketched, plus the wider design
needed for pushdown to compose across the lazy storage types from
ADRs 0010 and 0012.

Also update docs/adr/ADR.md index entry from Proposed to Implemented.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 docs/adr/0010-lazy-decode.md | 34 +++++++++++++++++++++++++++++++++-
 docs/adr/ADR.md              |  2 +-
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/docs/adr/0010-lazy-decode.md b/docs/adr/0010-lazy-decode.md
index d79cbbb7..5fd37287 100644
--- a/docs/adr/0010-lazy-decode.md
+++ b/docs/adr/0010-lazy-decode.md
@@ -1,7 +1,8 @@
 # ADR 0010: Lazy decode for 1:1 transform encodings
 
-- **Status:** Proposed
+- **Status:** Implemented
 - **Date:** 2026-06-13
+- **Implemented:** 2026-06-14 — 2026-06-15
 - **Deciders:** project maintainer
 - **Supersedes:** —
 - **Superseded by:** —
@@ -606,6 +607,37 @@ Net negative.
 Rejected: only works if scan does not eagerly decode — which is exactly
 phase 1.
 
+## Outcome
+
+Phases 0/1/2 (lazy decode for the 1:1 transforms) shipped over 2026-06-14 — 2026-06-15:
+
+- **Phase 1 — Array hierarchy refactor.** Primitive `Array` interfaces (`LongArray`,
+  `IntArray`, `DoubleArray`, …) converted to non-sealed so new lazy variants can implement
+  them without touching the hierarchy.
+- **Phase 2 — Lazy ALP / FoR / ZigZag.** `LazyAlpDoubleArray`, `LazyAlpFloatArray`,
+  `LazyForLongArray`, `LazyForIntArray`, `LazyZigZagLongArray`, `LazyZigZagIntArray`
+  shipped. Each holds the encoded child + transform parameters; per-row dispatch applies
+  the transform on demand. `ArraySegments.of(arr, arena)` materialises into a fresh segment
+  only when a downstream caller demands a contiguous buffer.
+
+The lazy-storage pattern generalised beyond the 1:1 transform scope of this ADR. Adjacent
+encodings adopted the same top-level record shape:
+
+- [ADR 0012 — Lazy Chunked / Dict / VarBin layouts](0012-zero-copy-layout-decoding.md).
+- `vortex.runend`, `vortex.sparse`, `fastlanes.rle` (lazy lookup tables; not 1:1 transforms
+  but the same lazy-record + `ArraySegments` materialise pattern; see
+  `LazyRunEndXxxArray`, `LazySparseXxxArray`, `LazyRleXxxArray` in `reader.array`).
+
+`docs/compatibility.md` Decode shape table tracks per-encoding status.
+
+### Phase 3 — superseded
+
+The compute-pushdown phase (per-encoding `compareXxx` / `take` / `filter` operating on the
+encoded form) is **superseded by [ADR 0013 — Compute primitives: masks, kernels,
+no-materialise](0013-compute-primitives.md)**. ADR 0013 lands the masks-and-kernels
+framework that Phase 3 sketched, plus the wider design needed to make pushdown compose
+across the lazy storage types from ADRs 0010 and 0012.
+
 ## References
 
 - Rust reference: `https://github.com/spiraldb/vortex/tree/main/encodings/alp/src/alp/compute`
diff --git a/docs/adr/ADR.md b/docs/adr/ADR.md
index c27ba488..ec4f7c84 100644
--- a/docs/adr/ADR.md
+++ b/docs/adr/ADR.md
@@ -22,7 +22,7 @@ Each ADR is a Markdown file named `NNNN-short-title.md`. Use `template.md` as th
 | 0007 | Pure-Java vortex.pco encoder                  | Implemented |
 | 0008 | Domain primitives for unsigned integers       | Proposed  |
 | 0009 | Write API ergonomics                          | Completed |
-| 0010 | Lazy decode                                   | Proposed  |
+| 0010 | Lazy decode                                   | Implemented |
 | 0011 | Writer zero-copy MemorySegment overload       | Deferred  |
 | 0012 | Zero-copy layout decoding: lazy Chunked/Dict  | Implemented |
 | 0013 | Compute primitives: masks, kernels, no-materialise | Proposed  |