From 3ebef6e956c575dff3ca71b8e53135c1b1c3cd95 Mon Sep 17 00:00:00 2001 From: Dan Reynolds Date: Tue, 12 May 2026 07:18:57 -0400 Subject: [PATCH 1/2] Exp 137: long-text cell-size scaling audit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sweeps the exp 110 unchanged-fanout shape across [4KB, 16KB, 32KB, 64KB, 128KB] cells. Wall scales linearly with bytes from 16KB up; per-byte cost converges to a stable 0.12–0.19 ns/byte band on the existing 8-byte FNV chunked loop. The 4KB release shape sits ~2x above the band because per-iteration overhead dominates at that size — a faster hash variant would barely move it. Closes the long-text-stream-hashing direction's blockedOnMeasurement gate and replaces the broader-payload openCandidate with a wider FNV / SIMD probe candidate gated on a real ≥16KB workload. Co-Authored-By: Claude Opus 4.7 --- .../profile/long_text_scaling_audit.dart | 370 ++++++++++++++++++ .../exp-137-long-text-scaling-aggregate.md | 46 +++ docs/experiments/history.json | 15 +- experiments/137-long-text-cell-scaling.md | 201 ++++++++++ experiments/README.md | 1 + experiments/signals.json | 45 ++- 6 files changed, 663 insertions(+), 15 deletions(-) create mode 100644 benchmark/profile/long_text_scaling_audit.dart create mode 100644 benchmark/profile/results/exp-137-long-text-scaling-aggregate.md create mode 100644 experiments/137-long-text-cell-scaling.md diff --git a/benchmark/profile/long_text_scaling_audit.dart b/benchmark/profile/long_text_scaling_audit.dart new file mode 100644 index 0000000..7b3cb3d --- /dev/null +++ b/benchmark/profile/long_text_scaling_audit.dart @@ -0,0 +1,370 @@ +// ignore_for_file: avoid_print +// +// Long-text cell-size scaling audit — exp 137. +// +// Exp 099 added an 8-byte FNV main loop and exp 110 wired in the +// matching long-text unchanged-fanout benchmark (8 unchanged streams +// x 256 rows x 4KB ASCII TEXT cells). The 4KB cell shape produced the +// -76% headline that justified accepting exp 099's revival. +// +// `signals.json#long-text-stream-hashing` carries the next gate as a +// `blockedOnMeasurement` entry plus a 2026-04-29 open candidate: +// +// "broader long-payload workload (>= 32KB TEXT cells, mixed +// BLOB/TEXT)" +// +// Without that workload we cannot tell whether the per-byte hashing +// cost continues to drive wall as cell sizes grow, or whether some +// other cost (SQLite text fetch, page cache, GC, isolate transfer) +// takes over. Either outcome closes the open candidate: +// +// - linear scaling (per-byte wall stable across sizes) -> hashing +// dominates; further hash-loop variants remain interesting and a +// wider FNV unroll / SIMD probe is the natural next attempt. +// - sub-linear scaling (per-byte wall drops at large sizes) -> some +// non-hash floor dominates short-cell wall; long-text hashing is +// not the next implementation target. +// - super-linear scaling (per-byte wall climbs at large sizes) -> +// allocation, GC, or isolate-transfer cost emerges; new direction. +// +// The harness mirrors exp 110's shape (8 unchanged streams x 256 rows, +// single barrier stream, INSERT per iteration) and just sweeps the +// per-cell byte size. Each size runs `iterationsPerSize` insert +// iterations, the wall is the per-iteration `Stopwatch` around the +// INSERT + barrier-emission wait, and the report is the median / +// p90 / p99 / per-byte wall at each size. +// +// Usage: +// dart run -DRESQLITE_PROFILE=true \ +// benchmark/profile/long_text_scaling_audit.dart --markdown + +import 'dart:async'; +import 'dart:io'; + +import 'package:resqlite/resqlite.dart' as resqlite; +import 'package:resqlite/src/profile_mode.dart'; + +const int unchangedStreamCount = 8; +const int rowCount = 256; +const int iterationsPerSize = 30; +const int warmupIterations = 3; +const List cellSizesBytes = [4096, 16384, 32768, 65536, 131072]; + +class _ScalingResult { + _ScalingResult({ + required this.cellBytes, + required this.medianUs, + required this.p90Us, + required this.p99Us, + required this.minUs, + required this.maxUs, + }); + + final int cellBytes; + final int medianUs; + final int p90Us; + final int p99Us; + final int minUs; + final int maxUs; + + // Per-byte wall is a stand-in for "would 2x the bytes give 2x the + // wall?". The fanout wave hashes: + // - every unchanged stream's full result (rowCount rows), times + // `unchangedStreamCount` streams, + // - plus the barrier stream's full result (rowCount + 1 rows after + // the INSERT lands). + // SQLite TEXT cells stored on the same row as INTEGER columns return + // the full payload pointer for hashing, so per-row hashed bytes are + // approximately `cellBytes` (id/marker integer columns add a few + // bytes of fold work each — kept in the formula as cellBytes only + // because the integer-column contribution is negligible at >=4KB). + double get totalHashedBytes => + cellBytes.toDouble() * + (unchangedStreamCount * rowCount + (rowCount + 1)); + double get nsPerByte => (medianUs * 1000.0) / totalHashedBytes; +} + +Future<_ScalingResult> runOneSize(int cellBytes) async { + final tempDir = await Directory.systemTemp.createTemp( + 'long_text_scaling_${cellBytes}_', + ); + final db = await resqlite.Database.open('${tempDir.path}/r.db'); + try { + const createSql = + 'CREATE TABLE long_items(' + 'id INTEGER PRIMARY KEY, ' + 'body TEXT NOT NULL, ' + 'marker INTEGER NOT NULL)'; + const insertSql = + 'INSERT INTO long_items(id, body, marker) VALUES (?, ?, ?)'; + + await db.execute(createSql); + await db.executeBatch(insertSql, [ + for (var i = 0; i < rowCount; i++) + [i, _longTextPayload(cellBytes, i), i], + ]); + + final unchangedSubs = >>>[]; + StreamSubscription>>? barrierSub; + + try { + final unchangedEmissions = List.filled(unchangedStreamCount, 0); + final unchangedReady = >[ + for (var i = 0; i < unchangedStreamCount; i++) Completer(), + ]; + + for (var s = 0; s < unchangedStreamCount; s++) { + final idx = s; + unchangedSubs.add( + db + .stream( + 'SELECT id, body, $s as sid FROM long_items ' + 'WHERE id < $rowCount ORDER BY id', + ) + .listen((_) { + unchangedEmissions[idx]++; + if (!unchangedReady[idx].isCompleted) { + unchangedReady[idx].complete(); + } + }), + ); + } + + final barrierStream = db.stream( + 'SELECT id, body FROM long_items ORDER BY id', + ); + final barrierReady = Completer(); + Completer? waitBarrier; + barrierSub = barrierStream.listen((_) { + if (!barrierReady.isCompleted) { + barrierReady.complete(); + } else if (waitBarrier != null && !waitBarrier.isCompleted) { + waitBarrier.complete(); + } + }); + + await Future.wait( + unchangedReady.map((c) => c.future), + ).timeout(const Duration(seconds: 30)); + await barrierReady.future.timeout(const Duration(seconds: 30)); + + var counter = 100000; + final wallUs = []; + + // Warmups stabilize cache state and Dart JIT/AOT hot paths so the + // first measured iteration is not an outlier. Discarded from the + // result. + for (var w = 0; w < warmupIterations; w++) { + waitBarrier = Completer(); + await db.execute(insertSql, [ + counter, + _longTextPayload(cellBytes, counter), + w, + ]); + counter++; + await waitBarrier.future.timeout(const Duration(seconds: 30)); + } + + for (var i = 0; i < iterationsPerSize; i++) { + waitBarrier = Completer(); + final before = List.from(unchangedEmissions); + + final sw = Stopwatch()..start(); + await db.execute(insertSql, [ + counter, + _longTextPayload(cellBytes, counter), + i, + ]); + counter++; + await waitBarrier.future.timeout(const Duration(seconds: 30)); + sw.stop(); + wallUs.add(sw.elapsedMicroseconds); + + for (var s = 0; s < unchangedStreamCount; s++) { + if (unchangedEmissions[s] != before[s]) { + throw StateError( + 'Long-text unchanged stream $s emitted at cell size ' + '$cellBytes; the unchanged-fanout invariant has been ' + 'broken (the hash-only fast path is supposed to suppress ' + 'this stream).', + ); + } + } + } + + final sorted = [...wallUs]..sort(); + return _ScalingResult( + cellBytes: cellBytes, + medianUs: sorted[sorted.length ~/ 2], + p90Us: sorted[(sorted.length * 0.9).floor().clamp(0, sorted.length - 1)], + p99Us: sorted[(sorted.length * 0.99).floor().clamp(0, sorted.length - 1)], + minUs: sorted.first, + maxUs: sorted.last, + ); + } finally { + await barrierSub?.cancel(); + for (final sub in unchangedSubs) { + await sub.cancel(); + } + } + } finally { + await db.close(); + await tempDir.delete(recursive: true); + } +} + +String _longTextPayload(int targetBytes, int seed) { + final prefix = 'seed_$seed:'; + const chunk = 'abcdefghijklmnopqrstuvwxyz0123456789'; + final buffer = StringBuffer(prefix); + while (buffer.length < targetBytes) { + buffer.write(chunk); + } + return buffer.toString().substring(0, targetBytes); +} + +Future main(List args) async { + if (!kProfileMode) { + stderr.writeln( + 'WARNING: kProfileMode=false; profile-only counters from other ' + 'audits will stay zero. Wall measurements still work — the ' + 'scaling decision in this audit is end-to-end wall, not ' + 'counter-derived.', + ); + } + + final writeMarkdown = args.contains('--markdown'); + + final results = <_ScalingResult>[]; + for (final size in cellSizesBytes) { + stderr.writeln( + 'Running cell size ${(size / 1024).toStringAsFixed(0)}KB ' + '($iterationsPerSize iterations)...', + ); + results.add(await runOneSize(size)); + } + + final markdown = _renderMarkdown(results); + print(markdown); + + if (writeMarkdown) { + final outFile = File( + 'benchmark/profile/results/exp-137-long-text-scaling-aggregate.md', + ); + await outFile.writeAsString(markdown); + print('Wrote ${outFile.path}'); + } +} + +String _renderMarkdown(List<_ScalingResult> results) { + final buf = StringBuffer(); + buf.writeln('# Experiment 137 - Long-Text Cell-Size Scaling Audit'); + buf.writeln(); + buf.writeln( + 'Profile-mode harness: ' + '`benchmark/profile/long_text_scaling_audit.dart`', + ); + buf.writeln(); + buf.writeln( + 'Workload shape: $unchangedStreamCount unchanged streams x $rowCount ' + 'rows, one barrier stream, $iterationsPerSize timed INSERT ' + 'iterations per cell size after $warmupIterations warmups.', + ); + buf.writeln(); + buf.writeln( + 'Wall convention: per-iteration `Stopwatch` brackets the INSERT ' + 'plus the wait for the barrier stream to re-emit. The unchanged ' + 'streams must not emit (their hash-only fast path is supposed to ' + 'suppress re-delivery); the harness asserts this on every ' + 'iteration. The hash-loop work the unchanged streams do during ' + 'each iteration is the cost the scaling sweep is targeting.', + ); + buf.writeln(); + buf.writeln('Command:'); + buf.writeln(); + buf.writeln('```text'); + buf.writeln( + 'dart run -DRESQLITE_PROFILE=true ' + 'benchmark/profile/long_text_scaling_audit.dart --markdown', + ); + buf.writeln('```'); + buf.writeln(); + buf.writeln('## Wall by cell size'); + buf.writeln(); + buf.writeln( + '| cell size | median_ms | p90_ms | p99_ms | min_ms | max_ms |', + ); + buf.writeln( + '|---|---:|---:|---:|---:|---:|', + ); + for (final row in results) { + buf.writeln( + '| ${_formatCellSize(row.cellBytes)} | ' + '${(row.medianUs / 1000).toStringAsFixed(2)} | ' + '${(row.p90Us / 1000).toStringAsFixed(2)} | ' + '${(row.p99Us / 1000).toStringAsFixed(2)} | ' + '${(row.minUs / 1000).toStringAsFixed(2)} | ' + '${(row.maxUs / 1000).toStringAsFixed(2)} |', + ); + } + buf.writeln(); + buf.writeln('## Per-byte cost'); + buf.writeln(); + buf.writeln( + 'The fanout wave hashes every unchanged stream\'s full result ' + '($rowCount rows x $unchangedStreamCount unchanged streams) plus ' + 'the barrier stream\'s full result (${rowCount + 1} rows after ' + 'the INSERT lands). `hashed_bytes_per_iter ≈ cell_bytes x ' + '(${unchangedStreamCount * rowCount} + ${rowCount + 1})`. ' + '`ns_per_byte` divides the median wall by the total hashed bytes ' + 'to isolate the per-byte cost from the per-iteration overhead.', + ); + buf.writeln(); + buf.writeln( + '| cell size | hashed_bytes_per_iter | ns_per_byte (median) |', + ); + buf.writeln('|---|---:|---:|'); + for (final row in results) { + buf.writeln( + '| ${_formatCellSize(row.cellBytes)} | ' + '${row.totalHashedBytes.toStringAsFixed(0)} | ' + '${row.nsPerByte.toStringAsFixed(3)} |', + ); + } + buf.writeln(); + buf.writeln('## Reading the table'); + buf.writeln(); + buf.writeln( + '- `median_ms` is the per-iteration wall: one INSERT plus the ' + 'fanout wave that re-hashes every unchanged stream\'s result.', + ); + buf.writeln( + '- `ns_per_byte` is the per-byte cost averaged across the full ' + 'hashed payload. If hashing is the bottleneck, this number stays ' + 'roughly flat across cell sizes.', + ); + buf.writeln( + '- Drift downward as cell sizes grow points to a per-iteration ' + 'overhead floor (mutex acquisition, microtask scheduling, ' + 'isolate dispatch) hiding the per-byte cost at small sizes.', + ); + buf.writeln( + '- Drift upward at large sizes points to a non-hash cost emerging ' + '— allocation, GC pressure, page cache misses, or SQLite text ' + 'fetch stalling on disk.', + ); + buf.writeln(); + buf.writeln('## Interpretation'); + buf.writeln(); + buf.writeln( + 'See `experiments/137-long-text-cell-scaling.md` for the decision ' + 'and follow-up notes attached to these numbers.', + ); + return buf.toString(); +} + +String _formatCellSize(int bytes) { + if (bytes < 1024) return '${bytes}B'; + if (bytes < 1024 * 1024) return '${(bytes / 1024).toStringAsFixed(0)}KB'; + return '${(bytes / (1024 * 1024)).toStringAsFixed(1)}MB'; +} diff --git a/benchmark/profile/results/exp-137-long-text-scaling-aggregate.md b/benchmark/profile/results/exp-137-long-text-scaling-aggregate.md new file mode 100644 index 0000000..22957cc --- /dev/null +++ b/benchmark/profile/results/exp-137-long-text-scaling-aggregate.md @@ -0,0 +1,46 @@ +# Experiment 137 - Long-Text Cell-Size Scaling Audit + +Profile-mode harness: `benchmark/profile/long_text_scaling_audit.dart` + +Workload shape: 8 unchanged streams x 256 rows, one barrier stream, 30 timed INSERT iterations per cell size after 3 warmups. + +Wall convention: per-iteration `Stopwatch` brackets the INSERT plus the wait for the barrier stream to re-emit. The unchanged streams must not emit (their hash-only fast path is supposed to suppress re-delivery); the harness asserts this on every iteration. The hash-loop work the unchanged streams do during each iteration is the cost the scaling sweep is targeting. + +Command: + +```text +dart run -DRESQLITE_PROFILE=true benchmark/profile/long_text_scaling_audit.dart --markdown +``` + +## Wall by cell size + +| cell size | median_ms | p90_ms | p99_ms | min_ms | max_ms | +|---|---:|---:|---:|---:|---:| +| 4KB | 2.11 | 3.09 | 3.23 | 1.82 | 3.23 | +| 16KB | 4.50 | 5.53 | 6.98 | 4.00 | 6.98 | +| 32KB | 9.49 | 10.67 | 12.87 | 8.40 | 12.87 | +| 64KB | 27.52 | 33.72 | 34.72 | 16.01 | 34.72 | +| 128KB | 44.51 | 53.92 | 55.84 | 35.07 | 55.84 | + +## Per-byte cost + +The fanout wave hashes every unchanged stream's full result (256 rows x 8 unchanged streams) plus the barrier stream's full result (257 rows after the INSERT lands). `hashed_bytes_per_iter ≈ cell_bytes x (2048 + 257)`. `ns_per_byte` divides the median wall by the total hashed bytes to isolate the per-byte cost from the per-iteration overhead. + +| cell size | hashed_bytes_per_iter | ns_per_byte (median) | +|---|---:|---:| +| 4KB | 9441280 | 0.224 | +| 16KB | 37765120 | 0.119 | +| 32KB | 75530240 | 0.126 | +| 64KB | 151060480 | 0.182 | +| 128KB | 302120960 | 0.147 | + +## Reading the table + +- `median_ms` is the per-iteration wall: one INSERT plus the fanout wave that re-hashes every unchanged stream's result. +- `ns_per_byte` is the per-byte cost averaged across the full hashed payload. If hashing is the bottleneck, this number stays roughly flat across cell sizes. +- Drift downward as cell sizes grow points to a per-iteration overhead floor (mutex acquisition, microtask scheduling, isolate dispatch) hiding the per-byte cost at small sizes. +- Drift upward at large sizes points to a non-hash cost emerging — allocation, GC pressure, page cache misses, or SQLite text fetch stalling on disk. + +## Interpretation + +See `experiments/137-long-text-cell-scaling.md` for the decision and follow-up notes attached to these numbers. diff --git a/docs/experiments/history.json b/docs/experiments/history.json index 366057f..c36c218 100644 --- a/docs/experiments/history.json +++ b/docs/experiments/history.json @@ -1,5 +1,5 @@ { - "generated": "2026-05-09T15:12:01.705071", + "generated": "2026-05-12T07:18:20.382138", "runs": [ { "id": "baseline-before-authorizer-hooks", @@ -18452,6 +18452,19 @@ "approach": "The archived implementation accumulates dirty `(table, rowid)` pairs in a\nbounded native set. It borrows the already-stable dirty-table name storage, so\nsingle-row writes do not allocate another table string for rowid precision.\nOverflow or allocation uncertainty returns zero rowid details and keeps the\nexisting table/column invalidation path.\n\nDart bindings decode those rowids into `TableRowDependency`. Row precision is\nan optimization layer: when both stream and write sides have rowids, a\nnon-overlap skips immediately; when either side lacks rowids, the existing\ncolumn/table logic decides.\n\n`StreamEngine` attaches read-side rowids only for narrow SQL it can prove:\n\n- one positional parameter;\n- a simple `FROM table WHERE id = ?` or intrinsic `rowid = ?` shape;\n- exactly one tracked de...", "results": "Focused profile:\n[`benchmark/profile/results/exp-134-keyed-pk-dirty-elision.md`](../benchmark/profile/results/exp-134-keyed-pk-dirty-elision.md)\n\n| workload | baseline wall_ms | candidate wall_ms | delta | baseline intersection_entries | candidate intersection_entries |\n|---|---:|---:|---:|---:|---:|\n| keyed PK subscriptions | 25.54 | 12.45 | -51.3% | 10000 | 3 |\n\nRelease guardrails:\n\n| workload | baseline | candidate | delta |\n|---|---:|---:|---:|\n| many-streams disjoint | 23,946 w/s | 24,618 w/s | +2.8% |\n| many-streams overlap | 9,297 w/s | 8,763 w/s | -5.7% |\n| public keyed-PK wall | 223.32 ms | 217.75 ms | -2.5% |\n\nThe public keyed-PK suite includes a quiet-window drain, so its 200 ms floor\nhides most of the writer-burst improvement. The profile harness uses the exp\n121 wall conven...", "reasoning": "**Rejected, but recorded as future evidence.**\n\nThis is a real keyed-PK miss-path optimization: 10,000 per-stream\nintersection probes collapse to the 3 actual watched-row hits. The result is\nstrong enough to keep as evidence that row-level invalidation can matter for\nkeyed subscriptions.\n\nThe implementation shape is the problem. Making this an internal optimization\nrequires `StreamEngine` to recognize and prove more SQL text shapes over time.\nThat is too fragile for the value captured here, especially because aliases,\njoins, composite keys, non-`id` aliases, views, and `WITHOUT ROWID` tables all\nneed conservative escape hatches. The production implementation has been\nremoved from the PR; the implementation commit is preserved by the archive tag\nfor future reference." + }, + { + "id": "137", + "title": "Long-text cell-size scaling audit", + "date": "2026-05-12", + "status": "in_review", + "summary": "Measurement-only: sweeps the exp 110 unchanged-fanout shape across [4KB, 16KB, 32KB, 64KB, 128KB] cells. Wall scales linearly with bytes from 16KB up; per-byte cost converges to a stable 0.12–0.19 ns/byte band on the existing 8-byte FNV chunked loop. The 4KB release shape sits ~2x above the band because per-iteration overhead dominates at that size — a faster hash variant would barely move it. Closes the long-text-stream-hashing direction's `blockedOnMeasurement` gate, replaces the broader-payload `openCandidate` with a wider FNV / SIMD probe candidate gated on a real ≥16KB workload", + "commit": null, + "problem": "[Exp 099](099-fnv-8byte-bytestream.md) was rejected as benchmark-invisible\nbecause the streaming suite at the time only carried short cells.\n[Exp 110](110-long-text-fnv-8byte.md) added a long-text unchanged-fanout\nbenchmark — 8 unchanged streams x 256 rows x **4KB** ASCII TEXT cells —\nand the same 8-byte FNV change measured **-76%** on its median wall.\n\n`signals.json#long-text-stream-hashing` left two related entries open\nafter that win:\n\n- a `blockedOnMeasurement` requesting \"long-payload streaming workload\n at sizes beyond exp 110's 4KB cells\", and\n- a 2026-04-29 `openCandidate` (`addedAfter: 110`) for \"broader\n long-payload workload (>= 32KB TEXT cells, mixed BLOB/TEXT)\" with\n `blockedOn: no benchmark covers payloads larger than the exp 110\n 4KB shape`.\n\nUntil those entries close...", + "hypothesis": "After exp 110 + the existing 8-byte FNV path, per-byte hashing cost\nshould be roughly flat across cell sizes once the per-iteration\noverhead floor (writer round-trip, mutex acquisition, microtask\nscheduling) is amortized. The 4KB shape sits below the amortization\npoint, so its `ns_per_byte` should be artificially high; sizes from\n≥16KB onward should converge to a stable per-byte band that\ncharacterizes the actual hash-loop throughput on the test machine.\n\nAccept this as a measurement experiment if:\n\n- the audit produces stable per-cell-size median walls across\n repeated passes (≤ ±10% drift on the dominant signal);\n- the audit resolves the `signals.json` open candidate one way or\n the other;\n- the run updates `blockedOnMeasurement` and `openCandidates`\n accordingly.", + "approach": "Added one profile-mode harness:\n\n```text\nbenchmark/profile/long_text_scaling_audit.dart\n```\n\nThe harness reuses exp 110's workload shape — 8 unchanged streams\nprojecting `id, body, sid` with `WHERE id < 256` predicates, one\nbarrier stream projecting the full table, 256 seed rows, one INSERT\nper iteration with a row outside every unchanged stream's predicate\n— and sweeps the per-cell byte size across `[4KB, 16KB, 32KB, 64KB,\n128KB]`. Each cell size runs 3 warmup iterations followed by 30\ntimed iterations; the per-iteration wall is the `Stopwatch` around\n`db.execute(insert)` plus the wait for the barrier stream to\nre-emit. The unchanged streams must not emit (the hash-only fast\npath is supposed to suppress re-delivery); the harness asserts\nthis on every iteration.\n\nThe `ns_per_byte` colum...", + "results": "Three repeated passes; values bracket the per-run band.\n\nPer-iteration wall:\n\n| cell size | median_ms (a/b/c) | p90_ms | p99_ms |\n|-----------|--------------------------:|-------------:|-------------:|\n| 4KB | 2.11 / 2.23 / 2.11 | 2.89 – 3.45 | 3.23 – 4.62 |\n| 16KB | 4.50 / 4.28 / 4.52 | 5.27 – 5.55 | 6.25 – 7.14 |\n| 32KB | 9.49 / 9.62 / 9.47 | 10.38 – 10.67| 11.66 – 12.87|\n| 64KB | 27.52 / 28.13 / 25.26 | 33.72 – 35.16| 34.72 – 36.09|\n| 128KB | 44.51 / 42.91 / 47.41 | 53.92 – 55.60| 55.84 – 62.64|\n\nPer-byte cost:\n\n| cell size | hashed_bytes_per_iter | ns_per_byte (median, a/b/c) |\n|-----------|----------------------:|----------------------------:|\n| 4KB | 9,441,280 | 0.224 / 0.236 / 0.223 ...", + "reasoning": "**Accept for review — measurement.**\n\nThe audit answers the open `signals.json` question for the\n`long-text-stream-hashing` direction:\n\n> long-payload streaming workload at sizes beyond exp 110's 4KB cells\n\nWall scales **linearly** with bytes from 16KB up, with per-byte cost\nin a stable 0.12 – 0.19 ns/byte band. Hashing — not SQLite text-fetch,\nnot Dart-side allocation, not isolate transfer — is the dominant cost\non long-cell unchanged-fanout workloads at meaningful cell sizes.\n\nWhat the resolution changes:\n\n- **Removes the `blockedOnMeasurement` entry**: the workload that was\n missing now exists at five sizes.\n- **Closes the matching `openCandidate`** (broader long-payload\n workload).\n- **Adds a new `openCandidate`**: a wider FNV unroll or SIMD\n (AVX2/NEON) probe for the byte-stream..." } ], "tracked": [ diff --git a/experiments/137-long-text-cell-scaling.md b/experiments/137-long-text-cell-scaling.md new file mode 100644 index 0000000..02ca7b1 --- /dev/null +++ b/experiments/137-long-text-cell-scaling.md @@ -0,0 +1,201 @@ +# Experiment 137: Long-text cell-size scaling audit + +**Date:** 2026-05-12 +**Status:** In Review +**Direction:** `long-text-stream-hashing`, `measurement-system` + +## Problem + +[Exp 099](099-fnv-8byte-bytestream.md) was rejected as benchmark-invisible +because the streaming suite at the time only carried short cells. +[Exp 110](110-long-text-fnv-8byte.md) added a long-text unchanged-fanout +benchmark — 8 unchanged streams x 256 rows x **4KB** ASCII TEXT cells — +and the same 8-byte FNV change measured **-76%** on its median wall. + +`signals.json#long-text-stream-hashing` left two related entries open +after that win: + +- a `blockedOnMeasurement` requesting "long-payload streaming workload + at sizes beyond exp 110's 4KB cells", and +- a 2026-04-29 `openCandidate` (`addedAfter: 110`) for "broader + long-payload workload (>= 32KB TEXT cells, mixed BLOB/TEXT)" with + `blockedOn: no benchmark covers payloads larger than the exp 110 + 4KB shape`. + +Until those entries close, the direction cannot tell whether the +hash-loop cost continues to drive wall as cells grow, or whether some +other cost — SQLite text-fetch over overflow pages, page cache, GC, +isolate transfer — takes over. Either outcome is decision-making: + +- If wall scales linearly with bytes at the larger sizes, hashing + dominates and another hash variant (wider unroll, SIMD probe) is the + natural next attempt for any future workload that ships ≥16KB cells. +- If wall scales sub-linearly, a non-hash cost is the bottleneck at + the long end and the direction can deprioritize hash-loop work. +- If wall scales super-linearly, allocation / GC / isolate-transfer + cost emerges and points at a new direction entirely. + +## Hypothesis + +After exp 110 + the existing 8-byte FNV path, per-byte hashing cost +should be roughly flat across cell sizes once the per-iteration +overhead floor (writer round-trip, mutex acquisition, microtask +scheduling) is amortized. The 4KB shape sits below the amortization +point, so its `ns_per_byte` should be artificially high; sizes from +≥16KB onward should converge to a stable per-byte band that +characterizes the actual hash-loop throughput on the test machine. + +Accept this as a measurement experiment if: + +- the audit produces stable per-cell-size median walls across + repeated passes (≤ ±10% drift on the dominant signal); +- the audit resolves the `signals.json` open candidate one way or + the other; +- the run updates `blockedOnMeasurement` and `openCandidates` + accordingly. + +## Approach + +Added one profile-mode harness: + +```text +benchmark/profile/long_text_scaling_audit.dart +``` + +The harness reuses exp 110's workload shape — 8 unchanged streams +projecting `id, body, sid` with `WHERE id < 256` predicates, one +barrier stream projecting the full table, 256 seed rows, one INSERT +per iteration with a row outside every unchanged stream's predicate +— and sweeps the per-cell byte size across `[4KB, 16KB, 32KB, 64KB, +128KB]`. Each cell size runs 3 warmup iterations followed by 30 +timed iterations; the per-iteration wall is the `Stopwatch` around +`db.execute(insert)` plus the wait for the barrier stream to +re-emit. The unchanged streams must not emit (the hash-only fast +path is supposed to suppress re-delivery); the harness asserts +this on every iteration. + +The `ns_per_byte` column divides the median wall by +`cell_bytes × (unchanged_streams × row_count + (row_count + 1))`. +Each unchanged stream re-hashes its full 256-row result every +fanout wave; the barrier stream re-hashes 257 rows after the new +row lands. At 16KB cells that is ~38 MB of hashed payload per +iteration; at 128KB it is ~302 MB. `ns_per_byte` is the per-byte +cost averaged across that payload, so it isolates hash-loop +throughput from the per-iteration overhead floor. + +The harness does not require `kProfileMode` to produce a useful +report (the scaling decision rests on end-to-end wall, not +profile-only counters), but it warns when run without it because +peer audits use the same convention. + +## Results + +Three repeated passes; values bracket the per-run band. + +Per-iteration wall: + +| cell size | median_ms (a/b/c) | p90_ms | p99_ms | +|-----------|--------------------------:|-------------:|-------------:| +| 4KB | 2.11 / 2.23 / 2.11 | 2.89 – 3.45 | 3.23 – 4.62 | +| 16KB | 4.50 / 4.28 / 4.52 | 5.27 – 5.55 | 6.25 – 7.14 | +| 32KB | 9.49 / 9.62 / 9.47 | 10.38 – 10.67| 11.66 – 12.87| +| 64KB | 27.52 / 28.13 / 25.26 | 33.72 – 35.16| 34.72 – 36.09| +| 128KB | 44.51 / 42.91 / 47.41 | 53.92 – 55.60| 55.84 – 62.64| + +Per-byte cost: + +| cell size | hashed_bytes_per_iter | ns_per_byte (median, a/b/c) | +|-----------|----------------------:|----------------------------:| +| 4KB | 9,441,280 | 0.224 / 0.236 / 0.223 | +| 16KB | 37,765,120 | 0.119 / 0.113 / 0.120 | +| 32KB | 75,530,240 | 0.126 / 0.127 / 0.125 | +| 64KB | 151,060,480 | 0.182 / 0.186 / 0.167 | +| 128KB | 302,120,960 | 0.147 / 0.142 / 0.157 | + +Aggregate file: +[`benchmark/profile/results/exp-137-long-text-scaling-aggregate.md`](../benchmark/profile/results/exp-137-long-text-scaling-aggregate.md). + +The 4KB row sits ~2x above the larger-size band because the +per-iteration overhead (writer round-trip, microtask scheduling, +isolate dispatch, the per-iteration String allocation in the harness +itself) is comparable in absolute terms to the hashing work at small +sizes. From 16KB upward the per-byte cost converges to the **0.12 – +0.19 ns/byte** band — the implied hash-loop throughput is roughly +~6 GB/s per stream, about what the 8-byte FNV chunked loop should +sustain on a modern desktop CPU. + +The 64KB row carries a small per-byte hump (~0.17 – 0.19 ns/byte vs +~0.13 – 0.16 ns/byte at 32KB and 128KB) and a wider min-to-max spread +(min 15 – 16 ms vs median 25 – 28 ms vs max 34 – 36 ms). The +per-iteration String allocation crosses an old-generation GC threshold +at that size on this VM build; the 32KB and 128KB rows happen to land +on cleaner sides of that boundary. The hump sits inside the broader +0.12 – 0.19 ns/byte band and does not change the linear-scaling +verdict. + +## Decision + +**Accept for review — measurement.** + +The audit answers the open `signals.json` question for the +`long-text-stream-hashing` direction: + +> long-payload streaming workload at sizes beyond exp 110's 4KB cells + +Wall scales **linearly** with bytes from 16KB up, with per-byte cost +in a stable 0.12 – 0.19 ns/byte band. Hashing — not SQLite text-fetch, +not Dart-side allocation, not isolate transfer — is the dominant cost +on long-cell unchanged-fanout workloads at meaningful cell sizes. + +What the resolution changes: + +- **Removes the `blockedOnMeasurement` entry**: the workload that was + missing now exists at five sizes. +- **Closes the matching `openCandidate`** (broader long-payload + workload). +- **Adds a new `openCandidate`**: a wider FNV unroll or SIMD + (AVX2/NEON) probe for the byte-stream loop. Its `blockedOn` is the + shape of any *real* workload that ships ≥16KB cells; exp 110's + release-suite shape is 4KB and would not see a measurable change + because the 4KB row sits above the per-byte band. +- **Updates `currentRead`**: hash-loop variants are workload-dependent. + The 4KB release-suite shape is per-iteration-overhead-bound; ≥16KB + workloads would be per-byte-bound and could see a hash variant pay + off proportional to bytes. + +What the resolution does *not* change: + +- The 8-byte FNV chunked loop stays correct and accepted; nothing in + the data argues for reverting exp 110 or its underlying exp 099 + implementation. +- Mixed BLOB/TEXT workload coverage is still missing (the original + candidate listed both). BLOB cells go through the same + `fnv_combine_bytes` path as TEXT cells inside `resqlite_query_hash`, + so the per-byte band should be substantially identical, but that + is not directly measured here. Filed as a smaller follow-up + candidate rather than a blocker. + +## Future Notes + +- A future hash-loop experiment (16-byte unroll, 32-byte unroll, AVX2 + probe, NEON probe, or an architecturally different state-pipelined + hash) should compare medians against this audit's 16KB / 32KB / + 64KB / 128KB band, not against exp 110's 4KB benchmark. The 4KB + benchmark is per-iteration-overhead-bound and will not move + proportionally to a per-byte hash improvement. +- A future workload at ≥16KB cells (long content streams, document + archives, large JSON blobs) is the natural trigger for revisiting + hash-loop work. Without one, removing the entire byte-stream hash + loop would still only save ~0.12 – 0.19 ns per byte hashed, which + is below the ±10% per-benchmark decision threshold for the + current 4KB release-suite shape. +- A future BLOB-shape audit at the same sweep sizes would confirm + TEXT/BLOB symmetry; the underlying C path is shared, so a + divergence between TEXT and BLOB at the same cell size would point + at a SQLite text-fetch difference rather than a hash difference. +- The 64KB GC-spread observation (min 16 ms / max 36 ms / median ~26 + ms across passes) suggests the harness's per-iteration String + payload allocation is itself a measurable signal at that size. A + pre-built payload pool variant of this harness would reduce the + spread; deferred because the `ns_per_byte` band is already stable + enough to support the linear-scaling verdict. diff --git a/experiments/README.md b/experiments/README.md index d170ae4..4134301 100644 --- a/experiments/README.md +++ b/experiments/README.md @@ -73,6 +73,7 @@ moved them. | [122](122-concrete-reader-pool-stream-admission.md) | Concrete reader-pool stream admission | Initializes `StreamEngine` with a concrete `ReaderPool` so `_flushQueue` stays synchronous and bounded by `availableWorkerCount`; tests now use diagnostics for stream registry size, and post-rebase profile counters stay at zero parks/retries/max-parked on A11c overlap and keyed-PK workloads | | | [125](125-wide-ascii-batch-params.md) | Wide ASCII batch parameter encoding | Direct ASCII payload packing skips temporary per-string UTF-8 lists in large wide batches; focused 10k x20 improves 17.199 → 12.760 ms and release Wide Batch Insert improves 18.201 → 13.031 ms | | | [126](126-wide-utf8-batch-packing.md) | Wide UTF-8 batch parameter packing | Direct UTF-8 payload packing extends exp 125's allocation win to guarded non-ASCII wide batches; focused Unicode 10k x20 improves 21.945 → 18.988 ms and emoji 10k x20 improves 24.187 → 17.458 ms while release write-suite guardrails remain neutral | | +| [137](137-long-text-cell-scaling.md) | Long-text cell-size scaling audit | Measurement-only: sweeps the exp 110 unchanged-fanout shape across [4KB, 16KB, 32KB, 64KB, 128KB] cells. Wall scales linearly with bytes from 16KB up; per-byte cost converges to a stable 0.12–0.19 ns/byte band on the existing 8-byte FNV chunked loop. The 4KB release shape sits ~2x above the band because per-iteration overhead dominates at that size — a faster hash variant would barely move it. Closes the long-text-stream-hashing direction's `blockedOnMeasurement` gate, replaces the broader-payload `openCandidate` with a wider FNV / SIMD probe candidate gated on a real ≥16KB workload | | ## Rejected diff --git a/experiments/signals.json b/experiments/signals.json index 36a3064..d5befc5 100644 --- a/experiments/signals.json +++ b/experiments/signals.json @@ -100,30 +100,33 @@ "id": "long-text-stream-hashing", "status": "watch", "subsystems": ["streaming", "hashing", "text"], - "currentRead": "Native hashing is valuable, and long TEXT cells now have a representative unchanged-fanout benchmark. Chunked byte-stream folding produced a large targeted win; further variants need a new profile signal or a broader long-payload workload.", - "keyPriors": ["075", "099", "110"], + "currentRead": "Native hashing is valuable, and long TEXT cells now have a representative unchanged-fanout benchmark plus a 4KB→128KB scaling audit. Exp 110 produced a -76% targeted win on the 4KB shape with the 8-byte FNV chunked loop. Exp 137 then swept the same workload across [4KB, 16KB, 32KB, 64KB, 128KB] and found wall scales linearly with bytes from 16KB up, with per-byte cost in a stable 0.12–0.19 ns/byte band. The 4KB shape sits ~2x above the band because per-iteration overhead (writer round-trip, mutex, microtask scheduling) is comparable to the hashing work at that size. Hash-loop variants are workload-dependent: the 4KB release-suite shape is per-iteration-overhead-bound and would barely move under a faster hash; ≥16KB workloads are per-byte-bound and would see a hash variant pay proportional to bytes.", + "keyPriors": ["075", "099", "110", "137"], "archive": ["033"], "interestingIf": [ - "a production profile shows long TEXT/BLOB stream hashing remains hot after chunked folding", - "the workload includes unchanged fan-out with larger text payloads", - "a hash-loop variant improves the long-text benchmark without hurting short-cell streams" + "a production profile or shipped workload uses ≥16KB TEXT/BLOB cells in unchanged fan-out", + "a hash-loop variant (wider unroll, SIMD probe) improves the exp 137 16KB+ band without hurting short-cell streams", + "a benchmark surfaces a non-hash long-cell cost (SQLite overflow page IO, allocation, GC) the exp 137 audit could not see" ], "openQuestions": [ "What text sizes appear in realistic stream workloads?", - "Does the remaining long-text cost sit in hashing, SQLite text access, Dart decode, or result delivery?" + "Do BLOB cells share the same per-byte band as TEXT cells at the same sizes (the underlying C path is shared, so divergence would indicate a SQLite text-fetch difference)?" ], "openCandidates": [ { - "idea": "broader long-payload workload (≥ 32KB TEXT cells, mixed BLOB/TEXT)", - "addedDate": "2026-04-29", - "addedAfter": "110", - "blockedOn": "no benchmark covers payloads larger than the exp 110 4KB shape" + "idea": "wider FNV unroll or SIMD (AVX2/NEON) probe for the byte-stream loop", + "addedDate": "2026-05-12", + "addedAfter": "137", + "blockedOn": "no shipped workload uses ≥16KB cells; exp 110's 4KB release-suite shape is per-iteration-overhead-bound and would not measurably move" + }, + { + "idea": "BLOB-shape companion of the exp 137 sweep to confirm TEXT/BLOB symmetry", + "addedDate": "2026-05-12", + "addedAfter": "137" } ], - "blockedOnMeasurement": [ - "long-payload streaming workload at sizes beyond exp 110's 4KB cells" - ], - "notesForExperimenters": "Use the long-text unchanged-fanout benchmark before trying another hash-loop change. The direction is no longer blocked on measurement, but the obvious 8-byte fold has already been tried." + "blockedOnMeasurement": [], + "notesForExperimenters": "Compare hash-loop variants against the exp 137 16KB+ band, not against exp 110's 4KB benchmark — the 4KB shape is per-iteration-overhead-bound and will not move proportionally to a per-byte hash improvement. The structural ceiling for removing all byte-stream hashing on the current release shape is ~0.22 ns/byte × 4KB × 256 rows × 8 streams ≈ 1.8 ms per fanout wave (vs ~2.1 ms median wall) — a faster hash would save almost all of that, but the absolute saving is below the ±10% per-benchmark decision threshold." }, { "id": "sqlite-version-and-build-config", @@ -542,6 +545,20 @@ "watch release-suite Wide Batch Insert and narrow Batch Insert together because the public suite is still ASCII-heavy", "only pursue blob-heavy or broader embedded-NUL work with a workload that crosses the same large/wide guard" ] + }, + "137": { + "directions": ["long-text-stream-hashing", "measurement-system"], + "outcomeClass": "in_review_measurement", + "changedBeliefs": [ + "Long-cell unchanged-fanout wall scales linearly with bytes from 16KB up; per-byte cost converges to a stable 0.12–0.19 ns/byte band on the 8-byte FNV chunked loop", + "Exp 110's 4KB release shape sits ~2x above the per-byte band because per-iteration overhead (writer round-trip, mutex, microtask scheduling) is comparable to the hashing work at that size — a faster hash would barely move it", + "Hash-loop variants (wider unroll, SIMD probe) are interesting only for ≥16KB workloads; current release-suite shape is per-iteration-overhead-bound, not hash-bound" + ], + "nextSignals": [ + "compare any future hash-loop variant against the exp 137 16KB+ band, not the exp 110 4KB benchmark", + "trigger a wider FNV / SIMD probe only when a real workload ships ≥16KB cells", + "confirm TEXT/BLOB symmetry with a BLOB-shape companion sweep before generalizing the band to non-text cells" + ] } } } From f1497ce3688190e67ec4046b09699fc755968a4a Mon Sep 17 00:00:00 2001 From: Dan Reynolds Date: Tue, 12 May 2026 07:32:11 -0400 Subject: [PATCH 2/2] Address Copilot review on PR #112 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch the long-text scaling audit harness from an INSERT-driven barrier (whose `SELECT id, body FROM long_items ORDER BY id` projection grows by one row per iteration) to a fixed-row UPDATE-driven barrier at `id = 999999`, picked outside every unchanged stream's `id < 256` predicate. The barrier stream becomes `SELECT id, body FROM long_items WHERE id = ?` so its result stays at exactly one row across every iteration; the unchanged streams stay at exactly 256 rows. Per-iteration hashed payload is now constant within each cell size, so `ns_per_byte` is no longer biased toward later (heavier) iterations. Also fixes a tempdir leak: `Database.open` is now inside the outer `try` so the `await tempDir.delete(recursive: true)` in the `finally` always runs even if open throws. Re-ran the audit three passes; the corrected per-byte band sits at 0.065 – 0.080 ns/byte from 16KB up (~13–15 GB/s implied per-stream throughput). The qualitative verdict is unchanged: linear scaling with bytes from 16KB up, 4KB shape sits ~2x above the band because per-iteration overhead dominates. signals.json, the experiment writeup, the aggregate markdown, and the regenerated docs/experiments/history.json are all updated to match. Co-Authored-By: Claude Opus 4.7 --- .../profile/long_text_scaling_audit.dart | 270 ++++++++++-------- .../exp-137-long-text-scaling-aggregate.md | 28 +- docs/experiments/history.json | 10 +- experiments/137-long-text-cell-scaling.md | 126 ++++---- experiments/README.md | 2 +- experiments/signals.json | 9 +- 6 files changed, 249 insertions(+), 196 deletions(-) diff --git a/benchmark/profile/long_text_scaling_audit.dart b/benchmark/profile/long_text_scaling_audit.dart index 7b3cb3d..6612eee 100644 --- a/benchmark/profile/long_text_scaling_audit.dart +++ b/benchmark/profile/long_text_scaling_audit.dart @@ -69,146 +69,171 @@ class _ScalingResult { // Per-byte wall is a stand-in for "would 2x the bytes give 2x the // wall?". The fanout wave hashes: - // - every unchanged stream's full result (rowCount rows), times + // - every unchanged stream's full result (`rowCount` rows), times // `unchangedStreamCount` streams, - // - plus the barrier stream's full result (rowCount + 1 rows after - // the INSERT lands). + // - plus the barrier stream's single fixed row (its body changes + // each iteration so the barrier emits, but the result-set size + // stays at exactly one row — the unchanged streams remain the + // dominant hashed-payload source). // SQLite TEXT cells stored on the same row as INTEGER columns return // the full payload pointer for hashing, so per-row hashed bytes are // approximately `cellBytes` (id/marker integer columns add a few // bytes of fold work each — kept in the formula as cellBytes only // because the integer-column contribution is negligible at >=4KB). double get totalHashedBytes => - cellBytes.toDouble() * - (unchangedStreamCount * rowCount + (rowCount + 1)); + cellBytes.toDouble() * (unchangedStreamCount * rowCount + 1); double get nsPerByte => (medianUs * 1000.0) / totalHashedBytes; } +// Fixed primary key for the barrier row. Sits well outside every +// unchanged stream's `id < rowCount` predicate so the unchanged +// streams never observe a column change from the barrier UPDATE, +// while the barrier stream's `id = $barrierRowId` predicate keeps +// the result set at exactly one row across every iteration. This +// matters for `ns_per_byte`: an INSERT-driven barrier would grow +// the barrier's hashed payload by one row per iteration, drifting +// the per-iteration hashed-bytes denominator by ~1.3% over 30 +// timed iterations and biasing later iterations heavier. +const int barrierRowId = 999999; + Future<_ScalingResult> runOneSize(int cellBytes) async { final tempDir = await Directory.systemTemp.createTemp( 'long_text_scaling_${cellBytes}_', ); - final db = await resqlite.Database.open('${tempDir.path}/r.db'); try { - const createSql = - 'CREATE TABLE long_items(' - 'id INTEGER PRIMARY KEY, ' - 'body TEXT NOT NULL, ' - 'marker INTEGER NOT NULL)'; - const insertSql = - 'INSERT INTO long_items(id, body, marker) VALUES (?, ?, ?)'; + final db = await resqlite.Database.open('${tempDir.path}/r.db'); + try { + const createSql = + 'CREATE TABLE long_items(' + 'id INTEGER PRIMARY KEY, ' + 'body TEXT NOT NULL, ' + 'marker INTEGER NOT NULL)'; + const insertSql = + 'INSERT INTO long_items(id, body, marker) VALUES (?, ?, ?)'; + const updateBarrierSql = + 'UPDATE long_items SET body = ? WHERE id = ?'; - await db.execute(createSql); - await db.executeBatch(insertSql, [ - for (var i = 0; i < rowCount; i++) - [i, _longTextPayload(cellBytes, i), i], - ]); + await db.execute(createSql); + await db.executeBatch(insertSql, [ + for (var i = 0; i < rowCount; i++) + [i, _longTextPayload(cellBytes, i), i], + ]); + // Seed the fixed barrier row outside every unchanged stream's + // predicate. Subsequent iterations UPDATE this row to trigger + // the barrier emission without growing any result set. + await db.execute(insertSql, [ + barrierRowId, + _longTextPayload(cellBytes, barrierRowId), + 0, + ]); - final unchangedSubs = >>>[]; - StreamSubscription>>? barrierSub; + final unchangedSubs = >>>[]; + StreamSubscription>>? barrierSub; - try { - final unchangedEmissions = List.filled(unchangedStreamCount, 0); - final unchangedReady = >[ - for (var i = 0; i < unchangedStreamCount; i++) Completer(), - ]; + try { + final unchangedEmissions = List.filled(unchangedStreamCount, 0); + final unchangedReady = >[ + for (var i = 0; i < unchangedStreamCount; i++) Completer(), + ]; - for (var s = 0; s < unchangedStreamCount; s++) { - final idx = s; - unchangedSubs.add( - db - .stream( - 'SELECT id, body, $s as sid FROM long_items ' - 'WHERE id < $rowCount ORDER BY id', - ) - .listen((_) { - unchangedEmissions[idx]++; - if (!unchangedReady[idx].isCompleted) { - unchangedReady[idx].complete(); - } - }), - ); - } - - final barrierStream = db.stream( - 'SELECT id, body FROM long_items ORDER BY id', - ); - final barrierReady = Completer(); - Completer? waitBarrier; - barrierSub = barrierStream.listen((_) { - if (!barrierReady.isCompleted) { - barrierReady.complete(); - } else if (waitBarrier != null && !waitBarrier.isCompleted) { - waitBarrier.complete(); + for (var s = 0; s < unchangedStreamCount; s++) { + final idx = s; + unchangedSubs.add( + db + .stream( + 'SELECT id, body, $s as sid FROM long_items ' + 'WHERE id < $rowCount ORDER BY id', + ) + .listen((_) { + unchangedEmissions[idx]++; + if (!unchangedReady[idx].isCompleted) { + unchangedReady[idx].complete(); + } + }), + ); } - }); - await Future.wait( - unchangedReady.map((c) => c.future), - ).timeout(const Duration(seconds: 30)); - await barrierReady.future.timeout(const Duration(seconds: 30)); + final barrierStream = db.stream( + 'SELECT id, body FROM long_items WHERE id = ?', + [barrierRowId], + ); + final barrierReady = Completer(); + Completer? waitBarrier; + barrierSub = barrierStream.listen((_) { + if (!barrierReady.isCompleted) { + barrierReady.complete(); + } else if (waitBarrier != null && !waitBarrier.isCompleted) { + waitBarrier.complete(); + } + }); + + await Future.wait( + unchangedReady.map((c) => c.future), + ).timeout(const Duration(seconds: 30)); + await barrierReady.future.timeout(const Duration(seconds: 30)); - var counter = 100000; - final wallUs = []; + var counter = 100000; + final wallUs = []; - // Warmups stabilize cache state and Dart JIT/AOT hot paths so the - // first measured iteration is not an outlier. Discarded from the - // result. - for (var w = 0; w < warmupIterations; w++) { - waitBarrier = Completer(); - await db.execute(insertSql, [ - counter, - _longTextPayload(cellBytes, counter), - w, - ]); - counter++; - await waitBarrier.future.timeout(const Duration(seconds: 30)); - } + // Warmups stabilize cache state and Dart JIT/AOT hot paths so the + // first measured iteration is not an outlier. Discarded from the + // result. + for (var w = 0; w < warmupIterations; w++) { + waitBarrier = Completer(); + await db.execute(updateBarrierSql, [ + _longTextPayload(cellBytes, counter), + barrierRowId, + ]); + counter++; + await waitBarrier.future.timeout(const Duration(seconds: 30)); + } - for (var i = 0; i < iterationsPerSize; i++) { - waitBarrier = Completer(); - final before = List.from(unchangedEmissions); + for (var i = 0; i < iterationsPerSize; i++) { + waitBarrier = Completer(); + final before = List.from(unchangedEmissions); - final sw = Stopwatch()..start(); - await db.execute(insertSql, [ - counter, - _longTextPayload(cellBytes, counter), - i, - ]); - counter++; - await waitBarrier.future.timeout(const Duration(seconds: 30)); - sw.stop(); - wallUs.add(sw.elapsedMicroseconds); + final sw = Stopwatch()..start(); + await db.execute(updateBarrierSql, [ + _longTextPayload(cellBytes, counter), + barrierRowId, + ]); + counter++; + await waitBarrier.future.timeout(const Duration(seconds: 30)); + sw.stop(); + wallUs.add(sw.elapsedMicroseconds); - for (var s = 0; s < unchangedStreamCount; s++) { - if (unchangedEmissions[s] != before[s]) { - throw StateError( - 'Long-text unchanged stream $s emitted at cell size ' - '$cellBytes; the unchanged-fanout invariant has been ' - 'broken (the hash-only fast path is supposed to suppress ' - 'this stream).', - ); + for (var s = 0; s < unchangedStreamCount; s++) { + if (unchangedEmissions[s] != before[s]) { + throw StateError( + 'Long-text unchanged stream $s emitted at cell size ' + '$cellBytes; the unchanged-fanout invariant has been ' + 'broken (the hash-only fast path is supposed to suppress ' + 'this stream).', + ); + } } } - } - final sorted = [...wallUs]..sort(); - return _ScalingResult( - cellBytes: cellBytes, - medianUs: sorted[sorted.length ~/ 2], - p90Us: sorted[(sorted.length * 0.9).floor().clamp(0, sorted.length - 1)], - p99Us: sorted[(sorted.length * 0.99).floor().clamp(0, sorted.length - 1)], - minUs: sorted.first, - maxUs: sorted.last, - ); - } finally { - await barrierSub?.cancel(); - for (final sub in unchangedSubs) { - await sub.cancel(); + final sorted = [...wallUs]..sort(); + return _ScalingResult( + cellBytes: cellBytes, + medianUs: sorted[sorted.length ~/ 2], + p90Us: + sorted[(sorted.length * 0.9).floor().clamp(0, sorted.length - 1)], + p99Us: sorted[(sorted.length * 0.99).floor().clamp(0, sorted.length - 1)], + minUs: sorted.first, + maxUs: sorted.last, + ); + } finally { + await barrierSub?.cancel(); + for (final sub in unchangedSubs) { + await sub.cancel(); + } } + } finally { + await db.close(); } } finally { - await db.close(); await tempDir.delete(recursive: true); } } @@ -267,17 +292,21 @@ String _renderMarkdown(List<_ScalingResult> results) { buf.writeln(); buf.writeln( 'Workload shape: $unchangedStreamCount unchanged streams x $rowCount ' - 'rows, one barrier stream, $iterationsPerSize timed INSERT ' - 'iterations per cell size after $warmupIterations warmups.', + 'rows, one fixed-row barrier stream (id = $barrierRowId, outside ' + 'every unchanged stream\'s `id < $rowCount` predicate), ' + '$iterationsPerSize timed UPDATE iterations against the barrier ' + 'row per cell size after $warmupIterations warmups.', ); buf.writeln(); buf.writeln( - 'Wall convention: per-iteration `Stopwatch` brackets the INSERT ' + 'Wall convention: per-iteration `Stopwatch` brackets the UPDATE ' 'plus the wait for the barrier stream to re-emit. The unchanged ' 'streams must not emit (their hash-only fast path is supposed to ' 'suppress re-delivery); the harness asserts this on every ' - 'iteration. The hash-loop work the unchanged streams do during ' - 'each iteration is the cost the scaling sweep is targeting.', + 'iteration. Using UPDATE against a fixed barrier row keeps every ' + 'result set at constant size across iterations, so per-iteration ' + 'hashed-byte work is constant and the median is not biased toward ' + 'later (heavier) iterations.', ); buf.writeln(); buf.writeln('Command:'); @@ -313,11 +342,11 @@ String _renderMarkdown(List<_ScalingResult> results) { buf.writeln( 'The fanout wave hashes every unchanged stream\'s full result ' '($rowCount rows x $unchangedStreamCount unchanged streams) plus ' - 'the barrier stream\'s full result (${rowCount + 1} rows after ' - 'the INSERT lands). `hashed_bytes_per_iter ≈ cell_bytes x ' - '(${unchangedStreamCount * rowCount} + ${rowCount + 1})`. ' - '`ns_per_byte` divides the median wall by the total hashed bytes ' - 'to isolate the per-byte cost from the per-iteration overhead.', + 'the barrier stream\'s single fixed row. ' + '`hashed_bytes_per_iter = cell_bytes x ' + '(${unchangedStreamCount * rowCount} + 1)`. `ns_per_byte` ' + 'divides the median wall by the total hashed bytes to isolate ' + 'the per-byte cost from the per-iteration overhead.', ); buf.writeln(); buf.writeln( @@ -335,8 +364,9 @@ String _renderMarkdown(List<_ScalingResult> results) { buf.writeln('## Reading the table'); buf.writeln(); buf.writeln( - '- `median_ms` is the per-iteration wall: one INSERT plus the ' - 'fanout wave that re-hashes every unchanged stream\'s result.', + '- `median_ms` is the per-iteration wall: one UPDATE against the ' + 'fixed barrier row plus the fanout wave that re-hashes every ' + 'unchanged stream\'s result.', ); buf.writeln( '- `ns_per_byte` is the per-byte cost averaged across the full ' diff --git a/benchmark/profile/results/exp-137-long-text-scaling-aggregate.md b/benchmark/profile/results/exp-137-long-text-scaling-aggregate.md index 22957cc..17e81c3 100644 --- a/benchmark/profile/results/exp-137-long-text-scaling-aggregate.md +++ b/benchmark/profile/results/exp-137-long-text-scaling-aggregate.md @@ -2,9 +2,9 @@ Profile-mode harness: `benchmark/profile/long_text_scaling_audit.dart` -Workload shape: 8 unchanged streams x 256 rows, one barrier stream, 30 timed INSERT iterations per cell size after 3 warmups. +Workload shape: 8 unchanged streams x 256 rows, one fixed-row barrier stream (id = 999999, outside every unchanged stream's `id < 256` predicate), 30 timed UPDATE iterations against the barrier row per cell size after 3 warmups. -Wall convention: per-iteration `Stopwatch` brackets the INSERT plus the wait for the barrier stream to re-emit. The unchanged streams must not emit (their hash-only fast path is supposed to suppress re-delivery); the harness asserts this on every iteration. The hash-loop work the unchanged streams do during each iteration is the cost the scaling sweep is targeting. +Wall convention: per-iteration `Stopwatch` brackets the UPDATE plus the wait for the barrier stream to re-emit. The unchanged streams must not emit (their hash-only fast path is supposed to suppress re-delivery); the harness asserts this on every iteration. Using UPDATE against a fixed barrier row keeps every result set at constant size across iterations, so per-iteration hashed-byte work is constant and the median is not biased toward later (heavier) iterations. Command: @@ -16,27 +16,27 @@ dart run -DRESQLITE_PROFILE=true benchmark/profile/long_text_scaling_audit.dart | cell size | median_ms | p90_ms | p99_ms | min_ms | max_ms | |---|---:|---:|---:|---:|---:| -| 4KB | 2.11 | 3.09 | 3.23 | 1.82 | 3.23 | -| 16KB | 4.50 | 5.53 | 6.98 | 4.00 | 6.98 | -| 32KB | 9.49 | 10.67 | 12.87 | 8.40 | 12.87 | -| 64KB | 27.52 | 33.72 | 34.72 | 16.01 | 34.72 | -| 128KB | 44.51 | 53.92 | 55.84 | 35.07 | 55.84 | +| 4KB | 1.35 | 1.98 | 2.66 | 1.02 | 2.66 | +| 16KB | 2.46 | 2.89 | 3.94 | 2.10 | 3.94 | +| 32KB | 5.28 | 5.60 | 5.85 | 5.00 | 5.85 | +| 64KB | 9.21 | 10.88 | 15.69 | 8.80 | 15.69 | +| 128KB | 17.40 | 18.78 | 21.98 | 16.83 | 21.98 | ## Per-byte cost -The fanout wave hashes every unchanged stream's full result (256 rows x 8 unchanged streams) plus the barrier stream's full result (257 rows after the INSERT lands). `hashed_bytes_per_iter ≈ cell_bytes x (2048 + 257)`. `ns_per_byte` divides the median wall by the total hashed bytes to isolate the per-byte cost from the per-iteration overhead. +The fanout wave hashes every unchanged stream's full result (256 rows x 8 unchanged streams) plus the barrier stream's single fixed row. `hashed_bytes_per_iter = cell_bytes x (2048 + 1)`. `ns_per_byte` divides the median wall by the total hashed bytes to isolate the per-byte cost from the per-iteration overhead. | cell size | hashed_bytes_per_iter | ns_per_byte (median) | |---|---:|---:| -| 4KB | 9441280 | 0.224 | -| 16KB | 37765120 | 0.119 | -| 32KB | 75530240 | 0.126 | -| 64KB | 151060480 | 0.182 | -| 128KB | 302120960 | 0.147 | +| 4KB | 8392704 | 0.160 | +| 16KB | 33570816 | 0.073 | +| 32KB | 67141632 | 0.079 | +| 64KB | 134283264 | 0.069 | +| 128KB | 268566528 | 0.065 | ## Reading the table -- `median_ms` is the per-iteration wall: one INSERT plus the fanout wave that re-hashes every unchanged stream's result. +- `median_ms` is the per-iteration wall: one UPDATE against the fixed barrier row plus the fanout wave that re-hashes every unchanged stream's result. - `ns_per_byte` is the per-byte cost averaged across the full hashed payload. If hashing is the bottleneck, this number stays roughly flat across cell sizes. - Drift downward as cell sizes grow points to a per-iteration overhead floor (mutex acquisition, microtask scheduling, isolate dispatch) hiding the per-byte cost at small sizes. - Drift upward at large sizes points to a non-hash cost emerging — allocation, GC pressure, page cache misses, or SQLite text fetch stalling on disk. diff --git a/docs/experiments/history.json b/docs/experiments/history.json index c36c218..93f677b 100644 --- a/docs/experiments/history.json +++ b/docs/experiments/history.json @@ -1,5 +1,5 @@ { - "generated": "2026-05-12T07:18:20.382138", + "generated": "2026-05-12T07:31:46.268160", "runs": [ { "id": "baseline-before-authorizer-hooks", @@ -18458,13 +18458,13 @@ "title": "Long-text cell-size scaling audit", "date": "2026-05-12", "status": "in_review", - "summary": "Measurement-only: sweeps the exp 110 unchanged-fanout shape across [4KB, 16KB, 32KB, 64KB, 128KB] cells. Wall scales linearly with bytes from 16KB up; per-byte cost converges to a stable 0.12–0.19 ns/byte band on the existing 8-byte FNV chunked loop. The 4KB release shape sits ~2x above the band because per-iteration overhead dominates at that size — a faster hash variant would barely move it. Closes the long-text-stream-hashing direction's `blockedOnMeasurement` gate, replaces the broader-payload `openCandidate` with a wider FNV / SIMD probe candidate gated on a real ≥16KB workload", + "summary": "Measurement-only: sweeps the exp 110 unchanged-fanout shape across [4KB, 16KB, 32KB, 64KB, 128KB] cells using a fixed-row UPDATE trigger so per-iteration hashed payload stays constant. Wall scales linearly with bytes from 16KB up; per-byte cost converges to a stable 0.065–0.080 ns/byte band on the existing 8-byte FNV chunked loop (~13–15 GB/s implied per-stream throughput). The 4KB release shape sits ~2x above the band because per-iteration overhead dominates at that size — a faster hash variant would barely move it. Closes the long-text-stream-hashing direction's `blockedOnMeasurement` gate, replaces the broader-payload `openCandidate` with a wider FNV / SIMD probe candidate gated on a real ≥16KB workload", "commit": null, "problem": "[Exp 099](099-fnv-8byte-bytestream.md) was rejected as benchmark-invisible\nbecause the streaming suite at the time only carried short cells.\n[Exp 110](110-long-text-fnv-8byte.md) added a long-text unchanged-fanout\nbenchmark — 8 unchanged streams x 256 rows x **4KB** ASCII TEXT cells —\nand the same 8-byte FNV change measured **-76%** on its median wall.\n\n`signals.json#long-text-stream-hashing` left two related entries open\nafter that win:\n\n- a `blockedOnMeasurement` requesting \"long-payload streaming workload\n at sizes beyond exp 110's 4KB cells\", and\n- a 2026-04-29 `openCandidate` (`addedAfter: 110`) for \"broader\n long-payload workload (>= 32KB TEXT cells, mixed BLOB/TEXT)\" with\n `blockedOn: no benchmark covers payloads larger than the exp 110\n 4KB shape`.\n\nUntil those entries close...", "hypothesis": "After exp 110 + the existing 8-byte FNV path, per-byte hashing cost\nshould be roughly flat across cell sizes once the per-iteration\noverhead floor (writer round-trip, mutex acquisition, microtask\nscheduling) is amortized. The 4KB shape sits below the amortization\npoint, so its `ns_per_byte` should be artificially high; sizes from\n≥16KB onward should converge to a stable per-byte band that\ncharacterizes the actual hash-loop throughput on the test machine.\n\nAccept this as a measurement experiment if:\n\n- the audit produces stable per-cell-size median walls across\n repeated passes (≤ ±10% drift on the dominant signal);\n- the audit resolves the `signals.json` open candidate one way or\n the other;\n- the run updates `blockedOnMeasurement` and `openCandidates`\n accordingly.", - "approach": "Added one profile-mode harness:\n\n```text\nbenchmark/profile/long_text_scaling_audit.dart\n```\n\nThe harness reuses exp 110's workload shape — 8 unchanged streams\nprojecting `id, body, sid` with `WHERE id < 256` predicates, one\nbarrier stream projecting the full table, 256 seed rows, one INSERT\nper iteration with a row outside every unchanged stream's predicate\n— and sweeps the per-cell byte size across `[4KB, 16KB, 32KB, 64KB,\n128KB]`. Each cell size runs 3 warmup iterations followed by 30\ntimed iterations; the per-iteration wall is the `Stopwatch` around\n`db.execute(insert)` plus the wait for the barrier stream to\nre-emit. The unchanged streams must not emit (the hash-only fast\npath is supposed to suppress re-delivery); the harness asserts\nthis on every iteration.\n\nThe `ns_per_byte` colum...", - "results": "Three repeated passes; values bracket the per-run band.\n\nPer-iteration wall:\n\n| cell size | median_ms (a/b/c) | p90_ms | p99_ms |\n|-----------|--------------------------:|-------------:|-------------:|\n| 4KB | 2.11 / 2.23 / 2.11 | 2.89 – 3.45 | 3.23 – 4.62 |\n| 16KB | 4.50 / 4.28 / 4.52 | 5.27 – 5.55 | 6.25 – 7.14 |\n| 32KB | 9.49 / 9.62 / 9.47 | 10.38 – 10.67| 11.66 – 12.87|\n| 64KB | 27.52 / 28.13 / 25.26 | 33.72 – 35.16| 34.72 – 36.09|\n| 128KB | 44.51 / 42.91 / 47.41 | 53.92 – 55.60| 55.84 – 62.64|\n\nPer-byte cost:\n\n| cell size | hashed_bytes_per_iter | ns_per_byte (median, a/b/c) |\n|-----------|----------------------:|----------------------------:|\n| 4KB | 9,441,280 | 0.224 / 0.236 / 0.223 ...", - "reasoning": "**Accept for review — measurement.**\n\nThe audit answers the open `signals.json` question for the\n`long-text-stream-hashing` direction:\n\n> long-payload streaming workload at sizes beyond exp 110's 4KB cells\n\nWall scales **linearly** with bytes from 16KB up, with per-byte cost\nin a stable 0.12 – 0.19 ns/byte band. Hashing — not SQLite text-fetch,\nnot Dart-side allocation, not isolate transfer — is the dominant cost\non long-cell unchanged-fanout workloads at meaningful cell sizes.\n\nWhat the resolution changes:\n\n- **Removes the `blockedOnMeasurement` entry**: the workload that was\n missing now exists at five sizes.\n- **Closes the matching `openCandidate`** (broader long-payload\n workload).\n- **Adds a new `openCandidate`**: a wider FNV unroll or SIMD\n (AVX2/NEON) probe for the byte-stream..." + "approach": "Added one profile-mode harness:\n\n```text\nbenchmark/profile/long_text_scaling_audit.dart\n```\n\nThe harness mirrors exp 110's unchanged-fanout shape — 8 unchanged\nstreams projecting `id, body, sid` with `WHERE id < 256` predicates —\nand sweeps the per-cell byte size across `[4KB, 16KB, 32KB, 64KB,\n128KB]`. Each cell size runs 3 warmup iterations followed by 30\ntimed iterations.\n\nThe fanout *trigger* differs from exp 110 to keep per-iteration\nhashed-byte work constant. Exp 110's release benchmark inserts a new\nrow each iteration and the barrier stream selects the full table, so\nthe barrier's hashed payload grows by one row per iteration (~1.3%\ndrift on the per-iteration denominator over 30 timed iterations,\nbiasing later iterations heavier). This audit uses a fixed barrier\nrow at `id = 9999...", + "results": "Three repeated passes; values bracket the per-run band.\n\nPer-iteration wall:\n\n| cell size | median_ms (a/b/c) | p90_ms | p99_ms |\n|-----------|--------------------------:|-------------:|-------------:|\n| 4KB | 1.35 / 1.29 / 1.35 | 1.83 – 2.15 | 2.39 – 2.66 |\n| 16KB | 2.46 / 2.49 / 2.42 | 2.89 – 3.41 | 3.68 – 4.29 |\n| 32KB | 5.28 / 5.03 / 5.24 | 5.60 – 6.11 | 5.85 – 8.76 |\n| 64KB | 9.21 / 8.92 / 9.11 | 9.35 – 10.88 | 15.19 – 18.73|\n| 128KB | 17.40 / 18.85 / 17.31 | 18.78 – 27.93| 21.98 – 32.93|\n\nPer-byte cost:\n\n| cell size | hashed_bytes_per_iter | ns_per_byte (median, a/b/c) |\n|-----------|----------------------:|----------------------------:|\n| 4KB | 8,392,704 | 0.160 / 0.154 / 0.161 ...", + "reasoning": "**Accept for review — measurement.**\n\nThe audit answers the open `signals.json` question for the\n`long-text-stream-hashing` direction:\n\n> long-payload streaming workload at sizes beyond exp 110's 4KB cells\n\nWall scales **linearly** with bytes from 16KB up, with per-byte cost\nin a stable 0.065 – 0.080 ns/byte band. Hashing — not SQLite\ntext-fetch, not Dart-side allocation, not isolate transfer — is the\ndominant cost on long-cell unchanged-fanout workloads at meaningful\ncell sizes.\n\nWhat the resolution changes:\n\n- **Removes the `blockedOnMeasurement` entry**: the workload that was\n missing now exists at five sizes.\n- **Closes the matching `openCandidate`** (broader long-payload\n workload).\n- **Adds a new `openCandidate`**: a wider FNV unroll or SIMD\n (AVX2/NEON) probe for the byte-stre..." } ], "tracked": [ diff --git a/experiments/137-long-text-cell-scaling.md b/experiments/137-long-text-cell-scaling.md index 02ca7b1..a9962f9 100644 --- a/experiments/137-long-text-cell-scaling.md +++ b/experiments/137-long-text-cell-scaling.md @@ -62,26 +62,37 @@ Added one profile-mode harness: benchmark/profile/long_text_scaling_audit.dart ``` -The harness reuses exp 110's workload shape — 8 unchanged streams -projecting `id, body, sid` with `WHERE id < 256` predicates, one -barrier stream projecting the full table, 256 seed rows, one INSERT -per iteration with a row outside every unchanged stream's predicate -— and sweeps the per-cell byte size across `[4KB, 16KB, 32KB, 64KB, +The harness mirrors exp 110's unchanged-fanout shape — 8 unchanged +streams projecting `id, body, sid` with `WHERE id < 256` predicates — +and sweeps the per-cell byte size across `[4KB, 16KB, 32KB, 64KB, 128KB]`. Each cell size runs 3 warmup iterations followed by 30 -timed iterations; the per-iteration wall is the `Stopwatch` around -`db.execute(insert)` plus the wait for the barrier stream to -re-emit. The unchanged streams must not emit (the hash-only fast -path is supposed to suppress re-delivery); the harness asserts -this on every iteration. +timed iterations. + +The fanout *trigger* differs from exp 110 to keep per-iteration +hashed-byte work constant. Exp 110's release benchmark inserts a new +row each iteration and the barrier stream selects the full table, so +the barrier's hashed payload grows by one row per iteration (~1.3% +drift on the per-iteration denominator over 30 timed iterations, +biasing later iterations heavier). This audit uses a fixed barrier +row at `id = 999999` (well outside every unchanged stream's +predicate) and UPDATEs that row's `body` each iteration. The barrier +stream's `WHERE id = ?` projection therefore stays at exactly one +row across every iteration; the unchanged streams stay at exactly +256 rows. Per-iteration hashed payload is constant within each cell +size. + +The per-iteration wall is a `Stopwatch` around `db.execute(update)` +plus the wait for the barrier stream to re-emit. The unchanged +streams must not emit (the hash-only fast path is supposed to +suppress re-delivery); the harness asserts this on every iteration +and fails loudly if it sees one. The `ns_per_byte` column divides the median wall by -`cell_bytes × (unchanged_streams × row_count + (row_count + 1))`. -Each unchanged stream re-hashes its full 256-row result every -fanout wave; the barrier stream re-hashes 257 rows after the new -row lands. At 16KB cells that is ~38 MB of hashed payload per -iteration; at 128KB it is ~302 MB. `ns_per_byte` is the per-byte -cost averaged across that payload, so it isolates hash-loop -throughput from the per-iteration overhead floor. +`cell_bytes × (unchanged_streams × row_count + 1)`. At 16KB cells +that is ~33 MB of hashed payload per iteration; at 128KB it is +~270 MB. `ns_per_byte` is the per-byte cost averaged across that +payload, isolating hash-loop throughput from the per-iteration +overhead floor. The harness does not require `kProfileMode` to produce a useful report (the scaling decision rests on end-to-end wall, not @@ -96,42 +107,45 @@ Per-iteration wall: | cell size | median_ms (a/b/c) | p90_ms | p99_ms | |-----------|--------------------------:|-------------:|-------------:| -| 4KB | 2.11 / 2.23 / 2.11 | 2.89 – 3.45 | 3.23 – 4.62 | -| 16KB | 4.50 / 4.28 / 4.52 | 5.27 – 5.55 | 6.25 – 7.14 | -| 32KB | 9.49 / 9.62 / 9.47 | 10.38 – 10.67| 11.66 – 12.87| -| 64KB | 27.52 / 28.13 / 25.26 | 33.72 – 35.16| 34.72 – 36.09| -| 128KB | 44.51 / 42.91 / 47.41 | 53.92 – 55.60| 55.84 – 62.64| +| 4KB | 1.35 / 1.29 / 1.35 | 1.83 – 2.15 | 2.39 – 2.66 | +| 16KB | 2.46 / 2.49 / 2.42 | 2.89 – 3.41 | 3.68 – 4.29 | +| 32KB | 5.28 / 5.03 / 5.24 | 5.60 – 6.11 | 5.85 – 8.76 | +| 64KB | 9.21 / 8.92 / 9.11 | 9.35 – 10.88 | 15.19 – 18.73| +| 128KB | 17.40 / 18.85 / 17.31 | 18.78 – 27.93| 21.98 – 32.93| Per-byte cost: | cell size | hashed_bytes_per_iter | ns_per_byte (median, a/b/c) | |-----------|----------------------:|----------------------------:| -| 4KB | 9,441,280 | 0.224 / 0.236 / 0.223 | -| 16KB | 37,765,120 | 0.119 / 0.113 / 0.120 | -| 32KB | 75,530,240 | 0.126 / 0.127 / 0.125 | -| 64KB | 151,060,480 | 0.182 / 0.186 / 0.167 | -| 128KB | 302,120,960 | 0.147 / 0.142 / 0.157 | +| 4KB | 8,392,704 | 0.160 / 0.154 / 0.161 | +| 16KB | 33,570,816 | 0.073 / 0.074 / 0.072 | +| 32KB | 67,141,632 | 0.079 / 0.075 / 0.078 | +| 64KB | 134,283,264 | 0.069 / 0.066 / 0.068 | +| 128KB | 268,566,528 | 0.065 / 0.070 / 0.064 | Aggregate file: [`benchmark/profile/results/exp-137-long-text-scaling-aggregate.md`](../benchmark/profile/results/exp-137-long-text-scaling-aggregate.md). The 4KB row sits ~2x above the larger-size band because the per-iteration overhead (writer round-trip, microtask scheduling, -isolate dispatch, the per-iteration String allocation in the harness -itself) is comparable in absolute terms to the hashing work at small -sizes. From 16KB upward the per-byte cost converges to the **0.12 – -0.19 ns/byte** band — the implied hash-loop throughput is roughly -~6 GB/s per stream, about what the 8-byte FNV chunked loop should -sustain on a modern desktop CPU. - -The 64KB row carries a small per-byte hump (~0.17 – 0.19 ns/byte vs -~0.13 – 0.16 ns/byte at 32KB and 128KB) and a wider min-to-max spread -(min 15 – 16 ms vs median 25 – 28 ms vs max 34 – 36 ms). The -per-iteration String allocation crosses an old-generation GC threshold -at that size on this VM build; the 32KB and 128KB rows happen to land -on cleaner sides of that boundary. The hump sits inside the broader -0.12 – 0.19 ns/byte band and does not change the linear-scaling -verdict. +isolate dispatch) is comparable in absolute terms to the hashing +work at small sizes. From 16KB upward the per-byte cost converges to +the **0.065 – 0.080 ns/byte** band — the implied hash-loop throughput +is roughly ~13 – 15 GB/s per stream, in line with what the 8-byte FNV +chunked loop should sustain on a modern desktop CPU. + +The 64KB and 128KB rows carry wider min-to-max spreads (e.g. 64KB +min 5.81 ms vs median 9.21 ms vs max 18.73 ms; 128KB min 10.30 ms +vs median 17.40 ms vs max 32.93 ms). The per-iteration String +allocation built by `_longTextPayload` itself crosses Dart VM +old-generation heap-region thresholds at those sizes; the harness's +median is robust against the spread, but the p99 is not. The +medians sit cleanly in the 0.065 – 0.080 ns/byte band and do not +change the linear-scaling verdict. + +Wall scales linearly with bytes from 16KB up: 16→32 doubles bytes +and roughly doubles wall (2.15x median), 32→64 doubles bytes and +1.74x wall, 64→128 doubles bytes and 1.89x wall. ## Decision @@ -143,9 +157,10 @@ The audit answers the open `signals.json` question for the > long-payload streaming workload at sizes beyond exp 110's 4KB cells Wall scales **linearly** with bytes from 16KB up, with per-byte cost -in a stable 0.12 – 0.19 ns/byte band. Hashing — not SQLite text-fetch, -not Dart-side allocation, not isolate transfer — is the dominant cost -on long-cell unchanged-fanout workloads at meaningful cell sizes. +in a stable 0.065 – 0.080 ns/byte band. Hashing — not SQLite +text-fetch, not Dart-side allocation, not isolate transfer — is the +dominant cost on long-cell unchanged-fanout workloads at meaningful +cell sizes. What the resolution changes: @@ -186,16 +201,23 @@ What the resolution does *not* change: - A future workload at ≥16KB cells (long content streams, document archives, large JSON blobs) is the natural trigger for revisiting hash-loop work. Without one, removing the entire byte-stream hash - loop would still only save ~0.12 – 0.19 ns per byte hashed, which - is below the ±10% per-benchmark decision threshold for the - current 4KB release-suite shape. + loop would still only save ~0.07 ns per byte hashed, which is + below the ±10% per-benchmark decision threshold for the current + 4KB release-suite shape. - A future BLOB-shape audit at the same sweep sizes would confirm TEXT/BLOB symmetry; the underlying C path is shared, so a divergence between TEXT and BLOB at the same cell size would point at a SQLite text-fetch difference rather than a hash difference. -- The 64KB GC-spread observation (min 16 ms / max 36 ms / median ~26 - ms across passes) suggests the harness's per-iteration String - payload allocation is itself a measurable signal at that size. A +- The 64KB / 128KB GC-spread observation (max ~2x median across + passes) suggests the harness's per-iteration String payload + allocation is itself a measurable signal at those sizes. A pre-built payload pool variant of this harness would reduce the - spread; deferred because the `ns_per_byte` band is already stable + spread; deferred because the median per-byte band is already stable enough to support the linear-scaling verdict. +- The fixed-barrier-row trigger this audit uses (UPDATE against + `id = 999999`, picked outside every unchanged stream's predicate) + is the right shape for any future per-iteration-constant hashing + audit. Exp 110's INSERT-driven release benchmark grows the barrier + stream's hashed payload by one row per iteration; that is fine for + the per-benchmark shape but biases per-byte audit denominators by + ~1.3% over 30 iterations. diff --git a/experiments/README.md b/experiments/README.md index 4134301..63e8d7e 100644 --- a/experiments/README.md +++ b/experiments/README.md @@ -73,7 +73,7 @@ moved them. | [122](122-concrete-reader-pool-stream-admission.md) | Concrete reader-pool stream admission | Initializes `StreamEngine` with a concrete `ReaderPool` so `_flushQueue` stays synchronous and bounded by `availableWorkerCount`; tests now use diagnostics for stream registry size, and post-rebase profile counters stay at zero parks/retries/max-parked on A11c overlap and keyed-PK workloads | | | [125](125-wide-ascii-batch-params.md) | Wide ASCII batch parameter encoding | Direct ASCII payload packing skips temporary per-string UTF-8 lists in large wide batches; focused 10k x20 improves 17.199 → 12.760 ms and release Wide Batch Insert improves 18.201 → 13.031 ms | | | [126](126-wide-utf8-batch-packing.md) | Wide UTF-8 batch parameter packing | Direct UTF-8 payload packing extends exp 125's allocation win to guarded non-ASCII wide batches; focused Unicode 10k x20 improves 21.945 → 18.988 ms and emoji 10k x20 improves 24.187 → 17.458 ms while release write-suite guardrails remain neutral | | -| [137](137-long-text-cell-scaling.md) | Long-text cell-size scaling audit | Measurement-only: sweeps the exp 110 unchanged-fanout shape across [4KB, 16KB, 32KB, 64KB, 128KB] cells. Wall scales linearly with bytes from 16KB up; per-byte cost converges to a stable 0.12–0.19 ns/byte band on the existing 8-byte FNV chunked loop. The 4KB release shape sits ~2x above the band because per-iteration overhead dominates at that size — a faster hash variant would barely move it. Closes the long-text-stream-hashing direction's `blockedOnMeasurement` gate, replaces the broader-payload `openCandidate` with a wider FNV / SIMD probe candidate gated on a real ≥16KB workload | | +| [137](137-long-text-cell-scaling.md) | Long-text cell-size scaling audit | Measurement-only: sweeps the exp 110 unchanged-fanout shape across [4KB, 16KB, 32KB, 64KB, 128KB] cells using a fixed-row UPDATE trigger so per-iteration hashed payload stays constant. Wall scales linearly with bytes from 16KB up; per-byte cost converges to a stable 0.065–0.080 ns/byte band on the existing 8-byte FNV chunked loop (~13–15 GB/s implied per-stream throughput). The 4KB release shape sits ~2x above the band because per-iteration overhead dominates at that size — a faster hash variant would barely move it. Closes the long-text-stream-hashing direction's `blockedOnMeasurement` gate, replaces the broader-payload `openCandidate` with a wider FNV / SIMD probe candidate gated on a real ≥16KB workload | | ## Rejected diff --git a/experiments/signals.json b/experiments/signals.json index d5befc5..8dc7fae 100644 --- a/experiments/signals.json +++ b/experiments/signals.json @@ -100,7 +100,7 @@ "id": "long-text-stream-hashing", "status": "watch", "subsystems": ["streaming", "hashing", "text"], - "currentRead": "Native hashing is valuable, and long TEXT cells now have a representative unchanged-fanout benchmark plus a 4KB→128KB scaling audit. Exp 110 produced a -76% targeted win on the 4KB shape with the 8-byte FNV chunked loop. Exp 137 then swept the same workload across [4KB, 16KB, 32KB, 64KB, 128KB] and found wall scales linearly with bytes from 16KB up, with per-byte cost in a stable 0.12–0.19 ns/byte band. The 4KB shape sits ~2x above the band because per-iteration overhead (writer round-trip, mutex, microtask scheduling) is comparable to the hashing work at that size. Hash-loop variants are workload-dependent: the 4KB release-suite shape is per-iteration-overhead-bound and would barely move under a faster hash; ≥16KB workloads are per-byte-bound and would see a hash variant pay proportional to bytes.", + "currentRead": "Native hashing is valuable, and long TEXT cells now have a representative unchanged-fanout benchmark plus a 4KB→128KB scaling audit. Exp 110 produced a -76% targeted win on the 4KB shape with the 8-byte FNV chunked loop. Exp 137 then swept the same workload across [4KB, 16KB, 32KB, 64KB, 128KB] using a fixed-row UPDATE-driven barrier (so per-iteration hashed payload stays constant, unlike exp 110's growing-row INSERT) and found wall scales linearly with bytes from 16KB up, with per-byte cost in a stable 0.065–0.080 ns/byte band (~13–15 GB/s implied per-stream throughput). The 4KB shape sits ~2x above the band because per-iteration overhead (writer round-trip, mutex, microtask scheduling) is comparable to the hashing work at that size. Hash-loop variants are workload-dependent: the 4KB release-suite shape is per-iteration-overhead-bound and would barely move under a faster hash; ≥16KB workloads are per-byte-bound and would see a hash variant pay proportional to bytes.", "keyPriors": ["075", "099", "110", "137"], "archive": ["033"], "interestingIf": [ @@ -126,7 +126,7 @@ } ], "blockedOnMeasurement": [], - "notesForExperimenters": "Compare hash-loop variants against the exp 137 16KB+ band, not against exp 110's 4KB benchmark — the 4KB shape is per-iteration-overhead-bound and will not move proportionally to a per-byte hash improvement. The structural ceiling for removing all byte-stream hashing on the current release shape is ~0.22 ns/byte × 4KB × 256 rows × 8 streams ≈ 1.8 ms per fanout wave (vs ~2.1 ms median wall) — a faster hash would save almost all of that, but the absolute saving is below the ±10% per-benchmark decision threshold." + "notesForExperimenters": "Compare hash-loop variants against the exp 137 16KB+ band (0.065–0.080 ns/byte), not against exp 110's 4KB benchmark — the 4KB shape is per-iteration-overhead-bound and will not move proportionally to a per-byte hash improvement. Per-iteration hashing on the current 4KB release shape totals ~0.16 ns/byte × 4KB × 256 rows × 8 streams ≈ 1.3 ms per fanout wave (vs ~1.4 ms exp 137 median wall once row-growth bias is removed) — a faster hash would save most of that, but the absolute saving is below the ±10% per-benchmark decision threshold. When designing a new long-cell audit, prefer a fixed-row UPDATE trigger (exp 137 pattern) over an INSERT trigger (exp 110 release pattern) — INSERT grows the barrier stream's hashed payload by one row per iteration and biases the per-byte denominator." }, { "id": "sqlite-version-and-build-config", @@ -550,9 +550,10 @@ "directions": ["long-text-stream-hashing", "measurement-system"], "outcomeClass": "in_review_measurement", "changedBeliefs": [ - "Long-cell unchanged-fanout wall scales linearly with bytes from 16KB up; per-byte cost converges to a stable 0.12–0.19 ns/byte band on the 8-byte FNV chunked loop", + "Long-cell unchanged-fanout wall scales linearly with bytes from 16KB up; per-byte cost converges to a stable 0.065–0.080 ns/byte band on the 8-byte FNV chunked loop (~13–15 GB/s implied per-stream throughput)", "Exp 110's 4KB release shape sits ~2x above the per-byte band because per-iteration overhead (writer round-trip, mutex, microtask scheduling) is comparable to the hashing work at that size — a faster hash would barely move it", - "Hash-loop variants (wider unroll, SIMD probe) are interesting only for ≥16KB workloads; current release-suite shape is per-iteration-overhead-bound, not hash-bound" + "Hash-loop variants (wider unroll, SIMD probe) are interesting only for ≥16KB workloads; current release-suite shape is per-iteration-overhead-bound, not hash-bound", + "Per-byte hashing audits should use a fixed-row UPDATE trigger (constant per-iteration denominator) rather than an INSERT trigger (drifts the denominator by ~1.3% over 30 iterations)" ], "nextSignals": [ "compare any future hash-loop variant against the exp 137 16KB+ band, not the exp 110 4KB benchmark",