From 7f94757dc545a021f7ed1924a6ac5089a5603e2c Mon Sep 17 00:00:00 2001
From: Vasyl Vdovychenko <mrviduus@gmail.com>
Date: Fri, 19 Jun 2026 02:28:45 -0400
Subject: [PATCH] =?UTF-8?q?feat(rag):=20Ask=20this=20book=20=E2=80=94=20co?=
 =?UTF-8?q?nversational=20streaming=20chat=20+=20gpt-4.1-mini?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Turns the single-turn FAQ into a real chat (grounding + citations + spoiler gate
unchanged).
- Model: rag.ask -> gpt-4.1-mini (dedicated openai-rag provider, mirrors
  openai-explain; translate/podcast stay nano).
- Companion system prompt: warm, conversational, handles greetings; grounding is
  STRICT and overrides everything (covers themes/symbolism/interpretation, not
  just plot) -> every book-fact cites [n]; greeting/meta the only no-cite case.
- Multi-turn memory: AskRequest.History (client-held, server-clamped last 6
  turns / 4000 chars each); real LlmMessage[] (system -> excerpts -> history ->
  question); retrieval still per latest question. Eval call passes [].
- SSE streaming (content-negotiated, copies Explain): delta* then terminal done
  {citations,lastReadOrd,insufficient}; empty chunks -> friendly delta + done,
  NO model call (spoiler/hallucination short-circuit, both paths). JSON fallback
  unchanged. Both catalog + user-book ask. MaxOutputTokens 320->400.
- Web: streaming token render + blinking caret, suggested starters on empty
  thread, last-6 history sent, abort on new-question/unmount. Mobile keeps JSON.

architect -> backend+web -> adversarial QA (SHIP; closed grounding loophole +
added empty-chunk-no-LLM-call tests). 878 unit + 564 web green.
NOTE: re-run the paid grounding/citation eval on /ai-quality post-deploy.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md                                  |   8 +
 apps/web/src/api/ask.ts                       |  71 ++++++-
 apps/web/src/components/reader/AskPanel.tsx   |  43 ++++-
 .../reader/__tests__/AskPanel.test.tsx        |  31 ++-
 apps/web/src/hooks/useAsk.test.ts             | 140 ++++++++++----
 apps/web/src/hooks/useAsk.ts                  | 131 +++++++++++--
 apps/web/src/locales/en.json                  |   9 +-
 apps/web/src/styles/reader.css                |  37 ++++
 .../TextStack.Ai.EvalSuite/RagEvalRunner.cs   |   2 +-
 backend/src/Api/Endpoints/AskEndpoints.cs     |  41 ++--
 backend/src/Api/Endpoints/AskSse.cs           | 144 ++++++++++++++
 .../src/Api/Endpoints/UserBookAskEndpoints.cs |  45 ++---
 backend/src/Api/appsettings.json              |   5 +-
 backend/src/Application/Ai/RagAskHistory.cs   |  74 ++++++++
 backend/src/Application/Ai/RagAskPrompt.cs    |  36 ++--
 .../src/Application/DependencyInjection.cs    |  11 +-
 backend/src/Application/Rag/RagAskService.cs  | 152 +++++++++++++--
 backend/src/Contracts/Books/AskDtos.cs        |  15 +-
 .../Persistence/ModelRegistrySeeder.cs        |   2 +-
 packages/shared/src/types/api.ts              |   9 +
 tests/TextStack.AiEvals/RagEvalRunnerTests.cs |   4 +-
 .../AskEndpointTests.cs                       |  54 ++++++
 tests/TextStack.UnitTests/AskSseTests.cs      | 103 ++++++++++
 .../TextStack.UnitTests/RagAskHistoryTests.cs | 127 +++++++++++++
 .../TextStack.UnitTests/RagAskPromptTests.cs  |  80 ++++++--
 .../TextStack.UnitTests/RagAskServiceTests.cs | 178 ++++++++++++++++++
 26 files changed, 1403 insertions(+), 149 deletions(-)
 create mode 100644 backend/src/Api/Endpoints/AskSse.cs
 create mode 100644 backend/src/Application/Ai/RagAskHistory.cs
 create mode 100644 tests/TextStack.UnitTests/AskSseTests.cs
 create mode 100644 tests/TextStack.UnitTests/RagAskHistoryTests.cs
 create mode 100644 tests/TextStack.UnitTests/RagAskServiceTests.cs

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1c02d583..e367e9ae 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,14 @@
 
 ## [Unreleased]
 
+### Ask this book — conversational, streaming web chat — backend (AI-028) (2026-06-19)
+
+Backend for the conversational "Ask this book" upgrade: **model bump + multi-turn memory + warm-companion prompt + SSE streaming**, with grounding, citations, and the spoiler gate intact. `rag.ask` now routes to a dedicated keyed provider `openai-rag` on **gpt-4.1-mini** (was gpt-4.1-nano), mirroring `openai-explain` (`OpenAI:RagAsk:Model`, `Ai:Routes:rag.ask → openai-rag`, decorator-loop entry, `ModelRegistrySeeder` row). The system prompt is rewritten from "answer ONLY from excerpts else refuse" to a **warm reading companion** that is still strictly grounded — every book-fact claim must come from the numbered excerpts and cite `[n]` (citation contract + parser unchanged), but greetings/meta ("hi", "what can you do") get a warm invite with **no forced citation and no refusal**, and a genuine question with no matching excerpt gets a graceful "I don't see that in what you've read so far" rather than an invented fact. **Multi-turn**: `AskRequest` gains `History: AskTurnDto[]` (role `"user"`/`"assistant"`); the server defensively clamps to the **last 6 turns**, caps each turn at 4000 chars, normalizes roles, and assembles a real chat (system → numbered-excerpts context block → prior turns → new question last). Retrieval still runs on the latest question only, so the grounding eval is byte-identical with `[]` history. **SSE** (content-negotiated, mirrors Explain): `Accept: text/event-stream` → `delta` events (token fragments) then a terminal `done` carrying `{ citations, lastReadOrd, insufficient }` (camelCase, identical citation shape to the JSON path); empty-chunks → one friendly `delta` + `done {insufficient:true}` with **no model call**; provider/mid-stream failure → terminal `error`. JSON path returns the unchanged `AskResponse` (eval + mobile keep working). Ask `MaxOutputTokens` raised 320 → 400 for conversational length. `dotnet build -c Release` clean; 868 unit tests green (history clamp, multi-turn message assembly, SSE event sequencing over a fake delta stream, companion greeting-vs-content prompt structure) + integration (catalog spoiler-gate, owner-404, SSE content-type + framing, JSON history passthrough — skip-on-unavailable). **Note: the grounding golden eval (`RagEvalRunner`) MUST be re-run on mini post-deploy** — the companion prompt loosened the refusal rule, so this is the real hallucination-risk gate (paid; not runnable in CI). Frontend = parallel agent (AI-026e).
+
+### Ask this book — conversational, streaming web chat (AI-026e) (2026-06-19)
+
+"Ask this book" goes from one-shot Q&A to a streaming, multi-turn reading companion on the web. Answers now stream **token-by-token**: the in-progress assistant turn grows live with a subtle blinking caret, citation chips render under it once the stream's `done` event lands. The panel sends the **last 6 turns** of history (bounded client-side) on each request for follow-up context, and shows **3–4 suggested starter questions** ("Summarize what I've read so far", "Who are the main characters?", "Explain the key idea so far", "What should I pay attention to?") on an empty, Ready thread — one click submits. Reuses the existing `postSse` SSE-over-POST client (same path Explain streams on) — `askStream(target, question, history, {onDelta,onDone,onError,signal})` POSTs `{question, history, currentChapterId?}` with `Accept: text/event-stream`, routing by `target.kind` to catalog `/books/{id}/ask` or userbook `/me/books/{id}/ask`. Browsers that can't stream fall back to the JSON single-turn endpoint; 401 → sign-in. Grounding, citation chips + jump-to-passage, and the Prepare → Preparing N/M → Ready/Failed index state machine are **unchanged** (server owns grounding). Shared `AskTurnDto` added to `@textstack/shared` (web + mobile consume; mobile keeps its JSON path). tsc + web build clean; mobile tsc clean; 564 web tests green (extended `useAsk` for delta accumulation → done-sets-citations, history bounded to 6, abort-on-unmount; `AskPanel` for starters render + submit). Backend = parallel agent.
+
 ### Ask this book — on-demand indexing, Phase 2 (user-uploaded books) — backend (2026-06-19)
 
 Backend for "Ask this book" over **user-uploaded books** (`UserBook`/`UserChapter`), mirroring the P1 catalog path with **per-user isolation as a hard requirement** (a user must never retrieve another user's chunks). New **isolated** `user_chapter_chunk` table (NOT polymorphic with `chapter_chunk`) carries a **denormalized `user_id`** alongside `user_book_id`; retrieval filters on **both** in SQL (defense in depth), in both the vector and lexical branches. `UserBook` gains the same `rag_status/rag_chunk_count/rag_embedded_count/rag_indexed_at/rag_error` fields as `Edition`. `BookChunkingService.ChunkUserBookAsync` chunks `UserChapter.PlainText` into the new table (stamping the owner id from the book); the existing `ChapterEmbeddingWorker` now runs a **second batch poll** over `user_chapter_chunk` on the **same single OpenAI drain** (no second worker) and flips `user_books.rag_status → Ready` when `embedded == chunk`. `IRagService.RetrieveUserBookAsync` reuses the identical RRF hybrid (vector NN + lexical FTS) via a shared private helper, so fusion/vector-format/timeout stay byte-identical to the catalog path. Owner-scoped endpoints: `POST/GET /me/books/{id}/index` (atomic claim `WHERE id=@id AND user_id=@uid AND rag_status IN (0,3)`, clears stale chunks before re-chunk, rate-limited `rag.index`) and `POST /me/books/{id}/ask` (**no spoiler gate** — full-book retrieval over the user's own document, no private-notes corpus; reuses `RagAskService.AskFromChunksAsync`; 404 if not the owner's book). `GET /me/books/{id}` detail DTO gains `ragStatus`/counts. P1 catalog path unchanged. Migration `AddUserBookRagIndex` (creates `user_chapter_chunk` + HNSW/GIN/`(user_id, user_book_id)` indexes + generated `search_vector` + cascade FKs from both `user_books` and `user_chapters`, plus the `user_books.rag_*` columns; Up/Down verified against pgvector). 852 unit tests green (incl. `ChunkUserBookAsync` row shape + a SQL-level isolation guard asserting both `user_id`/`user_book_id` filters in both retrievers) + integration tests (unauth→401, non-owner→404, owner→202/200/answer). Frontend = parallel agent.
diff --git a/apps/web/src/api/ask.ts b/apps/web/src/api/ask.ts
index 34b021ae..03e9d0fd 100644
--- a/apps/web/src/api/ask.ts
+++ b/apps/web/src/api/ask.ts
@@ -1,10 +1,77 @@
 import { authFetch } from './client'
-import type { AskResponse } from '@textstack/shared'
+import type { AskResponse, AskCitation, AskTurnDto } from '@textstack/shared'
+import { postSse } from '../lib/sse'
 import type { RagIndexState, RagIndexStatus } from '../types/api'
 
-export type { AskResponse, AskCitation } from '@textstack/shared'
+export type { AskResponse, AskCitation, AskTurnDto } from '@textstack/shared'
 export type { RagIndexState, RagIndexStatus } from '../types/api'
 
+// API_BASE: host (dev) or '' (prod, nginx strips /api). Backend route has no prefix.
+const API_BASE = import.meta.env.VITE_API_URL ?? ''
+
+/** Most recent conversation turns to send back for multi-turn context (AI-026e). */
+export const MAX_HISTORY_TURNS = 6
+
+/** Terminal `done` payload of a streamed ask (AI-026e). */
+export interface AskDone {
+  citations: AskCitation[]
+  lastReadOrd: number
+  insufficient: boolean
+}
+
+interface AskStreamCallbacks {
+  onDelta: (fragment: string) => void
+  onDone: (done: AskDone) => void
+  onError: (message: string) => void
+  signal?: AbortSignal
+}
+
+/**
+ * Streaming "Ask this book" (AI-026e). POSTs the question + bounded `history` with
+ * `Accept: text/event-stream` and consumes SSE via {@link postSse}: `delta` fragments append to the
+ * answer, `done` carries citations/insufficient, `error` surfaces a message. URL routes by
+ * `target.kind` (catalog vs userbook). Throws `SseUnauthorizedError`/`SseUnsupportedError` from
+ * `postSse` for the caller to handle (sign-in / JSON fallback).
+ */
+export function askStream(
+  target: AskTarget,
+  question: string,
+  history: AskTurnDto[],
+  { onDelta, onDone, onError, signal }: AskStreamCallbacks,
+  currentChapterId?: string,
+): Promise<void> {
+  const url =
+    target.kind === 'userbook'
+      ? `${API_BASE}/me/books/${target.id}/ask`
+      : `${API_BASE}/books/${target.id}/ask`
+  const body = {
+    question,
+    history: history.slice(-MAX_HISTORY_TURNS),
+    ...(currentChapterId ? { currentChapterId } : {}),
+  }
+  return postSse(
+    url,
+    body,
+    e => {
+      if (signal?.aborted) return
+      if (e.event === 'delta') onDelta(e.data)
+      else if (e.event === 'done') {
+        try {
+          const parsed = JSON.parse(e.data) as Partial<AskDone>
+          onDone({
+            citations: parsed.citations ?? [],
+            lastReadOrd: parsed.lastReadOrd ?? 0,
+            insufficient: Boolean(parsed.insufficient),
+          })
+        } catch {
+          onDone({ citations: [], lastReadOrd: 0, insufficient: false })
+        }
+      } else if (e.event === 'error') onError(e.data || 'Ask failed')
+    },
+    signal,
+  )
+}
+
 /**
  * Identifies what the "Ask this book" panel is pointed at (AI-027 P2). A catalog `edition`
  * routes to `/books/{id}/...`; a user-uploaded `userbook` routes to `/me/books/{id}/...`.
diff --git a/apps/web/src/components/reader/AskPanel.tsx b/apps/web/src/components/reader/AskPanel.tsx
index b8f95acd..6c7b1d5f 100644
--- a/apps/web/src/components/reader/AskPanel.tsx
+++ b/apps/web/src/components/reader/AskPanel.tsx
@@ -52,6 +52,15 @@ export function AskPanel({
     setInput('')
   }
 
+  const submitStarter = (q: string) => {
+    if (isLoading) return
+    ask(q)
+  }
+
+  // Suggested starter questions, shown only on an empty, Ready thread (AI-026e).
+  const starterKeys = ['summary', 'characters', 'keyIdea', 'attention'] as const
+  const showStarters = history.length === 0 && status === 'Ready' && isAuthenticated && !isLoading
+
   return (
     <>
       <div className="reader-drawer-backdrop" onClick={onClose} />
@@ -69,11 +78,35 @@ export function AskPanel({
           {history.length === 0 && !isLoading && (
             <p className="ask-panel__empty">{t('reader.ask.empty')}</p>
           )}
+          {showStarters && (
+            <div className="ask-panel__starters">
+              <p className="ask-panel__starters-title">{t('reader.ask.startersTitle')}</p>
+              {starterKeys.map(key => (
+                <button
+                  key={key}
+                  type="button"
+                  className="ask-panel__starter"
+                  onClick={() => submitStarter(t(`reader.ask.starters.${key}`))}
+                >
+                  {t(`reader.ask.starters.${key}`)}
+                </button>
+              ))}
+            </div>
+          )}
           {history.map((turn, i) => (
             <div key={i} className="ask-panel__turn">
               <p className="ask-panel__question">{turn.question}</p>
-              <p className="ask-panel__answer">{turn.answer}</p>
-              {turn.citations.length > 0 && (
+              <p className="ask-panel__answer">
+                {turn.answer}
+                {turn.streaming && <span className="ask-panel__cursor" aria-hidden="true" />}
+              </p>
+              {turn.streaming && !turn.answer && (
+                <div className="ask-panel__loading">
+                  <span className="ask-panel__spinner" />
+                  {t('reader.ask.thinking')}
+                </div>
+              )}
+              {!turn.streaming && turn.citations.length > 0 && (
                 <div className="ask-panel__citations">
                   {turn.citations.map(c => (
                     <button
@@ -89,12 +122,6 @@ export function AskPanel({
               )}
             </div>
           ))}
-          {isLoading && (
-            <div className="ask-panel__loading">
-              <span className="ask-panel__spinner" />
-              {t('reader.ask.thinking')}
-            </div>
-          )}
           {error && error !== 'auth' && <p className="ask-panel__error">{error}</p>}
         </div>
 
diff --git a/apps/web/src/components/reader/__tests__/AskPanel.test.tsx b/apps/web/src/components/reader/__tests__/AskPanel.test.tsx
index 25959232..9e619314 100644
--- a/apps/web/src/components/reader/__tests__/AskPanel.test.tsx
+++ b/apps/web/src/components/reader/__tests__/AskPanel.test.tsx
@@ -38,6 +38,7 @@ const baseProps = {
 afterEach(() => {
   cleanup()
   askState.history = []
+  askState.ask = vi.fn()
   ragState.status = 'Ready'
   ragState.chunkCount = 0
   ragState.embeddedCount = 0
@@ -96,7 +97,7 @@ describe('AskPanel', () => {
 
   it('renders a citation chip and navigates on click', () => {
     const citation = { marker: 1, chunkId: 'c1', chapterId: 'ch1', chapterOrd: 4, charStart: 0, charEnd: 1, preview: 'snippet' }
-    askState.history = [{ question: 'q', answer: 'a [1]', citations: [citation], insufficient: false }]
+    askState.history = [{ question: 'q', answer: 'a [1]', citations: [citation], insufficient: false, streaming: false }]
     const onNavigateToCitation = vi.fn()
 
     render(<AskPanel {...baseProps} isAuthenticated={true} onNavigateToCitation={onNavigateToCitation} />)
@@ -105,4 +106,32 @@ describe('AskPanel', () => {
 
     expect(onNavigateToCitation).toHaveBeenCalledWith(citation)
   })
+
+  it('shows starter questions on an empty, Ready thread and submits one on click', () => {
+    askState.history = []
+    ragState.status = 'Ready'
+    const ask = vi.fn()
+    askState.ask = ask
+
+    render(<AskPanel {...baseProps} isAuthenticated={true} />)
+
+    expect(screen.getByText('reader.ask.startersTitle')).toBeTruthy()
+    const starter = screen.getByText('reader.ask.starters.summary')
+    fireEvent.click(starter)
+    expect(ask).toHaveBeenCalledWith('reader.ask.starters.summary')
+  })
+
+  it('hides starters once the thread has a turn', () => {
+    askState.history = [{ question: 'q', answer: 'a', citations: [], insufficient: false, streaming: false }]
+    ragState.status = 'Ready'
+    render(<AskPanel {...baseProps} isAuthenticated={true} />)
+    expect(screen.queryByText('reader.ask.startersTitle')).toBeNull()
+  })
+
+  it('does not show starters until the index is Ready', () => {
+    askState.history = []
+    ragState.status = 'NotIndexed'
+    render(<AskPanel {...baseProps} isAuthenticated={true} />)
+    expect(screen.queryByText('reader.ask.startersTitle')).toBeNull()
+  })
 })
diff --git a/apps/web/src/hooks/useAsk.test.ts b/apps/web/src/hooks/useAsk.test.ts
index 75927c04..6285e3ae 100644
--- a/apps/web/src/hooks/useAsk.test.ts
+++ b/apps/web/src/hooks/useAsk.test.ts
@@ -1,12 +1,24 @@
 import { describe, it, expect, vi, beforeEach } from 'vitest'
 import { renderHook, act, waitFor } from '@testing-library/react'
 
-vi.mock('../api/ask', () => ({ ask: vi.fn(), askUserBook: vi.fn() }))
-import { ask as askApi, askUserBook as askUserBookApi, type AskTarget } from '../api/ask'
+vi.mock('../api/ask', () => ({
+  ask: vi.fn(),
+  askUserBook: vi.fn(),
+  askStream: vi.fn(),
+  MAX_HISTORY_TURNS: 6,
+}))
+import {
+  ask as askJsonApi,
+  askUserBook as askUserBookJsonApi,
+  askStream as askStreamApi,
+  type AskTarget,
+} from '../api/ask'
+import { SseUnsupportedError } from '../lib/sse'
 import { useAsk } from './useAsk'
 
-const mockAsk = askApi as unknown as ReturnType<typeof vi.fn>
-const mockAskUserBook = askUserBookApi as unknown as ReturnType<typeof vi.fn>
+const mockAskStream = askStreamApi as unknown as ReturnType<typeof vi.fn>
+const mockAskJson = askJsonApi as unknown as ReturnType<typeof vi.fn>
+const mockAskUserBookJson = askUserBookJsonApi as unknown as ReturnType<typeof vi.fn>
 
 const edition: AskTarget = { kind: 'edition', id: 'ed-1' }
 const userbook: AskTarget = { kind: 'userbook', id: 'ub-1' }
@@ -15,69 +27,133 @@ const citation = {
   marker: 1, chunkId: 'c1', chapterId: 'ch1', chapterOrd: 2, charStart: 0, charEnd: 5, preview: 'preview',
 }
 
-describe('useAsk', () => {
+type Callbacks = {
+  onDelta: (s: string) => void
+  onDone: (d: { citations: unknown[]; lastReadOrd: number; insufficient: boolean }) => void
+  onError: (m: string) => void
+  signal?: AbortSignal
+}
+
+/** Drives askStream: emit deltas then a done payload. */
+function streamDeltas(deltas: string[], done: Partial<{ citations: unknown[]; insufficient: boolean }> = {}) {
+  mockAskStream.mockImplementationOnce(
+    async (_t: AskTarget, _q: string, _h: unknown, cb: Callbacks) => {
+      for (const d of deltas) cb.onDelta(d)
+      cb.onDone({ citations: done.citations ?? [], lastReadOrd: 0, insufficient: Boolean(done.insufficient) })
+    },
+  )
+}
+
+describe('useAsk (streaming)', () => {
   beforeEach(() => {
-    mockAsk.mockReset()
-    mockAskUserBook.mockReset()
+    mockAskStream.mockReset()
+    mockAskJson.mockReset()
+    mockAskUserBookJson.mockReset()
   })
 
-  it('appends a turn on success', async () => {
-    mockAsk.mockResolvedValueOnce({ answer: 'Because [1].', citations: [citation], lastReadOrd: 3, insufficient: false })
+  it('accumulates deltas into the turn answer, then done sets citations', async () => {
+    streamDeltas(['Because ', 'of [1].'], { citations: [citation] })
     const { result } = renderHook(() => useAsk(edition))
 
     await act(() => result.current.ask('why?'))
 
-    await waitFor(() => expect(result.current.history).toHaveLength(1))
-    expect(result.current.history[0]).toMatchObject({ question: 'why?', answer: 'Because [1].' })
+    await waitFor(() => expect(result.current.isLoading).toBe(false))
+    expect(result.current.history).toHaveLength(1)
+    expect(result.current.history[0]).toMatchObject({ question: 'why?', answer: 'Because of [1].', streaming: false })
     expect(result.current.history[0].citations).toHaveLength(1)
     expect(result.current.error).toBeNull()
   })
 
-  it('sets error and keeps history empty on failure', async () => {
-    mockAsk.mockRejectedValueOnce(new Error('boom'))
+  it('flags insufficient turns from the done payload', async () => {
+    streamDeltas(['Read more first.'], { insufficient: true })
     const { result } = renderHook(() => useAsk(edition))
 
     await act(() => result.current.ask('why?'))
 
-    await waitFor(() => expect(result.current.error).toBe('boom'))
-    expect(result.current.history).toHaveLength(0)
+    await waitFor(() => expect(result.current.history).toHaveLength(1))
+    expect(result.current.history[0].insufficient).toBe(true)
   })
 
-  it('flags insufficient turns', async () => {
-    mockAsk.mockResolvedValueOnce({ answer: 'Read more first.', citations: [], lastReadOrd: 0, insufficient: true })
+  it('sends bounded last-6-turn history on the next question', async () => {
+    // Seed 7 prior answered turns via repeated streamed asks.
     const { result } = renderHook(() => useAsk(edition))
+    for (let i = 0; i < 7; i++) {
+      streamDeltas([`answer-${i}`])
+      await act(() => result.current.ask(`q-${i}`))
+      await waitFor(() => expect(result.current.isLoading).toBe(false))
+    }
+
+    // The 8th question should carry only the last 6 turns (12 user/assistant messages).
+    streamDeltas(['final'])
+    await act(() => result.current.ask('q-final'))
+    await waitFor(() => expect(mockAskStream).toHaveBeenCalledTimes(8))
+
+    const lastCall = mockAskStream.mock.calls[7]
+    const sentHistory = lastCall[2] as { role: string; content: string }[]
+    // Hook bounds to last 6 *turns* (slice(-6)); each turn = 2 messages → 12 messages.
+    expect(sentHistory.length).toBeLessThanOrEqual(12)
+    // Earliest sent message must not be q-0 (it was dropped by the bound).
+    expect(sentHistory[0].content).not.toBe('q-0')
+  })
+
+  it('routes a userbook target to the userbook ask url', async () => {
+    streamDeltas(['ok'])
+    const { result } = renderHook(() => useAsk(userbook, 'chap-guid-9'))
 
     await act(() => result.current.ask('why?'))
 
-    await waitFor(() => expect(result.current.history).toHaveLength(1))
-    expect(result.current.history[0].insufficient).toBe(true)
+    await waitFor(() => expect(mockAskStream).toHaveBeenCalled())
+    expect(mockAskStream.mock.calls[0][0]).toEqual(userbook)
+    expect(mockAskStream.mock.calls[0][4]).toBe('chap-guid-9')
   })
 
-  it('forwards currentChapterId to the api when provided', async () => {
-    mockAsk.mockResolvedValueOnce({ answer: 'ok', citations: [], lastReadOrd: 0, insufficient: false })
-    const { result } = renderHook(() => useAsk(edition, 'chap-guid-4'))
+  it('surfaces a stream error event and clears streaming flag', async () => {
+    mockAskStream.mockImplementationOnce(async (_t, _q, _h, cb: Callbacks) => {
+      cb.onDelta('Partial ')
+      cb.onError('Ask service unavailable')
+    })
+    const { result } = renderHook(() => useAsk(edition))
 
     await act(() => result.current.ask('why?'))
 
-    await waitFor(() => expect(mockAsk).toHaveBeenCalled())
-    expect(mockAsk).toHaveBeenCalledWith('ed-1', 'why?', undefined, expect.anything(), 'chap-guid-4')
+    await waitFor(() => expect(result.current.error).toBe('Ask service unavailable'))
+    expect(result.current.history[0].answer).toBe('Partial ')
+    expect(result.current.history[0].streaming).toBe(false)
   })
 
-  it('routes a userbook target to askUserBook (not the catalog ask)', async () => {
-    mockAskUserBook.mockResolvedValueOnce({ answer: 'ok', citations: [], lastReadOrd: 0, insufficient: false })
-    const { result } = renderHook(() => useAsk(userbook, 'chap-guid-9'))
+  it('falls back to the JSON request when streaming is unsupported', async () => {
+    mockAskStream.mockRejectedValueOnce(new SseUnsupportedError())
+    mockAskJson.mockResolvedValueOnce({ answer: 'Plain answer.', citations: [citation], lastReadOrd: 1, insufficient: false })
+    const { result } = renderHook(() => useAsk(edition))
 
     await act(() => result.current.ask('why?'))
 
-    await waitFor(() => expect(mockAskUserBook).toHaveBeenCalled())
-    expect(mockAskUserBook).toHaveBeenCalledWith('ub-1', 'why?', undefined, expect.anything(), 'chap-guid-9')
-    expect(mockAsk).not.toHaveBeenCalled()
+    await waitFor(() => expect(result.current.history[0].answer).toBe('Plain answer.'))
+    expect(mockAskJson).toHaveBeenCalledTimes(1)
+    expect(result.current.error).toBeNull()
+  })
+
+  it('aborts the in-flight stream on unmount', async () => {
+    let captured: AbortSignal | undefined
+    mockAskStream.mockImplementationOnce(
+      (_t, _q, _h, cb: Callbacks) => {
+        captured = cb.signal
+        return new Promise(() => {}) // never resolves
+      },
+    )
+    const { result, unmount } = renderHook(() => useAsk(edition))
+
+    act(() => { void result.current.ask('why?') })
+    await waitFor(() => expect(captured).toBeDefined())
+    expect(captured!.aborted).toBe(false)
+
+    unmount()
+    expect(captured!.aborted).toBe(true)
   })
 
   it('no-ops without a target', async () => {
     const { result } = renderHook(() => useAsk(undefined))
     await act(() => result.current.ask('why?'))
-    expect(mockAsk).not.toHaveBeenCalled()
-    expect(mockAskUserBook).not.toHaveBeenCalled()
+    expect(mockAskStream).not.toHaveBeenCalled()
   })
 })
diff --git a/apps/web/src/hooks/useAsk.ts b/apps/web/src/hooks/useAsk.ts
index 9c38e2b2..f25270ea 100644
--- a/apps/web/src/hooks/useAsk.ts
+++ b/apps/web/src/hooks/useAsk.ts
@@ -1,35 +1,87 @@
 import { useState, useCallback, useRef, useEffect } from 'react'
-import { ask as askApi, askUserBook as askUserBookApi, type AskCitation, type AskTarget } from '../api/ask'
+import {
+  ask as askJsonApi,
+  askUserBook as askUserBookJsonApi,
+  askStream,
+  MAX_HISTORY_TURNS,
+  type AskCitation,
+  type AskTarget,
+  type AskTurnDto,
+} from '../api/ask'
 import { ApiError } from '../api/client'
+import { SseUnauthorizedError, SseUnsupportedError } from '../lib/sse'
 
 export interface AskTurn {
   question: string
   answer: string
   citations: AskCitation[]
   insufficient: boolean
+  /** True while the answer is still streaming in (token-by-token). */
+  streaming: boolean
+}
+
+/** Flattens the visible Q&A history into the user/assistant turn list the server expects. */
+function toDto(history: AskTurn[]): AskTurnDto[] {
+  const turns: AskTurnDto[] = []
+  for (const t of history) {
+    turns.push({ role: 'user', content: t.question })
+    if (t.answer) turns.push({ role: 'assistant', content: t.answer })
+  }
+  return turns
 }
 
 /**
- * Session "Ask this book" state (AI-026a): an in-memory Q&A history (not persisted), plus loading
- * and error. `ask` appends a turn; in-flight requests are aborted on a new question / unmount.
+ * Session "Ask this book" state (AI-026e): a streamed, multi-turn in-memory Q&A history (not
+ * persisted), plus loading and error. `ask` optimistically appends a turn, streams the answer
+ * token-by-token (`delta` → append; `done` → citations/insufficient), and sends the last 6 turns
+ * for context. In-flight requests abort on a new question / unmount.
  *
- * `target.kind` (AI-027 P2) routes the POST — catalog `/books/{id}/ask` vs user-upload
- * `/me/books/{id}/ask`.
+ * `target.kind` (AI-027 P2) routes the request — catalog `/books/{id}/ask` vs user-upload
+ * `/me/books/{id}/ask`. Browsers that can't stream fall back to the JSON single-turn endpoint.
  */
 export function useAsk(target: AskTarget | undefined, currentChapterId?: string) {
-  const id = target?.id
-  const kind = target?.kind
   const [history, setHistory] = useState<AskTurn[]>([])
   const [isLoading, setIsLoading] = useState(false)
   const [error, setError] = useState<string | null>(null)
   const abortRef = useRef<AbortController | null>(null)
+  const mountedRef = useRef(true)
 
-  useEffect(() => () => abortRef.current?.abort(), [])
+  useEffect(() => {
+    mountedRef.current = true
+    return () => {
+      mountedRef.current = false
+      abortRef.current?.abort()
+    }
+  }, [])
+
+  /** Append a fragment to the last (streaming) turn's answer. */
+  const appendDelta = useCallback((fragment: string) => {
+    setHistory(prev => {
+      if (prev.length === 0) return prev
+      const next = prev.slice()
+      const last = next[next.length - 1]
+      next[next.length - 1] = { ...last, answer: last.answer + fragment }
+      return next
+    })
+  }, [])
+
+  const finishTurn = useCallback(
+    (patch: Partial<AskTurn>) => {
+      setHistory(prev => {
+        if (prev.length === 0) return prev
+        const next = prev.slice()
+        const last = next[next.length - 1]
+        next[next.length - 1] = { ...last, ...patch, streaming: false }
+        return next
+      })
+    },
+    [],
+  )
 
   const ask = useCallback(
     async (question: string) => {
       const q = question.trim()
-      if (!q || !id || isLoading) return
+      if (!q || !target || isLoading) return
 
       abortRef.current?.abort()
       const ctrl = new AbortController()
@@ -37,22 +89,63 @@ export function useAsk(target: AskTarget | undefined, currentChapterId?: string)
       setIsLoading(true)
       setError(null)
 
+      // History to send is the last N turns *before* this question, flattened to messages.
+      const priorHistory = toDto(history.slice(-MAX_HISTORY_TURNS))
+      setHistory(prev => [
+        ...prev,
+        { question: q, answer: '', citations: [], insufficient: false, streaming: true },
+      ])
+
       try {
-        const fn = kind === 'userbook' ? askUserBookApi : askApi
-        const res = await fn(id, q, undefined, ctrl.signal, currentChapterId)
-        setHistory(prev => [
-          ...prev,
-          { question: q, answer: res.answer, citations: res.citations, insufficient: res.insufficient },
-        ])
+        await askStream(
+          target,
+          q,
+          priorHistory,
+          {
+            onDelta: appendDelta,
+            onDone: done => {
+              if (ctrl.signal.aborted) return
+              finishTurn({ citations: done.citations, insufficient: done.insufficient })
+            },
+            onError: msg => {
+              if (ctrl.signal.aborted) return
+              finishTurn({})
+              setError(msg)
+            },
+            signal: ctrl.signal,
+          },
+          currentChapterId,
+        )
+        if (!ctrl.signal.aborted) finishTurn({})
       } catch (err) {
         if ((err as { name?: string })?.name === 'AbortError') return
-        if (err instanceof ApiError && err.status === 401) setError('auth')
-        else setError(err instanceof Error ? err.message : 'Ask failed')
+        if (ctrl.signal.aborted) return
+
+        // No streaming support → one-shot JSON request (single-turn fallback).
+        if (err instanceof SseUnsupportedError) {
+          try {
+            const fn = target.kind === 'userbook' ? askUserBookJsonApi : askJsonApi
+            const res = await fn(target.id, q, undefined, ctrl.signal, currentChapterId)
+            if (ctrl.signal.aborted) return
+            finishTurn({ answer: res.answer, citations: res.citations, insufficient: res.insufficient })
+          } catch (fallbackErr) {
+            if ((fallbackErr as { name?: string })?.name === 'AbortError') return
+            finishTurn({})
+            if (fallbackErr instanceof ApiError && fallbackErr.status === 401) setError('auth')
+            else setError(fallbackErr instanceof Error ? fallbackErr.message : 'Ask failed')
+          }
+        } else if (err instanceof SseUnauthorizedError) {
+          finishTurn({})
+          setError('auth')
+        } else {
+          finishTurn({})
+          setError(err instanceof Error ? err.message : 'Ask failed')
+        }
       } finally {
-        if (abortRef.current === ctrl) setIsLoading(false)
+        if (abortRef.current === ctrl && mountedRef.current) setIsLoading(false)
       }
     },
-    [id, kind, isLoading, currentChapterId],
+    [target, isLoading, history, currentChapterId, appendDelta, finishTurn],
   )
 
   return { history, isLoading, error, ask }
diff --git a/apps/web/src/locales/en.json b/apps/web/src/locales/en.json
index b88eca93..3c6e471d 100644
--- a/apps/web/src/locales/en.json
+++ b/apps/web/src/locales/en.json
@@ -200,7 +200,14 @@
       "prepareCta": "Prepare this book for questions",
       "preparing": "Preparing this book… {{done}}/{{total}}",
       "indexFailed": "Preparation failed.",
-      "indexRetry": "Retry"
+      "indexRetry": "Retry",
+      "startersTitle": "Try asking",
+      "starters": {
+        "summary": "Summarize what I've read so far",
+        "characters": "Who are the main characters?",
+        "keyIdea": "Explain the key idea so far",
+        "attention": "What should I pay attention to?"
+      }
     },
     "studyBuddy": {
       "title": "Help me understand this",
diff --git a/apps/web/src/styles/reader.css b/apps/web/src/styles/reader.css
index 6752d3db..ac619130 100644
--- a/apps/web/src/styles/reader.css
+++ b/apps/web/src/styles/reader.css
@@ -2270,6 +2270,43 @@ html[data-theme="dark"] .vocab-translation-overlay__item {
   font-size: 14px;
   margin: 0;
 }
+/* Streaming caret on the in-progress answer (AI-026e) */
+.ask-panel__cursor {
+  display: inline-block;
+  width: 2px;
+  height: 1em;
+  margin-left: 2px;
+  vertical-align: text-bottom;
+  background: var(--reader-accent, #2563eb);
+  animation: stream-cursor-blink 1s steps(2, start) infinite;
+}
+/* Suggested starter questions on an empty thread (AI-026e) */
+.ask-panel__starters {
+  display: flex;
+  flex-direction: column;
+  gap: 8px;
+  margin-top: 8px;
+}
+.ask-panel__starters-title {
+  margin: 0 0 2px;
+  font-size: 12px;
+  text-transform: uppercase;
+  letter-spacing: 0.04em;
+  color: var(--reader-secondary);
+}
+.ask-panel__starter {
+  text-align: left;
+  font-size: 14px;
+  padding: 10px 12px;
+  border-radius: 10px;
+  border: 1px solid var(--reader-border);
+  background: var(--reader-bar-bg);
+  color: var(--reader-text);
+  cursor: pointer;
+}
+.ask-panel__starter:hover {
+  border-color: var(--reader-text);
+}
 .ask-panel__composer {
   flex: 0 0 auto;
   border-top: 1px solid var(--reader-border);
diff --git a/backend/src/Ai/TextStack.Ai.EvalSuite/RagEvalRunner.cs b/backend/src/Ai/TextStack.Ai.EvalSuite/RagEvalRunner.cs
index 15cb6166..1ce45dbe 100644
--- a/backend/src/Ai/TextStack.Ai.EvalSuite/RagEvalRunner.cs
+++ b/backend/src/Ai/TextStack.Ai.EvalSuite/RagEvalRunner.cs
@@ -153,7 +153,7 @@ private async Task<RagCitationSummary> JudgeCitationsAsync(
         {
             ct.ThrowIfCancellationRequested();
             // lastReadOrd is irrelevant here — chunks are supplied directly (no user gating).
-            var answer = await ask.AskFromChunksAsync(question, chunks, [], lastReadOrd: int.MaxValue, ct);
+            var answer = await ask.AskFromChunksAsync(question, chunks, [], [], lastReadOrd: int.MaxValue, ct);
             if (answer.Insufficient || answer.Citations.Count == 0)
                 continue;
             answersGenerated++;
diff --git a/backend/src/Api/Endpoints/AskEndpoints.cs b/backend/src/Api/Endpoints/AskEndpoints.cs
index c8890502..4a8d7674 100644
--- a/backend/src/Api/Endpoints/AskEndpoints.cs
+++ b/backend/src/Api/Endpoints/AskEndpoints.cs
@@ -1,5 +1,6 @@
 using Api.Extensions;
 using Api.Sites;
+using Application.Ai;
 using Application.Auth;
 using Application.Common.Interfaces;
 using Application.Rag;
@@ -11,14 +12,14 @@
 namespace Api.Endpoints;
 
 /// <summary>
-/// Public "Ask this book" (Phase 4 RAG, AI-025). Authenticated question over a catalog edition →
-/// spoiler-safe retrieval (<see cref="RagContextService"/>) → grounded answer with citations. JSON
-/// for now; SSE/token streaming lands in AI-028. The reader UI + citation deep-links are AI-026.
+/// Public "Ask this book" (Phase 4 RAG, AI-025; conversational upgrade AI-028). Authenticated question
+/// over a catalog edition → spoiler-safe retrieval (<see cref="RagContextService"/>) → grounded,
+/// conversational answer with citations. Content-negotiated: <c>Accept: text/event-stream</c> streams
+/// tokens (SSE — <c>delta</c>* then <c>done</c>); anything else returns the original JSON
+/// <see cref="AskResponse"/> (eval + mobile keep working). Multi-turn history is carried on the request.
 /// </summary>
 public static class AskEndpoints
 {
-    private const int PreviewChars = 200;
-
     public static void MapAskEndpoints(this WebApplication app)
     {
         app.MapPost("/books/{editionId:guid}/ask", Ask)
@@ -33,6 +34,7 @@ private static async Task<IResult> Ask(
         AuthService authService,
         IAppDbContext db,
         IServiceProvider services,
+        ILogger<Program> logger,
         CancellationToken ct)
     {
         var userId = httpContext.GetUserId(authService);
@@ -48,9 +50,11 @@ private static async Task<IResult> Ask(
 
         // RagAskService construction pulls in the embedder, which throws without an OpenAI key —
         // surface that as a clean 503 rather than a generic 500.
+        RagContextService context;
         RagAskService ask;
         try
         {
+            context = services.GetRequiredService<RagContextService>();
             ask = services.GetRequiredService<RagAskService>();
         }
         catch (InvalidOperationException)
@@ -58,17 +62,25 @@ private static async Task<IResult> Ask(
             return Results.Problem("Ask is not configured (no OpenAI key).", statusCode: 503);
         }
 
+        var k = request.K is > 0 ? request.K.Value : IRagService.DefaultK;
+        var history = RagAskHistory.Clamp(request.History);
+
+        if (AskSse.WantsSse(httpContext))
+        {
+            // Build the spoiler-safe context first (the gate/retrieval), then stream the answer.
+            var ctx = await context.BuildAsync(userId.Value, siteId, editionId, request.Question, k, request.CurrentChapterId, ct);
+            var noteTexts = ctx.Notes.Select(n => n.Text).ToList();
+            return AskSse.Stream(
+                httpContext,
+                ct2 => ask.StreamAsync(request.Question, ctx.Chunks, noteTexts, history, ctx.LastReadOrd, ct2),
+                logger);
+        }
+
         try
         {
-            var k = request.K is > 0 ? request.K.Value : IRagService.DefaultK;
             var answer = await ask.AskAsync(
-                userId.Value, siteId, editionId, request.Question, k, request.CurrentChapterId, ct);
-
-            var citations = answer.Citations.Select(c => new AskCitation(
-                c.Marker, c.Chunk.ChunkId, c.Chunk.ChapterId, c.Chunk.ChapterOrd,
-                c.Chunk.CharStart, c.Chunk.CharEnd, Preview(c.Chunk.Text))).ToList();
-
-            return Results.Ok(new AskResponse(answer.Answer, citations, answer.LastReadOrd, answer.Insufficient));
+                userId.Value, siteId, editionId, request.Question, k, request.CurrentChapterId, history, ct);
+            return Results.Ok(AskSse.ToResponse(answer));
         }
         catch (OperationCanceledException) when (ct.IsCancellationRequested)
         {
@@ -79,7 +91,4 @@ private static async Task<IResult> Ask(
             return Results.Problem("Ask is temporarily unavailable.", statusCode: 503);
         }
     }
-
-    private static string Preview(string text) =>
-        text.Length <= PreviewChars ? text : text[..PreviewChars] + "…";
 }
diff --git a/backend/src/Api/Endpoints/AskSse.cs b/backend/src/Api/Endpoints/AskSse.cs
new file mode 100644
index 00000000..87190d3c
--- /dev/null
+++ b/backend/src/Api/Endpoints/AskSse.cs
@@ -0,0 +1,144 @@
+using System.Net.ServerSentEvents;
+using System.Runtime.CompilerServices;
+using System.Text.Json;
+using Application.Rag;
+using Contracts.Books;
+
+namespace Api.Endpoints;
+
+/// <summary>
+/// Shared "Ask this book" plumbing for the catalog + user-book endpoints (AI-028): content negotiation,
+/// the JSON-response projection, and the SSE event stream. Mirrors <see cref="ExplainEndpoints"/>: an
+/// <c>Accept: text/event-stream</c> request gets <c>delta</c> events (one per token fragment) then a
+/// terminal <c>done</c> carrying <c>{ citations, lastReadOrd, insufficient }</c>; a failure mid-stream
+/// emits a terminal <c>error</c> event. Buffering is disabled so Cloudflare/nginx don't hold the stream.
+/// </summary>
+public static class AskSse
+{
+    private const int PreviewChars = 200;
+
+    // Match ASP.NET Core's default camelCase JSON (Results.Ok) so SSE + JSON citations are identical.
+    private static readonly JsonSerializerOptions DoneJson = new(JsonSerializerDefaults.Web);
+
+    /// <summary>True when the caller asked for an SSE token stream.</summary>
+    public static bool WantsSse(HttpContext httpContext) =>
+        httpContext.Request.Headers.Accept.Any(v =>
+            v is not null && v.Contains("text/event-stream", StringComparison.OrdinalIgnoreCase));
+
+    /// <summary>Projects the service answer onto the public JSON contract (citations → previews).</summary>
+    public static AskResponse ToResponse(AskAnswer answer)
+    {
+        var citations = answer.Citations.Select(ToCitation).ToList();
+        return new AskResponse(answer.Answer, citations, answer.LastReadOrd, answer.Insufficient);
+    }
+
+    /// <summary>
+    /// SSE response over a service <see cref="AskStreamItem"/> stream. Disables proxy buffering, then
+    /// relays text deltas as <c>delta</c> events and the terminal item as <c>done</c> (or <c>error</c>).
+    /// </summary>
+    public static IResult Stream(
+        HttpContext httpContext,
+        Func<CancellationToken, IAsyncEnumerable<AskStreamItem>> source,
+        ILogger logger)
+    {
+        // Cloudflare/nginx must not buffer the stream.
+        httpContext.Response.Headers["X-Accel-Buffering"] = "no";
+        httpContext.Response.Headers.CacheControl = "no-cache";
+
+        return TypedResults.ServerSentEvents(
+            StreamEventsAsync(source, ex => logger.LogError(ex, "Ask stream failed"), httpContext.RequestAborted));
+    }
+
+    /// <summary>
+    /// Maps service stream items to SSE items: a <c>delta</c> per text fragment, a terminal <c>done</c>
+    /// (serialized citations + gating), or a terminal <c>error</c>. Provider resolution can throw before
+    /// the first item — the service surfaces that as an <see cref="AskStreamItem.Error"/>. Pure over its
+    /// delegate so it's unit-tested directly.
+    /// </summary>
+    public static async IAsyncEnumerable<SseItem<string>> StreamEventsAsync(
+        Func<CancellationToken, IAsyncEnumerable<AskStreamItem>> source,
+        Action<Exception>? onException = null,
+        [EnumeratorCancellation] CancellationToken ct = default)
+    {
+        IAsyncEnumerable<AskStreamItem>? stream = null;
+        string? error = null;
+        try
+        {
+            stream = source(ct);
+        }
+        catch (Exception ex)
+        {
+            onException?.Invoke(ex);
+            error = "Ask is temporarily unavailable.";
+        }
+
+        if (error is not null)
+        {
+            yield return new SseItem<string>(error, "error");
+            yield break;
+        }
+
+        await using var e = stream!.GetAsyncEnumerator(ct);
+        while (true)
+        {
+            AskStreamItem item;
+            string? moveError = null;
+            try
+            {
+                if (!await e.MoveNextAsync())
+                    break;
+                item = e.Current;
+            }
+            catch (OperationCanceledException) when (ct.IsCancellationRequested)
+            {
+                throw; // client disconnected — nothing to emit
+            }
+            catch (Exception ex)
+            {
+                onException?.Invoke(ex);
+                moveError = "Ask is temporarily unavailable.";
+                item = new AskStreamItem();
+            }
+
+            if (moveError is not null)
+            {
+                yield return new SseItem<string>(moveError, "error");
+                yield break;
+            }
+
+            if (item.Error is { } err)
+            {
+                yield return new SseItem<string>(err, "error");
+                yield break;
+            }
+
+            if (item.TextDelta is { Length: > 0 } fragment)
+            {
+                yield return new SseItem<string>(fragment, "delta");
+            }
+            else if (item.Terminal is { } terminal)
+            {
+                yield return Done(terminal);
+            }
+        }
+    }
+
+    private static SseItem<string> Done(AskTerminal terminal)
+    {
+        var citations = terminal.Citations.Select(ToCitation).ToList();
+        var payload = JsonSerializer.Serialize(new
+        {
+            citations,
+            lastReadOrd = terminal.LastReadOrd,
+            insufficient = terminal.Insufficient,
+        }, DoneJson);
+        return new SseItem<string>(payload, "done");
+    }
+
+    private static AskCitation ToCitation(AskCitationSource c) =>
+        new(c.Marker, c.Chunk.ChunkId, c.Chunk.ChapterId, c.Chunk.ChapterOrd,
+            c.Chunk.CharStart, c.Chunk.CharEnd, Preview(c.Chunk.Text));
+
+    private static string Preview(string text) =>
+        text.Length <= PreviewChars ? text : text[..PreviewChars] + "…";
+}
diff --git a/backend/src/Api/Endpoints/UserBookAskEndpoints.cs b/backend/src/Api/Endpoints/UserBookAskEndpoints.cs
index 1c5c479c..45ff4ad7 100644
--- a/backend/src/Api/Endpoints/UserBookAskEndpoints.cs
+++ b/backend/src/Api/Endpoints/UserBookAskEndpoints.cs
@@ -1,4 +1,5 @@
 using Api.Extensions;
+using Application.Ai;
 using Application.Auth;
 using Application.Rag;
 using Contracts.Books;
@@ -8,18 +9,16 @@
 namespace Api.Endpoints;
 
 /// <summary>
-/// "Ask this book" over a USER-uploaded book (Phase 2). Owner-scoped: the authenticated user asks a
-/// question over their OWN uploaded document → per-user isolated retrieval
-/// (<see cref="UserBookRagContextService"/>) → grounded answer with citations
-/// (<see cref="RagAskService.AskFromChunksAsync"/>, shared with the catalog path). No spoiler gate —
-/// it's the user's own book — so the full book is in scope. Returns the same
-/// <see cref="AskResponse"/> contract as the catalog endpoint. <c>CurrentChapterId</c> on the request
-/// is accepted but ignored (no gate). 404 when the book isn't this user's.
+/// "Ask this book" over a USER-uploaded book (Phase 2; conversational upgrade AI-028). Owner-scoped:
+/// the authenticated user asks a question over their OWN uploaded document → per-user isolated retrieval
+/// (<see cref="UserBookRagContextService"/>) → grounded, conversational answer with citations
+/// (<see cref="RagAskService"/>, shared with the catalog path). No spoiler gate — it's the user's own
+/// book — so the full book is in scope. Content-negotiated SSE/JSON like the catalog endpoint;
+/// multi-turn history on the request. <c>CurrentChapterId</c> is accepted but ignored (no gate). 404
+/// when the book isn't this user's.
 /// </summary>
 public static class UserBookAskEndpoints
 {
-    private const int PreviewChars = 200;
-
     public static void MapUserBookAskEndpoints(this WebApplication app)
     {
         app.MapPost("/me/books/{id:guid}/ask", Ask)
@@ -33,6 +32,7 @@ private static async Task<IResult> Ask(
         HttpContext httpContext,
         AuthService authService,
         IServiceProvider services,
+        ILogger<Program> logger,
         CancellationToken ct)
     {
         var userId = httpContext.GetUserId(authService);
@@ -54,24 +54,28 @@ private static async Task<IResult> Ask(
             return Results.Problem("Ask is not configured (no OpenAI key).", statusCode: 503);
         }
 
+        var k = request.K is > 0 ? request.K.Value : IRagService.DefaultK;
+        var history = RagAskHistory.Clamp(request.History);
+
         try
         {
-            var k = request.K is > 0 ? request.K.Value : IRagService.DefaultK;
-
             // Ownership-scoped context build. Null => not this user's book (or taken down) → 404.
             var ctx = await context.BuildAsync(userId.Value, id, request.Question, k, ct);
             if (ctx is null) return Results.NotFound("Book not found");
 
             // Full-book retrieval (no gate), no private-notes corpus. lastReadOrd is 0 (unused for
-            // user books). AskFromChunksAsync handles the empty-chunks case (book not indexed yet) by
-            // returning an "insufficient" answer without an LLM call.
-            var answer = await ask.AskFromChunksAsync(request.Question, ctx.Chunks, [], lastReadOrd: 0, ct);
+            // user books). The service handles the empty-chunks case (book not indexed yet) without an
+            // LLM call (JSON: "insufficient" answer; SSE: friendly delta + insufficient terminal).
+            if (AskSse.WantsSse(httpContext))
+            {
+                return AskSse.Stream(
+                    httpContext,
+                    ct2 => ask.StreamAsync(request.Question, ctx.Chunks, [], history, lastReadOrd: 0, ct2),
+                    logger);
+            }
 
-            var citations = answer.Citations.Select(c => new AskCitation(
-                c.Marker, c.Chunk.ChunkId, c.Chunk.ChapterId, c.Chunk.ChapterOrd,
-                c.Chunk.CharStart, c.Chunk.CharEnd, Preview(c.Chunk.Text))).ToList();
-
-            return Results.Ok(new AskResponse(answer.Answer, citations, answer.LastReadOrd, answer.Insufficient));
+            var answer = await ask.AskFromChunksAsync(request.Question, ctx.Chunks, [], history, lastReadOrd: 0, ct);
+            return Results.Ok(AskSse.ToResponse(answer));
         }
         catch (OperationCanceledException) when (ct.IsCancellationRequested)
         {
@@ -82,7 +86,4 @@ private static async Task<IResult> Ask(
             return Results.Problem("Ask is temporarily unavailable.", statusCode: 503);
         }
     }
-
-    private static string Preview(string text) =>
-        text.Length <= PreviewChars ? text : text[..PreviewChars] + "…";
 }
diff --git a/backend/src/Api/appsettings.json b/backend/src/Api/appsettings.json
index 3f4025a6..354ba1e4 100644
--- a/backend/src/Api/appsettings.json
+++ b/backend/src/Api/appsettings.json
@@ -26,6 +26,9 @@
     "Explain": {
       "Model": "gpt-4.1-mini"
     },
+    "RagAsk": {
+      "Model": "gpt-4.1-mini"
+    },
     "Embedding": {
       "Model": "text-embedding-3-small"
     },
@@ -97,7 +100,7 @@
       "bookmeta": "ollama",
       "tagsuggestion": "ollama",
       "podcast.script": "openai",
-      "rag.ask": "openai"
+      "rag.ask": "openai-rag"
     },
     "Tracing": {
       "Sampling": {
diff --git a/backend/src/Application/Ai/RagAskHistory.cs b/backend/src/Application/Ai/RagAskHistory.cs
new file mode 100644
index 00000000..78ac3288
--- /dev/null
+++ b/backend/src/Application/Ai/RagAskHistory.cs
@@ -0,0 +1,74 @@
+using Contracts.Books;
+using TextStack.Ai.Core;
+
+namespace Application.Ai;
+
+/// <summary>
+/// Multi-turn "Ask this book" history handling (AI-028). The server is stateless — the client sends
+/// the prior conversation on each request — so this clamps the untrusted history (last N turns, each
+/// content-capped) and assembles the final <see cref="LlmMessage"/> sequence: the grounded excerpts/
+/// context block, then the clamped prior turns, then the new question last. Pure so it's unit-tested.
+/// </summary>
+public static class RagAskHistory
+{
+    /// <summary>How many trailing turns of history survive the clamp (defense against unbounded prompts).</summary>
+    public const int MaxTurns = 6;
+
+    /// <summary>Per-turn content cap in chars (a single pasted turn can't blow the prompt budget).</summary>
+    public const int MaxTurnChars = 4000;
+
+    private const string UserRole = "user";
+    private const string AssistantRole = "assistant";
+
+    /// <summary>
+    /// Defensive clamp over untrusted client history: drops blank/unknown-role turns, keeps only the
+    /// LAST <see cref="MaxTurns"/> turns, and truncates each turn's content to <see cref="MaxTurnChars"/>.
+    /// Roles are normalized to "user"/"assistant" (anything else is dropped).
+    /// </summary>
+    public static IReadOnlyList<AskTurnDto> Clamp(IReadOnlyList<AskTurnDto>? history)
+    {
+        if (history is null || history.Count == 0)
+            return [];
+
+        var cleaned = new List<AskTurnDto>(history.Count);
+        foreach (var turn in history)
+        {
+            if (turn is null || string.IsNullOrWhiteSpace(turn.Content))
+                continue;
+
+            var role = turn.Role?.Trim().ToLowerInvariant();
+            if (role != UserRole && role != AssistantRole)
+                continue;
+
+            var content = turn.Content.Length > MaxTurnChars ? turn.Content[..MaxTurnChars] : turn.Content;
+            cleaned.Add(new AskTurnDto(role, content));
+        }
+
+        if (cleaned.Count <= MaxTurns)
+            return cleaned;
+
+        return cleaned.GetRange(cleaned.Count - MaxTurns, MaxTurns);
+    }
+
+    /// <summary>
+    /// Assembles the chat messages for the gateway: a "user" message carrying the numbered excerpts +
+    /// the reader's notes (the grounding context block), then the clamped prior turns in order, then
+    /// the new question as the final "user" message. The system (companion) prompt is supplied
+    /// separately on the <see cref="LlmRequest"/>. <paramref name="history"/> is assumed already
+    /// clamped by <see cref="Clamp"/>.
+    /// </summary>
+    public static IReadOnlyList<LlmMessage> BuildMessages(
+        string contextBlock, IReadOnlyList<AskTurnDto> history, string question)
+    {
+        var messages = new List<LlmMessage>(history.Count + 2)
+        {
+            new(UserRole, contextBlock),
+        };
+
+        foreach (var turn in history)
+            messages.Add(new LlmMessage(turn.Role, turn.Content));
+
+        messages.Add(new LlmMessage(UserRole, question));
+        return messages;
+    }
+}
diff --git a/backend/src/Application/Ai/RagAskPrompt.cs b/backend/src/Application/Ai/RagAskPrompt.cs
index 172a34ed..09188891 100644
--- a/backend/src/Application/Ai/RagAskPrompt.cs
+++ b/backend/src/Application/Ai/RagAskPrompt.cs
@@ -12,32 +12,46 @@ namespace Application.Ai;
 public static class RagAskPrompt
 {
     public static string BuildSystemPrompt() =>
-        "You answer a reader's question about a book using ONLY the numbered excerpts provided. " +
-        "Write 2-4 sentences. Cite every claim with [n] referring to the excerpt numbers you used. " +
-        "If the excerpts do not contain the answer, say so plainly — do NOT use outside knowledge. " +
-        "No preface, no markdown.";
+        "You are a warm, thoughtful reading companion chatting with a reader about a book they are " +
+        "reading. Be friendly and conversational in tone. " +
+        "GROUNDING (strict, overrides everything below): the numbered excerpts below are your ONLY source " +
+        "of truth about this book. Every substantive statement you make about the book — its plot, " +
+        "characters, events, the author's words, AND any theme, symbolism, meaning, motive, or " +
+        "interpretation you offer — MUST be supported by those excerpts, and you MUST cite each such claim " +
+        "with [n] referring to the excerpt number(s) you used. Never use outside knowledge (the wider " +
+        "story, other works, the author's biography, plot you recall) to assert, explain, or interpret " +
+        "anything about this book, and never invent details that aren't in the excerpts. You may help the " +
+        "reader reflect, but only by reasoning from what the cited excerpts actually say — if a thought " +
+        "isn't supported there, don't present it as fact about the book. " +
+        "The ONLY messages you may answer without citing excerpts are a greeting or a question about what " +
+        "you can do (e.g. \"hi\", \"what can you do\"): respond warmly and invite a question about the book " +
+        "— do NOT cite anything and do NOT say the excerpts lack the answer. For anything else, if the " +
+        "excerpts don't cover it, say so gracefully (e.g. \"I don't see that in what you've read so far\") " +
+        "rather than guessing or filling the gap from memory. " +
+        "Keep replies short (a few sentences). No markdown, no preface.";
 
     /// <summary>
-    /// Numbered excerpts (1-based, matching the citation markers) + the reader's own notes, then the
-    /// question. <paramref name="chunks"/> order defines the [n] numbering.
+    /// The grounding context block: numbered excerpts (1-based, matching the citation markers) + the
+    /// reader's own notes. <paramref name="chunks"/> order defines the [n] numbering. The question is
+    /// NOT included — in the multi-turn assembly it's the final user message (see
+    /// <see cref="RagAskHistory.BuildMessages"/>).
     /// </summary>
-    public static string BuildUserPrompt(
-        string question, IReadOnlyList<RetrievedChunk> chunks, IReadOnlyList<string> notes)
+    public static string BuildContextBlock(IReadOnlyList<RetrievedChunk> chunks, IReadOnlyList<string> notes)
     {
         var sb = new StringBuilder();
-        sb.Append("Excerpts:\n");
+        sb.Append("Numbered excerpts from the part of the book the reader has reached " +
+                  "(cite these as [n] for any book fact):\n");
         for (var i = 0; i < chunks.Count; i++)
             sb.Append('[').Append(i + 1).Append("] (ch.").Append(chunks[i].ChapterOrd).Append(") ")
               .Append(chunks[i].Text).Append('\n');
 
         if (notes.Count > 0)
         {
-            sb.Append("\nYour notes:\n");
+            sb.Append("\nThe reader's own notes:\n");
             foreach (var note in notes)
                 sb.Append("- ").Append(note).Append('\n');
         }
 
-        sb.Append("\nQuestion: ").Append(question);
         return sb.ToString();
     }
 
diff --git a/backend/src/Application/DependencyInjection.cs b/backend/src/Application/DependencyInjection.cs
index b55b6680..cea7d10d 100644
--- a/backend/src/Application/DependencyInjection.cs
+++ b/backend/src/Application/DependencyInjection.cs
@@ -106,8 +106,17 @@ public static IServiceCollection AddApplication(this IServiceCollection services
                 sp.GetRequiredService<ILogger<global::TextStack.Ai.Llm.OpenAiLlmClient>>(),
                 sp.GetRequiredService<IConfiguration>()["OpenAI:Explain:Model"] ?? "gpt-4.1-mini"));
 
+        // "Ask this book" (rag.ask) runs a STRONGER conversational model than the nano default
+        // (OpenAI:RagAsk:Model, default gpt-4.1-mini) for the warm reading-companion experience.
+        // Routed per-feature (Ai:Routes:rag.ask → openai-rag) so translate / podcast stay on nano.
+        services.AddKeyedSingleton<global::TextStack.Ai.Core.ILlmService>("openai-rag-raw", (sp, key) =>
+            new global::TextStack.Ai.Llm.OpenAiLlmClient(
+                sp.GetRequiredService<IConfiguration>(),
+                sp.GetRequiredService<ILogger<global::TextStack.Ai.Llm.OpenAiLlmClient>>(),
+                sp.GetRequiredService<IConfiguration>()["OpenAI:RagAsk:Model"] ?? "gpt-4.1-mini"));
+
         // Decorated providers (keyed): TracingDecorator wraps each raw provider.
-        foreach (var providerKey in new[] { "openai", "ollama", "openai-judge", "openai-explain" })
+        foreach (var providerKey in new[] { "openai", "ollama", "openai-judge", "openai-explain", "openai-rag" })
         {
             services.AddKeyedSingleton<global::TextStack.Ai.Core.ILlmService>(providerKey, (sp, key) =>
                 new global::TextStack.Ai.Llm.TracingDecorator(
diff --git a/backend/src/Application/Rag/RagAskService.cs b/backend/src/Application/Rag/RagAskService.cs
index 9fe63d82..e91c4008 100644
--- a/backend/src/Application/Rag/RagAskService.cs
+++ b/backend/src/Application/Rag/RagAskService.cs
@@ -1,4 +1,7 @@
+using System.Runtime.CompilerServices;
+using System.Text;
 using Application.Ai;
+using Contracts.Books;
 using TextStack.Ai.Core;
 using TextStack.Ai.Rag;
 
@@ -17,6 +20,23 @@ public record AskAnswer(
     int LastReadOrd,
     bool Insufficient);
 
+/// <summary>
+/// A streaming chunk of an "Ask this book" answer (AI-028). Exactly one field is set per item: a
+/// <see cref="TextDelta"/> partial text fragment as it arrives, or the terminal <see cref="Terminal"/>
+/// (citations + lastReadOrd + insufficient) that closes the stream. An <see cref="Error"/> item is the
+/// terminal item on failure.
+/// </summary>
+public record AskStreamItem(
+    string? TextDelta = null,
+    AskTerminal? Terminal = null,
+    string? Error = null);
+
+/// <summary>The terminal payload of a streamed answer: resolved citations + gating metadata.</summary>
+public record AskTerminal(
+    IReadOnlyList<AskCitationSource> Citations,
+    int LastReadOrd,
+    bool Insufficient);
+
 /// <summary>
 /// "Ask this book" (Phase 4 RAG, AI-025). Interface so the AI-027 citation eval can drive generation
 /// against pre-retrieved chunks (no reading user) and tests can fake it.
@@ -27,61 +47,159 @@ public interface IRagAskService
     /// Spoiler-safe ask for a real reader: gates context by their reading progress. Pass
     /// <paramref name="currentChapterId"/> (the chapter open in the reader) so it counts as read even
     /// before the debounced progress-save persists — resolved server-side, ignored if not this edition.
+    /// <paramref name="history"/> carries prior conversation turns (multi-turn memory); retrieval still
+    /// runs on <paramref name="question"/> only.
     /// </summary>
     Task<AskAnswer> AskAsync(
         Guid userId, Guid siteId, Guid editionId, string question, int k,
-        Guid? currentChapterId, CancellationToken ct);
+        Guid? currentChapterId, IReadOnlyList<AskTurnDto> history, CancellationToken ct);
 
     /// <summary>
     /// Generate a grounded, cited answer from an already-retrieved chunk set — bypasses user-progress
     /// gating (the caller supplies the chunks). Used by the AI-027 citation eval over the golden set.
+    /// <paramref name="history"/> is the prior conversation (pass <c>[]</c> for a single-shot grounding eval).
     /// </summary>
     Task<AskAnswer> AskFromChunksAsync(
         string question, IReadOnlyList<RetrievedChunk> chunks, IReadOnlyList<string> noteTexts,
-        int lastReadOrd, CancellationToken ct);
+        IReadOnlyList<AskTurnDto> history, int lastReadOrd, CancellationToken ct);
 }
 
 /// <summary>
 /// "Ask this book" (Phase 4 RAG, AI-025): retrieves spoiler-safe context via
-/// <see cref="RagContextService"/>, generates a grounded 2-4 sentence answer with citations via the
-/// LLM gateway (FeatureTag <c>rag.ask</c>), and resolves the cited excerpts. Reused by the AI-027 eval.
+/// <see cref="RagContextService"/>, generates a grounded, conversational answer with citations via the
+/// LLM gateway (FeatureTag <c>rag.ask</c>), and resolves the cited excerpts. Multi-turn (AI-028):
+/// prior conversation turns are clamped + threaded into the chat. Streaming via <see cref="StreamAsync"/>.
+/// Reused by the AI-027 eval.
 /// </summary>
 public sealed class RagAskService(RagContextService context, ILlmService llm) : IRagAskService
 {
     public const string FeatureTag = "rag.ask";
-    private const int MaxOutputTokens = 320;
+
+    // Raised for conversational length (AI-028: companion tone, multi-sentence replies).
+    private const int MaxOutputTokens = 400;
 
     private const string InsufficientMessage =
-        "You haven't read enough of this book yet for me to answer from it. Keep reading and ask again.";
+        "I don't see enough of this book in what you've read so far to answer that yet. " +
+        "Keep reading and ask me again — I'll be here.";
 
     public async Task<AskAnswer> AskAsync(
         Guid userId, Guid siteId, Guid editionId, string question, int k,
-        Guid? currentChapterId, CancellationToken ct)
+        Guid? currentChapterId, IReadOnlyList<AskTurnDto> history, CancellationToken ct)
     {
         var ctx = await context.BuildAsync(userId, siteId, editionId, question, k, currentChapterId, ct);
         var noteTexts = ctx.Notes.Select(n => n.Text).ToList();
-        return await AskFromChunksAsync(question, ctx.Chunks, noteTexts, ctx.LastReadOrd, ct);
+        return await AskFromChunksAsync(question, ctx.Chunks, noteTexts, history, ctx.LastReadOrd, ct);
     }
 
     public async Task<AskAnswer> AskFromChunksAsync(
         string question, IReadOnlyList<RetrievedChunk> chunks, IReadOnlyList<string> noteTexts,
-        int lastReadOrd, CancellationToken ct)
+        IReadOnlyList<AskTurnDto> history, int lastReadOrd, CancellationToken ct)
     {
         // No readable context → answer plainly without spending an LLM call.
         if (chunks.Count == 0)
             return new AskAnswer(InsufficientMessage, [], lastReadOrd, Insufficient: true);
 
-        var request = new LlmRequest(
+        var request = BuildRequest(question, chunks, noteTexts, history);
+        var response = await llm.CompleteAsync(request, ct);
+
+        var citations = ResolveCitations(response.Text, chunks);
+        return new AskAnswer(response.Text, citations, lastReadOrd, Insufficient: false);
+    }
+
+    /// <summary>
+    /// Streaming "Ask this book" (AI-028): same context build + gate as <see cref="AskFromChunksAsync"/>,
+    /// then streams the answer as <see cref="AskStreamItem"/> text deltas, accumulating server-side; on
+    /// completion runs the EXISTING <c>[n]</c> citation parse over the full text and emits a single
+    /// terminal item with resolved citations. Empty chunks → one friendly delta + an insufficient
+    /// terminal (no model call). A mid-stream failure emits an <see cref="AskStreamItem.Error"/> terminal.
+    /// </summary>
+    public async IAsyncEnumerable<AskStreamItem> StreamAsync(
+        string question, IReadOnlyList<RetrievedChunk> chunks, IReadOnlyList<string> noteTexts,
+        IReadOnlyList<AskTurnDto> history, int lastReadOrd,
+        [EnumeratorCancellation] CancellationToken ct = default)
+    {
+        // No readable context → friendly message + insufficient terminal, no LLM call.
+        if (chunks.Count == 0)
+        {
+            yield return new AskStreamItem(TextDelta: InsufficientMessage);
+            yield return new AskStreamItem(Terminal: new AskTerminal([], lastReadOrd, Insufficient: true));
+            yield break;
+        }
+
+        var request = BuildRequest(question, chunks, noteTexts, history);
+
+        var text = new StringBuilder();
+        string? error = null;
+
+        // Provider resolution happens inside StreamAsync (gateway → keyed provider ctor) and can throw
+        // on a keyless host — surface that as an error terminal, not a broken stream.
+        IAsyncEnumerable<LlmDelta>? stream = null;
+        try
+        {
+            stream = llm.StreamAsync(request, ct);
+        }
+        catch (Exception)
+        {
+            error = "Ask is temporarily unavailable.";
+        }
+
+        if (error is null)
+        {
+            await using var e = stream!.GetAsyncEnumerator(ct);
+            while (true)
+            {
+                LlmDelta delta;
+                try
+                {
+                    if (!await e.MoveNextAsync())
+                        break;
+                    delta = e.Current;
+                }
+                catch (OperationCanceledException) when (ct.IsCancellationRequested)
+                {
+                    throw; // client disconnected — nothing to emit
+                }
+                catch (Exception)
+                {
+                    error = "Ask is temporarily unavailable.";
+                    break;
+                }
+
+                if (delta.TextDelta is { Length: > 0 } fragment)
+                {
+                    text.Append(fragment);
+                    yield return new AskStreamItem(TextDelta: fragment);
+                }
+            }
+        }
+
+        if (error is not null)
+        {
+            yield return new AskStreamItem(Error: error);
+            yield break;
+        }
+
+        var citations = ResolveCitations(text.ToString(), chunks);
+        yield return new AskStreamItem(Terminal: new AskTerminal(citations, lastReadOrd, Insufficient: false));
+    }
+
+    private static LlmRequest BuildRequest(
+        string question, IReadOnlyList<RetrievedChunk> chunks, IReadOnlyList<string> noteTexts,
+        IReadOnlyList<AskTurnDto> history)
+    {
+        var contextBlock = RagAskPrompt.BuildContextBlock(chunks, noteTexts);
+        var messages = RagAskHistory.BuildMessages(contextBlock, history, question);
+        return new LlmRequest(
             SystemPrompt: RagAskPrompt.BuildSystemPrompt(),
-            Messages: [new LlmMessage("user", RagAskPrompt.BuildUserPrompt(question, chunks, noteTexts))],
+            Messages: messages,
             MaxOutputTokens: MaxOutputTokens,
             FeatureTag: FeatureTag);
+    }
 
-        var response = await llm.CompleteAsync(request, ct);
-
-        var markers = RagAskPrompt.ParseCitations(response.Text, chunks.Count);
-        var citations = markers.Select(n => new AskCitationSource(n, chunks[n - 1])).ToList();
-
-        return new AskAnswer(response.Text, citations, lastReadOrd, Insufficient: false);
+    private static IReadOnlyList<AskCitationSource> ResolveCitations(
+        string answer, IReadOnlyList<RetrievedChunk> chunks)
+    {
+        var markers = RagAskPrompt.ParseCitations(answer, chunks.Count);
+        return markers.Select(n => new AskCitationSource(n, chunks[n - 1])).ToList();
     }
 }
diff --git a/backend/src/Contracts/Books/AskDtos.cs b/backend/src/Contracts/Books/AskDtos.cs
index 0d4e0c9c..b7850837 100644
--- a/backend/src/Contracts/Books/AskDtos.cs
+++ b/backend/src/Contracts/Books/AskDtos.cs
@@ -1,12 +1,25 @@
 namespace Contracts.Books;
 
+/// <summary>
+/// One prior turn in a multi-turn "Ask this book" conversation. <see cref="Role"/> is "user" or
+/// "assistant". History is sent by the client (the server is stateless); it's clamped server-side
+/// (last few turns, each content-capped) before reaching the LLM.
+/// </summary>
+public record AskTurnDto(string Role, string Content);
+
 /// <summary>
 /// "Ask this book" request (AI-025). <see cref="K"/> overrides the default retrieval count.
 /// <see cref="CurrentChapterId"/> is the chapter the reader has open right now; the spoiler gate
 /// counts it as read (max with persisted progress) so "ask about what I'm reading" works before the
 /// debounced progress-save fires. Resolved server-side and ignored if it isn't part of this edition.
+/// <see cref="History"/> carries the prior conversation turns for multi-turn memory (AI-028);
+/// retrieval still runs on <see cref="Question"/> only.
 /// </summary>
-public record AskRequest(string Question, int? K = null, Guid? CurrentChapterId = null);
+public record AskRequest(
+    string Question,
+    int? K = null,
+    Guid? CurrentChapterId = null,
+    IReadOnlyList<AskTurnDto>? History = null);
 
 /// <summary>
 /// A cited source for an answer. <see cref="Marker"/> is the <c>[n]</c> number in the answer text;
diff --git a/backend/src/Infrastructure/Persistence/ModelRegistrySeeder.cs b/backend/src/Infrastructure/Persistence/ModelRegistrySeeder.cs
index fd948ed4..18dfefde 100644
--- a/backend/src/Infrastructure/Persistence/ModelRegistrySeeder.cs
+++ b/backend/src/Infrastructure/Persistence/ModelRegistrySeeder.cs
@@ -23,7 +23,7 @@ private static readonly (string Feature, string Provider, string Model)[] Primar
         ("bookmeta", "ollama", "gemma4:e2b"),
         ("tagsuggestion", "ollama", "gemma4:e2b"),
         ("podcast.script", "openai", "gpt-4.1-nano"),
-        ("rag.ask", "openai", "gpt-4.1-nano"),
+        ("rag.ask", "openai-rag", "gpt-4.1-mini"),
     ];
 
     public static async Task SeedAsync(AppDbContext db, CancellationToken ct = default)
diff --git a/packages/shared/src/types/api.ts b/packages/shared/src/types/api.ts
index 6e8e932a..6e47e33c 100644
--- a/packages/shared/src/types/api.ts
+++ b/packages/shared/src/types/api.ts
@@ -441,3 +441,12 @@ export interface AskResponse {
   lastReadOrd: number
   insufficient: boolean
 }
+
+/**
+ * One prior turn of the conversation, sent back to the server for multi-turn "Ask this book"
+ * (AI-026e). The client bounds the history (last 6 turns) before sending.
+ */
+export interface AskTurnDto {
+  role: 'user' | 'assistant'
+  content: string
+}
diff --git a/tests/TextStack.AiEvals/RagEvalRunnerTests.cs b/tests/TextStack.AiEvals/RagEvalRunnerTests.cs
index e27dabde..16a1bef4 100644
--- a/tests/TextStack.AiEvals/RagEvalRunnerTests.cs
+++ b/tests/TextStack.AiEvals/RagEvalRunnerTests.cs
@@ -64,11 +64,11 @@ public Task<IReadOnlyList<RetrievedChunk>> RetrieveUserBookAsync(
     /// <summary>Echoes back a one-citation answer pointing at the first supplied chunk.</summary>
     private sealed class FakeAsk : IRagAskService
     {
-        public Task<AskAnswer> AskAsync(Guid u, Guid s, Guid e, string q, int k, Guid? currentChapterId, CancellationToken ct) =>
+        public Task<AskAnswer> AskAsync(Guid u, Guid s, Guid e, string q, int k, Guid? currentChapterId, IReadOnlyList<Contracts.Books.AskTurnDto> history, CancellationToken ct) =>
             throw new NotSupportedException();
 
         public Task<AskAnswer> AskFromChunksAsync(
-            string question, IReadOnlyList<RetrievedChunk> chunks, IReadOnlyList<string> notes, int lastReadOrd, CancellationToken ct)
+            string question, IReadOnlyList<RetrievedChunk> chunks, IReadOnlyList<string> notes, IReadOnlyList<Contracts.Books.AskTurnDto> history, int lastReadOrd, CancellationToken ct)
         {
             var citations = chunks.Count == 0
                 ? Array.Empty<AskCitationSource>()
diff --git a/tests/TextStack.IntegrationTests/AskEndpointTests.cs b/tests/TextStack.IntegrationTests/AskEndpointTests.cs
index a7928400..9499949f 100644
--- a/tests/TextStack.IntegrationTests/AskEndpointTests.cs
+++ b/tests/TextStack.IntegrationTests/AskEndpointTests.cs
@@ -1,4 +1,5 @@
 using System.Net;
+using System.Net.Http.Headers;
 using System.Net.Http.Json;
 
 namespace TextStack.IntegrationTests;
@@ -46,4 +47,57 @@ public async Task Ask_WithAuth_Returns200OrSkips()
             "endpoint not reachable (no edition / key / corpus)");
         Assert.Equal(HttpStatusCode.OK, response.StatusCode);
     }
+
+    [Fact]
+    public async Task Ask_WithHistory_JsonPathReturnsAskResponse()
+    {
+        // AI-028: the JSON contract is unchanged when multi-turn history is supplied.
+        Assert.SkipUnless(_fixture.IsAuthenticated, "auth unavailable");
+
+        var request = _fixture.CreateRequest(HttpMethod.Post, $"/books/{SomeEdition}/ask");
+        request.Content = JsonContent.Create(new
+        {
+            question = "and what happens next?",
+            history = new[]
+            {
+                new { role = "user", content = "who is the main character?" },
+                new { role = "assistant", content = "The protagonist is introduced early [1]." },
+            },
+        });
+        var response = await _fixture.Client.SendAsync(request, TestContext.Current.CancellationToken);
+
+        Assert.SkipWhen(
+            IntegrationSkip.Unavailable(response) || response.StatusCode == HttpStatusCode.ServiceUnavailable,
+            "endpoint not reachable (no edition / key / corpus)");
+        Assert.Equal(HttpStatusCode.OK, response.StatusCode);
+        Assert.Equal("application/json", response.Content.Headers.ContentType?.MediaType);
+    }
+
+    [Fact]
+    public async Task Ask_AcceptEventStream_ReturnsSseWithDeltas()
+    {
+        // AI-028: content-negotiated SSE — Accept: text/event-stream → text/event-stream body with
+        // at least one `delta` event (and a terminal `done`). Skips when key/edition/corpus missing.
+        Assert.SkipUnless(_fixture.IsAuthenticated, "auth unavailable");
+
+        var request = _fixture.CreateRequest(HttpMethod.Post, $"/books/{SomeEdition}/ask");
+        request.Content = JsonContent.Create(new { question = "what happens at the start?" });
+        request.Headers.Accept.Clear();
+        request.Headers.Accept.Add(new MediaTypeWithQualityHeaderValue("text/event-stream"));
+
+        var response = await _fixture.Client.SendAsync(
+            request, HttpCompletionOption.ResponseHeadersRead, TestContext.Current.CancellationToken);
+
+        Assert.SkipWhen(
+            IntegrationSkip.Unavailable(response) || response.StatusCode == HttpStatusCode.ServiceUnavailable,
+            "endpoint not reachable (no edition / key / corpus)");
+        Assert.Equal(HttpStatusCode.OK, response.StatusCode);
+        Assert.Equal("text/event-stream", response.Content.Headers.ContentType?.MediaType);
+
+        var body = await response.Content.ReadAsStringAsync(TestContext.Current.CancellationToken);
+        // The spoiler gate / model may yield the friendly "insufficient" message, but it still streams
+        // as a `delta` then `done` — assert the SSE framing, not the content.
+        Assert.Contains("event: delta", body);
+        Assert.Contains("event: done", body);
+    }
 }
diff --git a/tests/TextStack.UnitTests/AskSseTests.cs b/tests/TextStack.UnitTests/AskSseTests.cs
new file mode 100644
index 00000000..b12da794
--- /dev/null
+++ b/tests/TextStack.UnitTests/AskSseTests.cs
@@ -0,0 +1,103 @@
+using System.Net.ServerSentEvents;
+using Api.Endpoints;
+using Application.Rag;
+using TextStack.Ai.Rag;
+
+namespace TextStack.UnitTests;
+
+/// <summary>
+/// AI-028 — the Ask SSE event stream (<see cref="AskSse.StreamEventsAsync"/>), driven through a fake
+/// <see cref="AskStreamItem"/> source: text deltas then a terminal done-with-citations; the empty-chunks
+/// case (friendly delta + insufficient done); a service-level error item; and a mid-stream throw → error.
+/// </summary>
+public class AskSseTests
+{
+    private static RetrievedChunk Chunk() =>
+        new(Guid.NewGuid(), Guid.NewGuid(), 2, 0, "excerpt text", 0, 12, 0.9);
+
+    private static async IAsyncEnumerable<AskStreamItem> Items(params AskStreamItem[] items)
+    {
+        foreach (var i in items)
+        {
+            await Task.Yield();
+            yield return i;
+        }
+    }
+
+    private static async IAsyncEnumerable<AskStreamItem> Boom()
+    {
+        await Task.Yield();
+        yield return new AskStreamItem(TextDelta: "par");
+        throw new InvalidOperationException("provider died");
+    }
+
+    private static async Task<List<SseItem<string>>> Collect(
+        Func<CancellationToken, IAsyncEnumerable<AskStreamItem>> source)
+    {
+        var list = new List<SseItem<string>>();
+        await foreach (var item in AskSse.StreamEventsAsync(source, onException: null, ct: TestContext.Current.CancellationToken))
+            list.Add(item);
+        return list;
+    }
+
+    [Fact]
+    public async Task Stream_DeltasThenTerminalDoneWithCitations()
+    {
+        var chunk = Chunk();
+        var events = await Collect(_ => Items(
+            new AskStreamItem(TextDelta: "Hel"),
+            new AskStreamItem(TextDelta: "lo [1]"),
+            new AskStreamItem(Terminal: new AskTerminal([new AskCitationSource(1, chunk)], LastReadOrd: 4, Insufficient: false))));
+
+        Assert.Equal(["delta", "delta", "done"], events.Select(e => e.EventType));
+        Assert.Equal("Hel", events[0].Data);
+        Assert.Equal("lo [1]", events[1].Data);
+
+        var done = events[^1].Data;
+        Assert.Contains("\"insufficient\":false", done);
+        Assert.Contains("\"lastReadOrd\":4", done);
+        Assert.Contains("\"marker\":1", done.Replace(" ", ""));
+        Assert.Contains(chunk.ChunkId.ToString(), done);
+    }
+
+    [Fact]
+    public async Task Stream_EmptyChunks_FriendlyDeltaThenInsufficientDone()
+    {
+        // Mirrors RagAskService.StreamAsync's no-chunks path: one delta + insufficient terminal, no citations.
+        var events = await Collect(_ => Items(
+            new AskStreamItem(TextDelta: "I don't see enough of this book yet."),
+            new AskStreamItem(Terminal: new AskTerminal([], LastReadOrd: 0, Insufficient: true))));
+
+        Assert.Equal(["delta", "done"], events.Select(e => e.EventType));
+        Assert.Contains("\"insufficient\":true", events[^1].Data);
+        Assert.Contains("\"citations\":[]", events[^1].Data.Replace(" ", ""));
+    }
+
+    [Fact]
+    public async Task Stream_ServiceErrorItem_EmitsTerminalError()
+    {
+        var events = await Collect(_ => Items(
+            new AskStreamItem(TextDelta: "partial"),
+            new AskStreamItem(Error: "Ask is temporarily unavailable.")));
+
+        Assert.Equal(["delta", "error"], events.Select(e => e.EventType));
+        Assert.Equal("Ask is temporarily unavailable.", events[^1].Data);
+    }
+
+    [Fact]
+    public async Task Stream_MidStreamThrow_EmitsErrorAsTerminal()
+    {
+        var events = await Collect(_ => Boom());
+
+        Assert.Equal(["delta", "error"], events.Select(e => e.EventType));
+    }
+
+    [Fact]
+    public async Task Stream_SourceFactoryThrows_EmitsError()
+    {
+        var events = await Collect(_ => throw new InvalidOperationException("no key"));
+
+        var item = Assert.Single(events);
+        Assert.Equal("error", item.EventType);
+    }
+}
diff --git a/tests/TextStack.UnitTests/RagAskHistoryTests.cs b/tests/TextStack.UnitTests/RagAskHistoryTests.cs
new file mode 100644
index 00000000..ab0b0c82
--- /dev/null
+++ b/tests/TextStack.UnitTests/RagAskHistoryTests.cs
@@ -0,0 +1,127 @@
+using Application.Ai;
+using Contracts.Books;
+
+namespace TextStack.UnitTests;
+
+/// <summary>
+/// AI-028 — multi-turn "Ask this book" history: the defensive clamp (last-6 + per-turn content cap +
+/// role normalization) and the chat-message assembly order (system supplied separately → context →
+/// prior turns → new question last).
+/// </summary>
+public class RagAskHistoryTests
+{
+    private static AskTurnDto User(string c) => new("user", c);
+    private static AskTurnDto Assistant(string c) => new("assistant", c);
+
+    [Fact]
+    public void Clamp_Null_ReturnsEmpty() => Assert.Empty(RagAskHistory.Clamp(null));
+
+    [Fact]
+    public void Clamp_KeepsOnlyLastSixTurns()
+    {
+        var history = Enumerable.Range(0, 10).Select(i => User($"turn {i}")).ToList();
+
+        var clamped = RagAskHistory.Clamp(history);
+
+        Assert.Equal(RagAskHistory.MaxTurns, clamped.Count);
+        // The LAST six survive (turns 4..9), most recent context retained.
+        Assert.Equal("turn 4", clamped[0].Content);
+        Assert.Equal("turn 9", clamped[^1].Content);
+    }
+
+    [Fact]
+    public void Clamp_TruncatesPerTurnContent()
+    {
+        var huge = new string('x', RagAskHistory.MaxTurnChars + 500);
+
+        var clamped = RagAskHistory.Clamp([User(huge)]);
+
+        Assert.Equal(RagAskHistory.MaxTurnChars, Assert.Single(clamped).Content.Length);
+    }
+
+    [Fact]
+    public void Clamp_DropsBlankAndUnknownRoles_NormalizesRole()
+    {
+        var clamped = RagAskHistory.Clamp(
+        [
+            new AskTurnDto("USER", "keep me"),
+            new AskTurnDto("system", "drop: bad role"),
+            new AskTurnDto("assistant", "   "),
+            new AskTurnDto("Assistant", "keep assistant"),
+        ]);
+
+        Assert.Equal(2, clamped.Count);
+        Assert.Equal("user", clamped[0].Role);
+        Assert.Equal("keep me", clamped[0].Content);
+        Assert.Equal("assistant", clamped[1].Role);
+        Assert.Equal("keep assistant", clamped[1].Content);
+    }
+
+    [Fact]
+    public void BuildMessages_Order_ContextThenHistoryThenQuestion()
+    {
+        var messages = RagAskHistory.BuildMessages(
+            "CONTEXT BLOCK",
+            [User("earlier question"), Assistant("earlier answer [1]")],
+            "the new question");
+
+        // [0] = context (user role), [1..n] = prior turns in order, [last] = new question (user role).
+        Assert.Equal(4, messages.Count);
+
+        Assert.Equal("user", messages[0].Role);
+        Assert.Equal("CONTEXT BLOCK", messages[0].Content);
+
+        Assert.Equal("user", messages[1].Role);
+        Assert.Equal("earlier question", messages[1].Content);
+
+        Assert.Equal("assistant", messages[2].Role);
+        Assert.Equal("earlier answer [1]", messages[2].Content);
+
+        Assert.Equal("user", messages[^1].Role);
+        Assert.Equal("the new question", messages[^1].Content);
+    }
+
+    [Fact]
+    public void BuildMessages_EmptyHistory_ContextThenQuestionOnly()
+    {
+        var messages = RagAskHistory.BuildMessages("CTX", [], "Q?");
+
+        Assert.Equal(2, messages.Count);
+        Assert.Equal("CTX", messages[0].Content);
+        Assert.Equal("Q?", messages[1].Content);
+    }
+
+    [Fact]
+    public void Clamp_AbusiveFlood_BoundedToLastSix_EachContentCapped()
+    {
+        // Malicious client: 1000 turns, each 10MB. Server-side clamp must bound both axes regardless of
+        // what the client claims to send (cost / context-window defense — NOT just a UI slice).
+        var huge = new string('z', 10 * 1024 * 1024);
+        var flood = Enumerable.Range(0, 1000)
+            .Select(i => i % 2 == 0 ? User(huge) : Assistant(huge))
+            .ToList();
+
+        var clamped = RagAskHistory.Clamp(flood);
+
+        Assert.Equal(RagAskHistory.MaxTurns, clamped.Count);
+        Assert.All(clamped, t => Assert.Equal(RagAskHistory.MaxTurnChars, t.Content.Length));
+    }
+
+    [Fact]
+    public void BuildMessages_MalformedHistory_TwoUsersInARow_DoesNotThrow_PreservesOrder()
+    {
+        // A crafted history that doesn't alternate (two user turns, assistant-first elsewhere) must not
+        // break assembly — we pass it through verbatim after the context block; the LLM tolerates it.
+        var messages = RagAskHistory.BuildMessages(
+            "CTX",
+            [User("first"), User("second-in-a-row"), Assistant("reply")],
+            "now what?");
+
+        Assert.Equal(5, messages.Count);
+        Assert.Equal("CTX", messages[0].Content);
+        Assert.Equal("first", messages[1].Content);
+        Assert.Equal("second-in-a-row", messages[2].Content);
+        Assert.Equal("reply", messages[3].Content);
+        Assert.Equal("now what?", messages[^1].Content);
+    }
+}
diff --git a/tests/TextStack.UnitTests/RagAskPromptTests.cs b/tests/TextStack.UnitTests/RagAskPromptTests.cs
index c03e7b34..b7f64f65 100644
--- a/tests/TextStack.UnitTests/RagAskPromptTests.cs
+++ b/tests/TextStack.UnitTests/RagAskPromptTests.cs
@@ -9,28 +9,82 @@ private static RetrievedChunk Chunk(int chapterOrd, string text) =>
         new(Guid.NewGuid(), Guid.NewGuid(), chapterOrd, 0, text, 0, text.Length, 0.9);
 
     [Fact]
-    public void BuildUserPrompt_NumbersExcerptsAndAppendsQuestion()
+    public void BuildContextBlock_NumbersExcerpts_NoQuestionEmbedded()
     {
-        var prompt = RagAskPrompt.BuildUserPrompt(
-            "How does it work?",
+        // The question is the final user message in the multi-turn assembly — it must NOT be baked
+        // into the context block.
+        var block = RagAskPrompt.BuildContextBlock(
             [Chunk(3, "first excerpt"), Chunk(5, "second excerpt")],
             []);
 
-        Assert.Contains("[1] (ch.3) first excerpt", prompt);
-        Assert.Contains("[2] (ch.5) second excerpt", prompt);
-        Assert.Contains("Question: How does it work?", prompt);
-        Assert.DoesNotContain("Your notes:", prompt);
+        Assert.Contains("[1] (ch.3) first excerpt", block);
+        Assert.Contains("[2] (ch.5) second excerpt", block);
+        Assert.DoesNotContain("Question:", block);
+        Assert.DoesNotContain("reader's own notes", block);
     }
 
     [Fact]
-    public void BuildUserPrompt_WithNotes_IncludesNotesBlock()
+    public void BuildContextBlock_WithNotes_IncludesNotesBlock()
     {
-        var prompt = RagAskPrompt.BuildUserPrompt(
-            "Q?", [Chunk(1, "x")], ["my highlight", "another note"]);
+        var block = RagAskPrompt.BuildContextBlock([Chunk(1, "x")], ["my highlight", "another note"]);
 
-        Assert.Contains("Your notes:", prompt);
-        Assert.Contains("- my highlight", prompt);
-        Assert.Contains("- another note", prompt);
+        Assert.Contains("reader's own notes", block);
+        Assert.Contains("- my highlight", block);
+        Assert.Contains("- another note", block);
+    }
+
+    [Fact]
+    public void BuildSystemPrompt_Companion_GreetingBranchHasNoForcedCitation()
+    {
+        // Structural: the warm-companion prompt steers greeting/meta to NOT cite and NOT refuse, while
+        // still REQUIRING [n] citations for any book fact (grounding contract preserved).
+        var system = RagAskPrompt.BuildSystemPrompt();
+
+        // Greeting/meta branch: warm + invite, explicitly no citation, no "excerpts lack the answer".
+        Assert.Contains("greet", system, StringComparison.OrdinalIgnoreCase);
+        Assert.Contains("do NOT cite", system);
+
+        // Grounding contract still present: book facts must come from excerpts and cite [n].
+        Assert.Contains("[n]", system);
+        Assert.Contains("MUST", system);
+        Assert.Contains("outside knowledge", system);
+    }
+
+    [Fact]
+    public void BuildSystemPrompt_Grounding_DominatesAndCoversInterpretation()
+    {
+        // The #1 hallucination gate: the companion may be conversational, but grounding must dominate
+        // and must cover NOT just literal plot facts but theme / meaning / interpretation — otherwise the
+        // model "explains the symbolism" from world knowledge with no citation. Lock that wording in.
+        var system = RagAskPrompt.BuildSystemPrompt();
+
+        Assert.Contains("ONLY source", system, StringComparison.OrdinalIgnoreCase);
+        Assert.Contains("interpret", system, StringComparison.OrdinalIgnoreCase);
+        Assert.Contains("theme", system, StringComparison.OrdinalIgnoreCase);
+        // Grounding is declared as overriding the conversational tone, not co-equal with it.
+        Assert.Contains("overrides", system, StringComparison.OrdinalIgnoreCase);
+    }
+
+    [Fact]
+    public void BuildSystemPrompt_GreetingExceptionIsNarrow_NotABlanketNoCiteLicense()
+    {
+        // The no-cite exception must be scoped to greeting / "what can you do" ONLY — never a blanket
+        // "you may answer without excerpts". If this assertion ever fails because the exception widened,
+        // the spoiler/hallucination gate has a hole.
+        var system = RagAskPrompt.BuildSystemPrompt();
+
+        Assert.Contains("ONLY messages you may answer without citing", system, StringComparison.OrdinalIgnoreCase);
+        Assert.DoesNotContain("you may answer without excerpts", system, StringComparison.OrdinalIgnoreCase);
+    }
+
+    [Fact]
+    public void BuildSystemPrompt_Companion_GracefulMissForContentQuestions()
+    {
+        // Genuine content question with no relevant excerpt → graceful "I don't see that", never invent.
+        var system = RagAskPrompt.BuildSystemPrompt();
+
+        Assert.Contains("don't see", system, StringComparison.OrdinalIgnoreCase);
+        Assert.Contains("invent", system, StringComparison.OrdinalIgnoreCase);
     }
 
     [Fact]
diff --git a/tests/TextStack.UnitTests/RagAskServiceTests.cs b/tests/TextStack.UnitTests/RagAskServiceTests.cs
new file mode 100644
index 00000000..fc702d1c
--- /dev/null
+++ b/tests/TextStack.UnitTests/RagAskServiceTests.cs
@@ -0,0 +1,178 @@
+using System.Runtime.CompilerServices;
+using Application.Rag;
+using Contracts.Books;
+using TextStack.Ai.Core;
+using TextStack.Ai.Rag;
+
+namespace TextStack.UnitTests;
+
+/// <summary>
+/// AI-028 — service-level guards on <see cref="RagAskService"/> driven by a deterministic, CALL-COUNTING
+/// fake <see cref="ILlmService"/> (no key, no network). The #1 invariant: when retrieval returns 0 chunks
+/// (un-read / beyond progress / unindexed), BOTH the JSON (<c>AskFromChunksAsync</c>) and streaming
+/// (<c>StreamAsync</c>) paths short-circuit to the friendly insufficient message WITHOUT calling the LLM —
+/// a single model call on empty chunks would be a spoiler/hallucination leak. Also: the non-empty path
+/// calls the model exactly once, threads the clamped multi-turn history into the request, and resolves
+/// citations from the ACCUMULATED streamed text (not per-delta).
+/// </summary>
+public class RagAskServiceTests
+{
+    private static RetrievedChunk Chunk(int ord, string text) =>
+        new(Guid.NewGuid(), Guid.NewGuid(), ord, 0, text, 0, text.Length, 0.9);
+
+    /// <summary>Counts CompleteAsync/StreamAsync calls and captures the last request; emits scripted output.</summary>
+    private sealed class CountingLlm(string completeText, string[] streamFragments) : ILlmService
+    {
+        public int CompleteCalls { get; private set; }
+        public int StreamCalls { get; private set; }
+        public LlmRequest? LastRequest { get; private set; }
+
+        public Task<LlmResponse> CompleteAsync(LlmRequest request, CancellationToken ct)
+        {
+            CompleteCalls++;
+            LastRequest = request;
+            return Task.FromResult(new LlmResponse(
+                completeText, [], new LlmUsage(0, 0, 0m), "fake-model", Guid.NewGuid()));
+        }
+
+        public async IAsyncEnumerable<LlmDelta> StreamAsync(
+            LlmRequest request, [EnumeratorCancellation] CancellationToken ct)
+        {
+            StreamCalls++;
+            LastRequest = request;
+            foreach (var f in streamFragments)
+            {
+                await Task.Yield();
+                yield return new LlmDelta(TextDelta: f);
+            }
+        }
+    }
+
+    /// <summary>An ILlmService that throws if EITHER method is ever invoked — proves no model call happened.</summary>
+    private sealed class ThrowingLlm : ILlmService
+    {
+        public Task<LlmResponse> CompleteAsync(LlmRequest request, CancellationToken ct) =>
+            throw new InvalidOperationException("LLM must NOT be called on empty chunks");
+
+        public IAsyncEnumerable<LlmDelta> StreamAsync(LlmRequest request, CancellationToken ct) =>
+            throw new InvalidOperationException("LLM must NOT be called on empty chunks");
+    }
+
+    // RagContextService is only used by AskAsync (the progress-gated entry). These guard tests target
+    // AskFromChunksAsync / StreamAsync directly, so context is never touched — pass null safely.
+    private static RagAskService Service(ILlmService llm) => new(context: null!, llm);
+
+    private static readonly CancellationToken Ct = TestContext.Current.CancellationToken;
+
+    // ---- Empty-chunk short-circuit (spoiler / hallucination gate) ------------------------------------
+
+    [Fact]
+    public async Task AskFromChunksAsync_EmptyChunks_DoesNotCallLlm_ReturnsInsufficient()
+    {
+        var svc = Service(new ThrowingLlm()); // throws if the model is touched
+
+        var answer = await svc.AskFromChunksAsync(
+            "what happens at the end?", chunks: [], noteTexts: [], history: [], lastReadOrd: 3, Ct);
+
+        Assert.True(answer.Insufficient);
+        Assert.Empty(answer.Citations);
+        Assert.Equal(3, answer.LastReadOrd);
+        Assert.Contains("don't see enough", answer.Answer);
+    }
+
+    [Fact]
+    public async Task StreamAsync_EmptyChunks_DoesNotCallLlm_FriendlyDeltaThenInsufficientTerminal()
+    {
+        var svc = Service(new ThrowingLlm()); // throws if the model is touched
+
+        var items = new List<AskStreamItem>();
+        await foreach (var i in svc.StreamAsync(
+            "what happens at the end?", chunks: [], noteTexts: [], history: [], lastReadOrd: 0, Ct))
+            items.Add(i);
+
+        Assert.Equal(2, items.Count);
+        Assert.False(string.IsNullOrEmpty(items[0].TextDelta));
+        Assert.NotNull(items[1].Terminal);
+        Assert.True(items[1].Terminal!.Insufficient);
+        Assert.Empty(items[1].Terminal!.Citations);
+    }
+
+    // ---- Non-empty path: exactly one model call, history threaded, citations resolved ---------------
+
+    [Fact]
+    public async Task AskFromChunksAsync_WithChunks_CallsLlmOnce_ResolvesCitations()
+    {
+        var llm = new CountingLlm("The hero leaves home [1] and meets a guide [2].", []);
+        var svc = Service(llm);
+
+        var answer = await svc.AskFromChunksAsync(
+            "what happens?", [Chunk(1, "left home"), Chunk(2, "the guide")], noteTexts: [],
+            history: [], lastReadOrd: 5, Ct);
+
+        Assert.Equal(1, llm.CompleteCalls);
+        Assert.False(answer.Insufficient);
+        Assert.Equal(new[] { 1, 2 }, answer.Citations.Select(c => c.Marker));
+    }
+
+    [Fact]
+    public async Task AskFromChunksAsync_ThreadsClampedHistoryIntoRequest()
+    {
+        var llm = new CountingLlm("ok", []);
+        var svc = Service(llm);
+
+        var history = new AskTurnDto[]
+        {
+            new("user", "who is she?"),
+            new("assistant", "She is introduced early [1]."),
+        };
+
+        await svc.AskFromChunksAsync("and then?", [Chunk(1, "excerpt")], [], history, lastReadOrd: 0, Ct);
+
+        // Assembly order: context block (user) → prior turns → new question last. System prompt separate.
+        var msgs = llm.LastRequest!.Messages;
+        Assert.Equal("user", msgs[0].Role);                       // context block
+        Assert.Equal("who is she?", msgs[1].Content);             // prior user turn
+        Assert.Equal("She is introduced early [1].", msgs[2].Content); // prior assistant turn
+        Assert.Equal("and then?", msgs[^1].Content);              // new question last
+        Assert.Equal(RagAskService.FeatureTag, llm.LastRequest!.FeatureTag);
+        Assert.False(string.IsNullOrWhiteSpace(llm.LastRequest!.SystemPrompt));
+    }
+
+    [Fact]
+    public async Task StreamAsync_WithChunks_CitationsFromAccumulatedText_NotPerDelta()
+    {
+        // The "[1]" marker is split across two deltas; citations must come from the FULL accumulated text.
+        var llm = new CountingLlm("", ["The hero leaves home [", "1] for adventure."]);
+        var svc = Service(llm);
+
+        var items = new List<AskStreamItem>();
+        await foreach (var i in svc.StreamAsync(
+            "what happens?", [Chunk(1, "left home")], [], history: [], lastReadOrd: 7, Ct))
+            items.Add(i);
+
+        Assert.Equal(1, llm.StreamCalls);
+        var terminal = items[^1].Terminal;
+        Assert.NotNull(terminal);
+        Assert.False(terminal!.Insufficient);
+        Assert.Equal(7, terminal.LastReadOrd);
+        // Single citation resolved from the reassembled "[1]" that no single delta contained.
+        Assert.Equal(new[] { 1 }, terminal.Citations.Select(c => c.Marker));
+    }
+
+    [Fact]
+    public async Task StreamAsync_ProviderResolutionThrows_EmitsErrorTerminal_NoThrow()
+    {
+        // A keyless host throws when the gateway resolves the provider inside StreamAsync — the service
+        // must surface that as an Error item, not let the stream blow up.
+        var svc = Service(new ThrowingLlm());
+
+        var items = new List<AskStreamItem>();
+        await foreach (var i in svc.StreamAsync(
+            "q?", [Chunk(1, "x")], [], history: [], lastReadOrd: 0, Ct))
+            items.Add(i);
+
+        var last = Assert.Single(items);
+        Assert.False(string.IsNullOrEmpty(last.Error));
+        Assert.Null(last.Terminal);
+    }
+}