Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions docs/retrieval-pipeline.md
Original file line number Diff line number Diff line change
Expand Up @@ -245,16 +245,18 @@ When the quality message contains "less relevant" or "keyword search", an additi

### Token budgets

Token budgets are determined by `OpenAIStreamingService` based on the active model:
Token budgets are determined by `OpenAiRequestFactory` based on the provider and model:

| Model family | Token budget | Constant |
| Provider + Model | Token budget | Constant |
|---|---|---|
| GPT-5.x | 7,000 | `MAX_TOKENS_GPT5_INPUT` (`OpenAIStreamingService.java:54`) |
| All others | 100,000 | `MAX_TOKENS_DEFAULT_INPUT` (`OpenAIStreamingService.java:57`) |
| GitHub Models GPT-5 | 7,000 | `MAX_TOKENS_GITHUB_MODELS_GPT5_INPUT` (`OpenAiRequestFactory.java:41`) |
| All others | 100,000 | `MAX_TOKENS_DEFAULT_INPUT` (`OpenAiRequestFactory.java:44`) |

The 7,000-token cap applies only to GPT-5 served via GitHub Models, which has an 8K input tier. GPT-5 family models served directly by OpenAI or through the LLM gateway accept far larger inputs and use the 100,000-token default.

Token estimation uses a conservative `(text.length() / 4) + 1` approximation (~4 characters per token for English text).

For token-constrained models (GPT-5.x), RAG retrieval is also reduced upstream: max 3 documents (`RAG_LIMIT_CONSTRAINED`) with max 600 tokens each (`RAG_TOKEN_LIMIT_CONSTRAINED`), defined in `ModelConfiguration.java`.
For token-constrained models (GPT-5 via GitHub Models), RAG retrieval is also reduced upstream to at most 3 documents (`RAG_LIMIT_CONSTRAINED`) of at most 600 tokens each (`RAG_TOKEN_LIMIT_CONSTRAINED`); both constants are defined in `ModelConfiguration.java`.

---

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ public class PromptTruncator {

private static final Logger log = LoggerFactory.getLogger(PromptTruncator.class);

/** Truncation notice for GPT-5 family models with 8K input limit. */
/** Truncation notice for GPT-5 via GitHub Models (8K input tier). */
private static final String TRUNCATION_NOTICE_GPT5 = "[Context truncated due to GPT-5 8K input limit]\n\n";

/** Truncation notice for other models with larger limits. */
Expand All @@ -43,10 +43,10 @@ public class PromptTruncator {
*
* @param prompt the structured prompt to truncate
* @param maxTokens maximum allowed tokens
* @param isGpt5Family true if targeting GPT-5 family models (affects notice text)
* @param isGitHubModelsGpt5 true if targeting GPT-5 via GitHub Models (affects notice text)
* @return truncation result with the fitted prompt and truncation metadata
*/
public TruncatedPrompt truncate(StructuredPrompt prompt, int maxTokens, boolean isGpt5Family) {
public TruncatedPrompt truncate(StructuredPrompt prompt, int maxTokens, boolean isGitHubModelsGpt5) {
int reservedTokens =
prompt.system().estimatedTokens() + prompt.currentQuery().estimatedTokens();

Expand All @@ -59,7 +59,7 @@ public TruncatedPrompt truncate(StructuredPrompt prompt, int maxTokens, boolean
// Return prompt with only system and query - no room for context or history
StructuredPrompt minimalPrompt =
new StructuredPrompt(prompt.system(), List.of(), List.of(), prompt.currentQuery());
return new TruncatedPrompt(minimalPrompt, true, isGpt5Family);
return new TruncatedPrompt(minimalPrompt, true, isGitHubModelsGpt5);
}

int available = maxTokens - reservedTokens;
Expand Down Expand Up @@ -98,7 +98,7 @@ public TruncatedPrompt truncate(StructuredPrompt prompt, int maxTokens, boolean
maxTokens);
}

return new TruncatedPrompt(truncated, wasTruncated, isGpt5Family);
return new TruncatedPrompt(truncated, wasTruncated, isGitHubModelsGpt5);
}

/**
Expand Down Expand Up @@ -183,9 +183,9 @@ private int sumTokens(List<? extends com.williamcallahan.javachat.domain.prompt.
*
* @param prompt the truncated structured prompt
* @param wasTruncated true if any segments were removed
* @param isGpt5Family true if targeting GPT-5 family models
* @param isGitHubModelsGpt5 true if targeting GPT-5 via GitHub Models
*/
public record TruncatedPrompt(StructuredPrompt prompt, boolean wasTruncated, boolean isGpt5Family) {
public record TruncatedPrompt(StructuredPrompt prompt, boolean wasTruncated, boolean isGitHubModelsGpt5) {
/**
* Renders the prompt to a string, prepending truncation notice if needed.
*
Expand All @@ -195,7 +195,7 @@ public String render() {
if (!wasTruncated) {
return prompt.render();
}
String notice = isGpt5Family ? TRUNCATION_NOTICE_GPT5 : TRUNCATION_NOTICE_GENERIC;
String notice = isGitHubModelsGpt5 ? TRUNCATION_NOTICE_GPT5 : TRUNCATION_NOTICE_GENERIC;
return notice + prompt.render();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,13 @@ public final class ModelConfiguration {
/** Default model identifier when none is configured. */
public static final String DEFAULT_MODEL = "gpt-5.2";

/** Model family prefix for GPT-5.x models with token constraints. */
/** Model family prefix for GPT-5.x models. */
private static final String GPT5_FAMILY_PREFIX = "gpt-5";

/** Estimated characters per token for conservative token counting. */
public static final int ESTIMATED_CHARS_PER_TOKEN = 4;

/** RAG document limit for token-constrained models like GPT-5.2. */
/** RAG document limit for token-constrained models (GPT-5 via GitHub Models). */
public static final int RAG_LIMIT_CONSTRAINED = 3;

/** Max tokens per RAG document for token-constrained models. */
Expand All @@ -32,7 +32,9 @@ private ModelConfiguration() {
/**
* Determines if the given model is token-constrained (requires reduced RAG context).
*
* <p>Currently the GPT-5.x family has an 8K input token limit, requiring reduced RAG context.</p>
* <p>Currently only GPT-5.x family models served via GitHub Models have an 8K input
* token limit, requiring reduced RAG context. GPT-5 family models served via OpenAI
* direct or the LLM gateway accept far larger inputs.</p>
*
* @param modelHint optional model hint from request
* @return true if reduced RAG context should be used
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ public StructuredPrompt buildStructuredPromptWithContextAndGuidance(
public StructuredPromptOutcome buildStructuredPromptWithContextOutcome(
List<Message> history, String latestUserMessage, String modelHint) {

// Use reduced RAG for token-constrained models (GPT-5.x family)
// Use reduced RAG for token-constrained models (GPT-5 via GitHub Models)
RetrievalService.RetrievalOutcome retrievalOutcome;
if (ModelConfiguration.isTokenConstrained(modelHint)) {
retrievalOutcome = retrievalService.retrieveWithLimitOutcome(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ public class OpenAiRequestFactory {
/** Generous token budget for high-context models. */
private static final int MAX_TOKENS_DEFAULT_INPUT = 100_000;

/** Truncation notice for GPT-5 family models with 8K input limit. */
/** Truncation notice for GPT-5 via GitHub Models (8K input tier). */
private static final String TRUNCATION_NOTICE_GPT5 = "[Context truncated due to GPT-5 8K input limit]\n\n";

/** Truncation notice for other models with larger limits. */
Expand Down