From 8842d917e9b7c8160eb5b6db2dbffaa6a1539e69 Mon Sep 17 00:00:00 2001 From: "detail-app[bot]" <180357370+detail-app[bot]@users.noreply.github.com> Date: Wed, 10 Jun 2026 16:24:48 +0000 Subject: [PATCH] docs: correct GPT-5 token limit scope to GitHub Models only --- docs/retrieval-pipeline.md | 12 +++++++----- .../application/prompt/PromptTruncator.java | 16 ++++++++-------- .../javachat/config/ModelConfiguration.java | 8 +++++--- .../javachat/service/ChatService.java | 2 +- .../javachat/service/OpenAiRequestFactory.java | 2 +- 5 files changed, 22 insertions(+), 18 deletions(-) diff --git a/docs/retrieval-pipeline.md b/docs/retrieval-pipeline.md index cccd94ac..dc771a53 100644 --- a/docs/retrieval-pipeline.md +++ b/docs/retrieval-pipeline.md @@ -245,16 +245,18 @@ When the quality message contains "less relevant" or "keyword search", an additi ### Token budgets -Token budgets are determined by `OpenAIStreamingService` based on the active model: +Token budgets are determined by `OpenAiRequestFactory` based on the provider and model: -| Model family | Token budget | Constant | +| Provider + Model | Token budget | Constant | |---|---|---| -| GPT-5.x | 7,000 | `MAX_TOKENS_GPT5_INPUT` (`OpenAIStreamingService.java:54`) | -| All others | 100,000 | `MAX_TOKENS_DEFAULT_INPUT` (`OpenAIStreamingService.java:57`) | +| GitHub Models GPT-5 | 7,000 | `MAX_TOKENS_GITHUB_MODELS_GPT5_INPUT` (`OpenAiRequestFactory.java:41`) | +| All others | 100,000 | `MAX_TOKENS_DEFAULT_INPUT` (`OpenAiRequestFactory.java:44`) | + +The 7,000-token cap applies only to GPT-5 served via GitHub Models (which has an 8K input tier). GPT-5 family models served via OpenAI direct or the LLM gateway accept far larger inputs and use the 100K default. Token estimation uses a conservative `(text.length() / 4) + 1` approximation (~4 characters per token for English text). -For token-constrained models (GPT-5.x), RAG retrieval is also reduced upstream: max 3 documents (`RAG_LIMIT_CONSTRAINED`) with max 600 tokens each (`RAG_TOKEN_LIMIT_CONSTRAINED`), defined in `ModelConfiguration.java`. +For token-constrained models (GPT-5 via GitHub Models), RAG retrieval is also reduced upstream: max 3 documents (`RAG_LIMIT_CONSTRAINED`) with max 600 tokens each (`RAG_TOKEN_LIMIT_CONSTRAINED`), defined in `ModelConfiguration.java`. --- diff --git a/src/main/java/com/williamcallahan/javachat/application/prompt/PromptTruncator.java b/src/main/java/com/williamcallahan/javachat/application/prompt/PromptTruncator.java index 186fe6c8..91711d02 100644 --- a/src/main/java/com/williamcallahan/javachat/application/prompt/PromptTruncator.java +++ b/src/main/java/com/williamcallahan/javachat/application/prompt/PromptTruncator.java @@ -28,7 +28,7 @@ public class PromptTruncator { private static final Logger log = LoggerFactory.getLogger(PromptTruncator.class); - /** Truncation notice for GPT-5 family models with 8K input limit. */ + /** Truncation notice for GPT-5 via GitHub Models (8K input tier). */ private static final String TRUNCATION_NOTICE_GPT5 = "[Context truncated due to GPT-5 8K input limit]\n\n"; /** Truncation notice for other models with larger limits. */ @@ -43,10 +43,10 @@ public class PromptTruncator { * * @param prompt the structured prompt to truncate * @param maxTokens maximum allowed tokens - * @param isGpt5Family true if targeting GPT-5 family models (affects notice text) + * @param isGitHubModelsGpt5 true if targeting GPT-5 via GitHub Models (affects notice text) * @return truncation result with the fitted prompt and truncation metadata */ - public TruncatedPrompt truncate(StructuredPrompt prompt, int maxTokens, boolean isGpt5Family) { + public TruncatedPrompt truncate(StructuredPrompt prompt, int maxTokens, boolean isGitHubModelsGpt5) { int reservedTokens = prompt.system().estimatedTokens() + prompt.currentQuery().estimatedTokens(); @@ -59,7 +59,7 @@ public TruncatedPrompt truncate(StructuredPrompt prompt, int maxTokens, boolean // Return prompt with only system and query - no room for context or history StructuredPrompt minimalPrompt = new StructuredPrompt(prompt.system(), List.of(), List.of(), prompt.currentQuery()); - return new TruncatedPrompt(minimalPrompt, true, isGpt5Family); + return new TruncatedPrompt(minimalPrompt, true, isGitHubModelsGpt5); } int available = maxTokens - reservedTokens; @@ -98,7 +98,7 @@ public TruncatedPrompt truncate(StructuredPrompt prompt, int maxTokens, boolean maxTokens); } - return new TruncatedPrompt(truncated, wasTruncated, isGpt5Family); + return new TruncatedPrompt(truncated, wasTruncated, isGitHubModelsGpt5); } /** @@ -183,9 +183,9 @@ private int sumTokens(List extends com.williamcallahan.javachat.domain.prompt. * * @param prompt the truncated structured prompt * @param wasTruncated true if any segments were removed - * @param isGpt5Family true if targeting GPT-5 family models + * @param isGitHubModelsGpt5 true if targeting GPT-5 via GitHub Models */ - public record TruncatedPrompt(StructuredPrompt prompt, boolean wasTruncated, boolean isGpt5Family) { + public record TruncatedPrompt(StructuredPrompt prompt, boolean wasTruncated, boolean isGitHubModelsGpt5) { /** * Renders the prompt to a string, prepending truncation notice if needed. * @@ -195,7 +195,7 @@ public String render() { if (!wasTruncated) { return prompt.render(); } - String notice = isGpt5Family ? TRUNCATION_NOTICE_GPT5 : TRUNCATION_NOTICE_GENERIC; + String notice = isGitHubModelsGpt5 ? TRUNCATION_NOTICE_GPT5 : TRUNCATION_NOTICE_GENERIC; return notice + prompt.render(); } diff --git a/src/main/java/com/williamcallahan/javachat/config/ModelConfiguration.java b/src/main/java/com/williamcallahan/javachat/config/ModelConfiguration.java index 40c420d0..c1e386af 100644 --- a/src/main/java/com/williamcallahan/javachat/config/ModelConfiguration.java +++ b/src/main/java/com/williamcallahan/javachat/config/ModelConfiguration.java @@ -13,13 +13,13 @@ public final class ModelConfiguration { /** Default model identifier when none is configured. */ public static final String DEFAULT_MODEL = "gpt-5.2"; - /** Model family prefix for GPT-5.x models with token constraints. */ + /** Model family prefix for GPT-5.x models. */ private static final String GPT5_FAMILY_PREFIX = "gpt-5"; /** Estimated characters per token for conservative token counting. */ public static final int ESTIMATED_CHARS_PER_TOKEN = 4; - /** RAG document limit for token-constrained models like GPT-5.2. */ + /** RAG document limit for token-constrained models (GPT-5 via GitHub Models). */ public static final int RAG_LIMIT_CONSTRAINED = 3; /** Max tokens per RAG document for token-constrained models. */ @@ -32,7 +32,9 @@ private ModelConfiguration() { /** * Determines if the given model is token-constrained (requires reduced RAG context). * - *
Currently the GPT-5.x family has an 8K input token limit, requiring reduced RAG context.
+ *Currently only GPT-5.x family models served via GitHub Models have an 8K input + * token limit, requiring reduced RAG context. GPT-5 family models served via OpenAI + * direct or the LLM gateway accept far larger inputs.
* * @param modelHint optional model hint from request * @return true if reduced RAG context should be used diff --git a/src/main/java/com/williamcallahan/javachat/service/ChatService.java b/src/main/java/com/williamcallahan/javachat/service/ChatService.java index 317c06d1..55009623 100644 --- a/src/main/java/com/williamcallahan/javachat/service/ChatService.java +++ b/src/main/java/com/williamcallahan/javachat/service/ChatService.java @@ -136,7 +136,7 @@ public StructuredPrompt buildStructuredPromptWithContextAndGuidance( public StructuredPromptOutcome buildStructuredPromptWithContextOutcome( List