diff --git a/docs/retrieval-pipeline.md b/docs/retrieval-pipeline.md index cccd94ac..dc771a53 100644 --- a/docs/retrieval-pipeline.md +++ b/docs/retrieval-pipeline.md @@ -245,16 +245,18 @@ When the quality message contains "less relevant" or "keyword search", an additi ### Token budgets -Token budgets are determined by `OpenAIStreamingService` based on the active model: +Token budgets are determined by `OpenAiRequestFactory` based on the provider and model: -| Model family | Token budget | Constant | +| Provider + Model | Token budget | Constant | |---|---|---| -| GPT-5.x | 7,000 | `MAX_TOKENS_GPT5_INPUT` (`OpenAIStreamingService.java:54`) | -| All others | 100,000 | `MAX_TOKENS_DEFAULT_INPUT` (`OpenAIStreamingService.java:57`) | +| GitHub Models GPT-5 | 7,000 | `MAX_TOKENS_GITHUB_MODELS_GPT5_INPUT` (`OpenAiRequestFactory.java:41`) | +| All others | 100,000 | `MAX_TOKENS_DEFAULT_INPUT` (`OpenAiRequestFactory.java:44`) | + +The 7,000-token cap applies only to GPT-5 served via GitHub Models (which has an 8K input tier). GPT-5 family models served via OpenAI direct or the LLM gateway accept far larger inputs and use the 100K default. Token estimation uses a conservative `(text.length() / 4) + 1` approximation (~4 characters per token for English text). -For token-constrained models (GPT-5.x), RAG retrieval is also reduced upstream: max 3 documents (`RAG_LIMIT_CONSTRAINED`) with max 600 tokens each (`RAG_TOKEN_LIMIT_CONSTRAINED`), defined in `ModelConfiguration.java`. +For token-constrained models (GPT-5 via GitHub Models), RAG retrieval is also reduced upstream: max 3 documents (`RAG_LIMIT_CONSTRAINED`) with max 600 tokens each (`RAG_TOKEN_LIMIT_CONSTRAINED`), defined in `ModelConfiguration.java`. 
--- diff --git a/src/main/java/com/williamcallahan/javachat/application/prompt/PromptTruncator.java b/src/main/java/com/williamcallahan/javachat/application/prompt/PromptTruncator.java index 186fe6c8..91711d02 100644 --- a/src/main/java/com/williamcallahan/javachat/application/prompt/PromptTruncator.java +++ b/src/main/java/com/williamcallahan/javachat/application/prompt/PromptTruncator.java @@ -28,7 +28,7 @@ public class PromptTruncator { private static final Logger log = LoggerFactory.getLogger(PromptTruncator.class); - /** Truncation notice for GPT-5 family models with 8K input limit. */ + /** Truncation notice for GPT-5 via GitHub Models (8K input tier). */ private static final String TRUNCATION_NOTICE_GPT5 = "[Context truncated due to GPT-5 8K input limit]\n\n"; /** Truncation notice for other models with larger limits. */ @@ -43,10 +43,10 @@ public class PromptTruncator { * * @param prompt the structured prompt to truncate * @param maxTokens maximum allowed tokens - * @param isGpt5Family true if targeting GPT-5 family models (affects notice text) + * @param isGitHubModelsGpt5 true if targeting GPT-5 via GitHub Models (affects notice text) * @return truncation result with the fitted prompt and truncation metadata */ - public TruncatedPrompt truncate(StructuredPrompt prompt, int maxTokens, boolean isGpt5Family) { + public TruncatedPrompt truncate(StructuredPrompt prompt, int maxTokens, boolean isGitHubModelsGpt5) { int reservedTokens = prompt.system().estimatedTokens() + prompt.currentQuery().estimatedTokens(); @@ -59,7 +59,7 @@ public TruncatedPrompt truncate(StructuredPrompt prompt, int maxTokens, boolean // Return prompt with only system and query - no room for context or history StructuredPrompt minimalPrompt = new StructuredPrompt(prompt.system(), List.of(), List.of(), prompt.currentQuery()); - return new TruncatedPrompt(minimalPrompt, true, isGpt5Family); + return new TruncatedPrompt(minimalPrompt, true, isGitHubModelsGpt5); } int available = maxTokens - 
reservedTokens; @@ -98,7 +98,7 @@ public TruncatedPrompt truncate(StructuredPrompt prompt, int maxTokens, boolean maxTokens); } - return new TruncatedPrompt(truncated, wasTruncated, isGpt5Family); + return new TruncatedPrompt(truncated, wasTruncated, isGitHubModelsGpt5); } /** @@ -183,9 +183,9 @@ private int sumTokens(ListCurrently the GPT-5.x family has an 8K input token limit, requiring reduced RAG context.

+ * <p>Currently only GPT-5.x family models served via GitHub Models have an 8K input + * token limit, requiring reduced RAG context. GPT-5 family models served via OpenAI + * direct or the LLM gateway accept far larger inputs.</p>

* * @param modelHint optional model hint from request * @return true if reduced RAG context should be used diff --git a/src/main/java/com/williamcallahan/javachat/service/ChatService.java b/src/main/java/com/williamcallahan/javachat/service/ChatService.java index 317c06d1..55009623 100644 --- a/src/main/java/com/williamcallahan/javachat/service/ChatService.java +++ b/src/main/java/com/williamcallahan/javachat/service/ChatService.java @@ -136,7 +136,7 @@ public StructuredPrompt buildStructuredPromptWithContextAndGuidance( public StructuredPromptOutcome buildStructuredPromptWithContextOutcome( List history, String latestUserMessage, String modelHint) { - // Use reduced RAG for token-constrained models (GPT-5.x family) + // Use reduced RAG for token-constrained models (GPT-5 via GitHub Models) RetrievalService.RetrievalOutcome retrievalOutcome; if (ModelConfiguration.isTokenConstrained(modelHint)) { retrievalOutcome = retrievalService.retrieveWithLimitOutcome( diff --git a/src/main/java/com/williamcallahan/javachat/service/OpenAiRequestFactory.java b/src/main/java/com/williamcallahan/javachat/service/OpenAiRequestFactory.java index a9665fda..cb6f9998 100644 --- a/src/main/java/com/williamcallahan/javachat/service/OpenAiRequestFactory.java +++ b/src/main/java/com/williamcallahan/javachat/service/OpenAiRequestFactory.java @@ -43,7 +43,7 @@ public class OpenAiRequestFactory { /** Generous token budget for high-context models. */ private static final int MAX_TOKENS_DEFAULT_INPUT = 100_000; - /** Truncation notice for GPT-5 family models with 8K input limit. */ + /** Truncation notice for GPT-5 via GitHub Models (8K input tier). */ private static final String TRUNCATION_NOTICE_GPT5 = "[Context truncated due to GPT-5 8K input limit]\n\n"; /** Truncation notice for other models with larger limits. */