WilliamAGH · detail-app · Jun 10, 2026
diff --git a/docs/retrieval-pipeline.md b/docs/retrieval-pipeline.md
@@ -245,16 +245,18 @@ When the quality message contains "less relevant" or "keyword search", an additi
 
 ### Token budgets
 
-Token budgets are determined by `OpenAIStreamingService` based on the active model:
+Token budgets are determined by `OpenAiRequestFactory` based on the provider and model:
 
-| Model family | Token budget | Constant |
+| Provider + Model | Token budget | Constant |
 |---|---|---|
-| GPT-5.x | 7,000 | `MAX_TOKENS_GPT5_INPUT` (`OpenAIStreamingService.java:54`) |
-| All others | 100,000 | `MAX_TOKENS_DEFAULT_INPUT` (`OpenAIStreamingService.java:57`) |
+| GitHub Models GPT-5 | 7,000 | `MAX_TOKENS_GITHUB_MODELS_GPT5_INPUT` (`OpenAiRequestFactory.java:41`) |
+| All others | 100,000 | `MAX_TOKENS_DEFAULT_INPUT` (`OpenAiRequestFactory.java:44`) |
+
+The 7,000-token cap applies only to GPT-5 served via GitHub Models, which has an 8K input tier. GPT-5 family models served directly by OpenAI or through the LLM gateway accept far larger inputs and use the 100,000-token default.
 
 Token estimation uses a conservative `(text.length() / 4) + 1` approximation (~4 characters per token for English text).
 
-For token-constrained models (GPT-5.x), RAG retrieval is also reduced upstream: max 3 documents (`RAG_LIMIT_CONSTRAINED`) with max 600 tokens each (`RAG_TOKEN_LIMIT_CONSTRAINED`), defined in `ModelConfiguration.java`.
+For token-constrained models (GPT-5 via GitHub Models), RAG retrieval is also reduced upstream: max 3 documents (`RAG_LIMIT_CONSTRAINED`) with max 600 tokens each (`RAG_TOKEN_LIMIT_CONSTRAINED`), defined in `ModelConfiguration.java`.
 
 ---
 

diff --git a/src/main/java/com/williamcallahan/javachat/application/prompt/PromptTruncator.java b/src/main/java/com/williamcallahan/javachat/application/prompt/PromptTruncator.java
@@ -28,7 +28,7 @@ public class PromptTruncator {
 
     private static final Logger log = LoggerFactory.getLogger(PromptTruncator.class);
 
-    /** Truncation notice for GPT-5 family models with 8K input limit. */
+    /** Truncation notice for GPT-5 via GitHub Models (8K input tier). */
     private static final String TRUNCATION_NOTICE_GPT5 = "[Context truncated due to GPT-5 8K input limit]\n\n";
 
     /** Truncation notice for other models with larger limits. */
@@ -43,10 +43,10 @@ public class PromptTruncator {
      *
      * @param prompt the structured prompt to truncate
      * @param maxTokens maximum allowed tokens
-     * @param isGpt5Family true if targeting GPT-5 family models (affects notice text)
+     * @param isGitHubModelsGpt5 true if targeting GPT-5 via GitHub Models (affects notice text)
      * @return truncation result with the fitted prompt and truncation metadata
      */
-    public TruncatedPrompt truncate(StructuredPrompt prompt, int maxTokens, boolean isGpt5Family) {
+    public TruncatedPrompt truncate(StructuredPrompt prompt, int maxTokens, boolean isGitHubModelsGpt5) {
         int reservedTokens =
                 prompt.system().estimatedTokens() + prompt.currentQuery().estimatedTokens();
 
@@ -59,7 +59,7 @@ public TruncatedPrompt truncate(StructuredPrompt prompt, int maxTokens, boolean
             // Return prompt with only system and query - no room for context or history
             StructuredPrompt minimalPrompt =
                     new StructuredPrompt(prompt.system(), List.of(), List.of(), prompt.currentQuery());
-            return new TruncatedPrompt(minimalPrompt, true, isGpt5Family);
+            return new TruncatedPrompt(minimalPrompt, true, isGitHubModelsGpt5);
         }
 
         int available = maxTokens - reservedTokens;
@@ -98,7 +98,7 @@ public TruncatedPrompt truncate(StructuredPrompt prompt, int maxTokens, boolean
                     maxTokens);
         }
 
-        return new TruncatedPrompt(truncated, wasTruncated, isGpt5Family);
+        return new TruncatedPrompt(truncated, wasTruncated, isGitHubModelsGpt5);
     }
 
     /**
@@ -183,9 +183,9 @@ private int sumTokens(List<? extends com.williamcallahan.javachat.domain.prompt.
      *
      * @param prompt the truncated structured prompt
      * @param wasTruncated true if any segments were removed
-     * @param isGpt5Family true if targeting GPT-5 family models
+     * @param isGitHubModelsGpt5 true if targeting GPT-5 via GitHub Models
      */
-    public record TruncatedPrompt(StructuredPrompt prompt, boolean wasTruncated, boolean isGpt5Family) {
+    public record TruncatedPrompt(StructuredPrompt prompt, boolean wasTruncated, boolean isGitHubModelsGpt5) {
         /**
          * Renders the prompt to a string, prepending truncation notice if needed.
          *
@@ -195,7 +195,7 @@ public String render() {
             if (!wasTruncated) {
                 return prompt.render();
             }
-            String notice = isGpt5Family ? TRUNCATION_NOTICE_GPT5 : TRUNCATION_NOTICE_GENERIC;
+            String notice = isGitHubModelsGpt5 ? TRUNCATION_NOTICE_GPT5 : TRUNCATION_NOTICE_GENERIC;
             return notice + prompt.render();
         }
 

diff --git a/src/main/java/com/williamcallahan/javachat/config/ModelConfiguration.java b/src/main/java/com/williamcallahan/javachat/config/ModelConfiguration.java
@@ -13,13 +13,13 @@ public final class ModelConfiguration {
     /** Default model identifier when none is configured. */
     public static final String DEFAULT_MODEL = "gpt-5.2";
 
-    /** Model family prefix for GPT-5.x models with token constraints. */
+    /** Model family prefix for GPT-5.x models. */
     private static final String GPT5_FAMILY_PREFIX = "gpt-5";
 
     /** Estimated characters per token for conservative token counting. */
     public static final int ESTIMATED_CHARS_PER_TOKEN = 4;
 
-    /** RAG document limit for token-constrained models like GPT-5.2. */
+    /** RAG document limit for token-constrained models (GPT-5 via GitHub Models). */
     public static final int RAG_LIMIT_CONSTRAINED = 3;
 
     /** Max tokens per RAG document for token-constrained models. */
@@ -32,7 +32,9 @@ private ModelConfiguration() {
     /**
      * Determines if the given model is token-constrained (requires reduced RAG context).
      *
-     * <p>Currently the GPT-5.x family has an 8K input token limit, requiring reduced RAG context.</p>
+     * <p>Currently, only GPT-5.x family models served via GitHub Models have an 8K input
+     * token limit, requiring reduced RAG context. GPT-5.x family models served directly by
+     * OpenAI or through the LLM gateway accept far larger inputs.</p>
      *
      * @param modelHint optional model hint from request
      * @return true if reduced RAG context should be used

diff --git a/src/main/java/com/williamcallahan/javachat/service/ChatService.java b/src/main/java/com/williamcallahan/javachat/service/ChatService.java
@@ -136,7 +136,7 @@ public StructuredPrompt buildStructuredPromptWithContextAndGuidance(
     public StructuredPromptOutcome buildStructuredPromptWithContextOutcome(
             List<Message> history, String latestUserMessage, String modelHint) {
 
-        // Use reduced RAG for token-constrained models (GPT-5.x family)
+        // Use reduced RAG for token-constrained models (GPT-5 via GitHub Models)
         RetrievalService.RetrievalOutcome retrievalOutcome;
         if (ModelConfiguration.isTokenConstrained(modelHint)) {
             retrievalOutcome = retrievalService.retrieveWithLimitOutcome(

diff --git a/src/main/java/com/williamcallahan/javachat/service/OpenAiRequestFactory.java b/src/main/java/com/williamcallahan/javachat/service/OpenAiRequestFactory.java
@@ -43,7 +43,7 @@ public class OpenAiRequestFactory {
     /** Generous token budget for high-context models. */
     private static final int MAX_TOKENS_DEFAULT_INPUT = 100_000;
 
-    /** Truncation notice for GPT-5 family models with 8K input limit. */
+    /** Truncation notice for GPT-5 via GitHub Models (8K input tier). */
     private static final String TRUNCATION_NOTICE_GPT5 = "[Context truncated due to GPT-5 8K input limit]\n\n";
 
     /** Truncation notice for other models with larger limits. */