From 57e2d7ad4c7d3b483fb82c14546955c02f73bc94 Mon Sep 17 00:00:00 2001
From: LearningCircuit <185559241+LearningCircuit@users.noreply.github.com>
Date: Sun, 16 Nov 2025 13:00:36 +0100
Subject: [PATCH 1/5] fix: increase MAX_DIFF_SIZE default from 800KB to 5MB

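Allows larger PRs to be reviewed without manual configuration.
The 800KB limit was too restrictive for many real-world PRs.
Note: the old default was sized at ~200K tokens to match the model
context size, so a diff well above 800KB may still exceed the model's
context window even though it is now under MAX_DIFF_SIZE.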
---
 ai-reviewer.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ai-reviewer.sh b/ai-reviewer.sh
index f932cb4..a32202a 100644
--- a/ai-reviewer.sh
+++ b/ai-reviewer.sh
@@ -29,7 +29,7 @@ fi
 AI_MODEL="${AI_MODEL:-moonshotai/kimi-k2-thinking}"
 AI_TEMPERATURE="${AI_TEMPERATURE:-0.1}"
 AI_MAX_TOKENS="${AI_MAX_TOKENS:-64000}"
-MAX_DIFF_SIZE="${MAX_DIFF_SIZE:-800000}"  # 800KB default limit (~200K tokens, matching model context size)
+MAX_DIFF_SIZE="${MAX_DIFF_SIZE:-5000000}"  # 5MB default limit (allows large PRs while preventing excessive API usage)
 EXCLUDE_FILE_PATTERNS="${EXCLUDE_FILE_PATTERNS:-*.lock,*.min.js,*.min.css,package-lock.json,yarn.lock}"
 
 # Read diff content from stdin

From 6461e8d0cb3acdafb801dd690f51b144c01ac576 Mon Sep 17 00:00:00 2001
From: LearningCircuit <185559241+LearningCircuit@users.noreply.github.com>
Date: Sun, 16 Nov 2025 13:12:30 +0100
Subject: [PATCH 2/5] feat: add environment variables to control context
 inclusion

Adds the following environment variables to reduce token usage:
- INCLUDE_PREVIOUS_REVIEWS (default: true)
- INCLUDE_HUMAN_COMMENTS (default: true)
- INCLUDE_CHECK_RUNS (default: true)
- INCLUDE_LABELS (default: true)
- INCLUDE_PR_DESCRIPTION (default: true)
- INCLUDE_COMMIT_MESSAGES (default: true)

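Set any of them to 'false' to exclude that context and reduce the token
count. The script includes a context only when its variable is exactly
'true', so any other value also disables it.
This helps handle large PRs that would otherwise exceed model context limits.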
---
 ai-reviewer.sh | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/ai-reviewer.sh b/ai-reviewer.sh
index a32202a..92b2e8c 100644
--- a/ai-reviewer.sh
+++ b/ai-reviewer.sh
@@ -32,6 +32,14 @@ AI_MAX_TOKENS="${AI_MAX_TOKENS:-64000}"
 MAX_DIFF_SIZE="${MAX_DIFF_SIZE:-5000000}"  # 5MB default limit (allows large PRs while preventing excessive API usage)
 EXCLUDE_FILE_PATTERNS="${EXCLUDE_FILE_PATTERNS:-*.lock,*.min.js,*.min.css,package-lock.json,yarn.lock}"
 
+# Context inclusion options (set to 'false' to disable, reduces token usage)
+INCLUDE_PREVIOUS_REVIEWS="${INCLUDE_PREVIOUS_REVIEWS:-true}"
+INCLUDE_HUMAN_COMMENTS="${INCLUDE_HUMAN_COMMENTS:-true}"
+INCLUDE_CHECK_RUNS="${INCLUDE_CHECK_RUNS:-true}"
+INCLUDE_LABELS="${INCLUDE_LABELS:-true}"
+INCLUDE_PR_DESCRIPTION="${INCLUDE_PR_DESCRIPTION:-true}"
+INCLUDE_COMMIT_MESSAGES="${INCLUDE_COMMIT_MESSAGES:-true}"
+
 # Read diff content from stdin
 DIFF_CONTENT=$(cat)
 
@@ -64,7 +72,7 @@ fi
 
 # Fetch previous AI review (only the most recent one) for context
 PREVIOUS_REVIEWS=""
-if [ -n "$PR_NUMBER" ] && [ -n "$REPO_FULL_NAME" ] && [ -n "$GITHUB_TOKEN" ]; then
+if [ "$INCLUDE_PREVIOUS_REVIEWS" = "true" ] && [ -n "$PR_NUMBER" ] && [ -n "$REPO_FULL_NAME" ] && [ -n "$GITHUB_TOKEN" ]; then
     # Fetch only the most recent AI review comment
     PREVIOUS_REVIEWS=$(gh api "repos/$REPO_FULL_NAME/issues/$PR_NUMBER/comments" \
         --jq '[.[] | select(.body | startswith("## AI Code Review"))] | last | if . then "### Previous AI Review (" + .created_at + "):\n" + .body + "\n---\n" else "" end' 2>/dev/null | head -c 10000 || echo "")
@@ -72,7 +80,7 @@ fi
 
 # Fetch human comments for context
 HUMAN_COMMENTS=""
-if [ -n "$PR_NUMBER" ] && [ -n "$REPO_FULL_NAME" ] && [ -n "$GITHUB_TOKEN" ]; then
+if [ "$INCLUDE_HUMAN_COMMENTS" = "true" ] && [ -n "$PR_NUMBER" ] && [ -n "$REPO_FULL_NAME" ] && [ -n "$GITHUB_TOKEN" ]; then
     # Fetch comments from humans (not the bot)
     HUMAN_COMMENTS=$(gh api "repos/$REPO_FULL_NAME/issues/$PR_NUMBER/comments" \
         --jq '[.[] | select(.body | startswith("## AI Code Review") | not)] | map("**" + .user.login + "** (" + .created_at + "):\n" + .body) | join("\n\n---\n\n")' 2>/dev/null | head -c 20000 || echo "")
@@ -80,7 +88,7 @@ fi
 
 # Fetch GitHub Actions check runs status (if PR_NUMBER and REPO_FULL_NAME are set)
 CHECK_RUNS_STATUS=""
-if [ -n "$PR_NUMBER" ] && [ -n "$REPO_FULL_NAME" ] && [ -n "$GITHUB_TOKEN" ]; then
+if [ "$INCLUDE_CHECK_RUNS" = "true" ] && [ -n "$PR_NUMBER" ] && [ -n "$REPO_FULL_NAME" ] && [ -n "$GITHUB_TOKEN" ]; then
     # Get the head SHA of the PR
     HEAD_SHA=$(gh api "repos/$REPO_FULL_NAME/pulls/$PR_NUMBER" --jq '.head.sha' 2>/dev/null || echo "")
 
@@ -93,7 +101,7 @@ fi
 
 # Fetch available repository labels (if PR_NUMBER and REPO_FULL_NAME are set)
 AVAILABLE_LABELS=""
-if [ -n "$PR_NUMBER" ] && [ -n "$REPO_FULL_NAME" ] && [ -n "$GITHUB_TOKEN" ]; then
+if [ "$INCLUDE_LABELS" = "true" ] && [ -n "$PR_NUMBER" ] && [ -n "$REPO_FULL_NAME" ] && [ -n "$GITHUB_TOKEN" ]; then
     # Fetch all labels from the repository
     if [ "$DEBUG_MODE" = "true" ]; then
         echo "🔍 Fetching available labels from repository..." >&2
@@ -113,7 +121,7 @@ fi
 
 # Fetch PR title and description
 PR_DESCRIPTION=""
-if [ -n "$PR_NUMBER" ] && [ -n "$REPO_FULL_NAME" ] && [ -n "$GITHUB_TOKEN" ]; then
+if [ "$INCLUDE_PR_DESCRIPTION" = "true" ] && [ -n "$PR_NUMBER" ] && [ -n "$REPO_FULL_NAME" ] && [ -n "$GITHUB_TOKEN" ]; then
     if [ "$DEBUG_MODE" = "true" ]; then
         echo "🔍 Fetching PR title and description..." >&2
     fi
@@ -127,7 +135,7 @@ fi
 
 # Fetch commit messages (limit to 15 most recent, exclude merges)
 COMMIT_MESSAGES=""
-if [ -n "$PR_NUMBER" ] && [ -n "$REPO_FULL_NAME" ] && [ -n "$GITHUB_TOKEN" ]; then
+if [ "$INCLUDE_COMMIT_MESSAGES" = "true" ] && [ -n "$PR_NUMBER" ] && [ -n "$REPO_FULL_NAME" ] && [ -n "$GITHUB_TOKEN" ]; then
     if [ "$DEBUG_MODE" = "true" ]; then
         echo "🔍 Fetching commit messages..." >&2
     fi

From b79f0322dc42a0cec206d15f5ca7cde85e73399c Mon Sep 17 00:00:00 2001
From: LearningCircuit <185559241+LearningCircuit@users.noreply.github.com>
Date: Sun, 16 Nov 2025 13:22:53 +0100
Subject: [PATCH 3/5] fix: extract JSON from mixed debug output in workflow

When DEBUG_MODE is enabled, the workflow captured debug output (stderr)
and JSON output (stdout) together in a single variable. JSON parsing
then failed because jq was handed the entire mixed output.

Now the workflow extracts just the JSON object from the mixed output
before attempting to parse it.
---
 .github/workflows/ai-code-reviewer.yml | 29 ++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/ai-code-reviewer.yml b/.github/workflows/ai-code-reviewer.yml
index 44c5c54..0253f64 100644
--- a/.github/workflows/ai-code-reviewer.yml
+++ b/.github/workflows/ai-code-reviewer.yml
@@ -76,8 +76,29 @@ jobs:
             exit 1
           fi
 
+          # Extract only the JSON part (find the last valid JSON object in output)
+          # This handles debug output being mixed with the JSON response
+          # Try to find JSON by looking for lines starting with { and ending with }
+          AI_JSON=$(echo "$AI_RESPONSE" | awk '/^{$/,/^}$/{print}' | tail -n +1)
+
+          # If that didn't work, try extracting just the last line that looks like JSON
+          if [ -z "$AI_JSON" ] || ! echo "$AI_JSON" | jq . >/dev/null 2>&1; then
+            # Try to extract the last complete JSON object using perl
+            AI_JSON=$(echo "$AI_RESPONSE" | perl -0777 -ne 'print $1 if /(\{(?:[^{}]|(?R))*\})[^\{]*$/s' | tail -c 1000000)
+          fi
+
+          if [ -z "$AI_JSON" ]; then
+            echo "⚠️ Could not extract JSON from AI response."
+            if [ "$DEBUG_MODE" = "true" ]; then
+              echo "=== DEBUG: Full response (first 3000 chars) ==="
+              echo "$AI_RESPONSE" | head -c 3000
+              echo "=== END DEBUG ==="
+            fi
+            exit 1
+          fi
+
           # Parse JSON response
-          if ! echo "$AI_RESPONSE" | jq . >/dev/null 2>&1; then
+          if ! echo "$AI_JSON" | jq . >/dev/null 2>&1; then
             echo "⚠️ AI response is not valid JSON. Cannot process review."
 
             # Log raw response for debugging (redact sensitive info)
@@ -103,9 +124,9 @@ jobs:
           fi
 
           # Extract fields from JSON
-          REVIEW=$(echo "$AI_RESPONSE" | jq -r '.review // "No review provided"')
-          DECISION=$(echo "$AI_RESPONSE" | jq -r '.fail_pass_workflow // "uncertain"')
-          LABELS=$(echo "$AI_RESPONSE" | jq -r '.labels_added[]? // empty')
+          REVIEW=$(echo "$AI_JSON" | jq -r '.review // "No review provided"')
+          DECISION=$(echo "$AI_JSON" | jq -r '.fail_pass_workflow // "uncertain"')
+          LABELS=$(echo "$AI_JSON" | jq -r '.labels_added[]? // empty')
 
           # Post the review as PR comment
           echo "$REVIEW" | gh pr comment ${{ github.event.pull_request.number }} --repo ${{ github.repository }} -F -

From 01dae60d5a8a77cba0f546c486640114d8f94e5c Mon Sep 17 00:00:00 2001
From: LearningCircuit <185559241+LearningCircuit@users.noreply.github.com>
Date: Sun, 16 Nov 2025 13:45:42 +0100
Subject: [PATCH 4/5] fix: quote GITHUB_ENV and trim DECISION value

Fixes 'Unable to process file command env' error by:
1. Adding quotes around $GITHUB_ENV variable
2. Trimming whitespace/newlines from DECISION value before setting it
---
 .github/workflows/ai-code-reviewer.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ai-code-reviewer.yml b/.github/workflows/ai-code-reviewer.yml
index 0253f64..5c44119 100644
--- a/.github/workflows/ai-code-reviewer.yml
+++ b/.github/workflows/ai-code-reviewer.yml
@@ -125,7 +125,7 @@ jobs:
 
           # Extract fields from JSON
           REVIEW=$(echo "$AI_JSON" | jq -r '.review // "No review provided"')
-          DECISION=$(echo "$AI_JSON" | jq -r '.fail_pass_workflow // "uncertain"')
+          DECISION=$(echo "$AI_JSON" | jq -r '.fail_pass_workflow // "uncertain"' | tr -d '\n\r' | xargs)
           LABELS=$(echo "$AI_JSON" | jq -r '.labels_added[]? // empty')
 
           # Post the review as PR comment
@@ -151,7 +151,7 @@ jobs:
           echo "AI decision: $DECISION"
 
           # Store the decision for later use
-          echo "DECISION=$DECISION" >> $GITHUB_ENV
+          echo "DECISION=$DECISION" >> "$GITHUB_ENV"
 
           # Remove the ai_code_review label to make re-triggering easier
           echo "Removing ai_code_review label to allow easy re-triggering"

From fab5597f77cf6ca8d2c575b9160a1a3ad5d3334c Mon Sep 17 00:00:00 2001
From: LearningCircuit <185559241+LearningCircuit@users.noreply.github.com>
Date: Sun, 16 Nov 2025 14:36:05 +0100
Subject: [PATCH 5/5] revert: remove complex JSON extraction logic

Reverts the fragile awk/perl JSON extraction that was causing failures.

Changes:
- Removed AI_JSON extraction logic
- Removed 2>&1 to prevent stderr mixing with JSON output
- Debug messages now go to workflow logs naturally
- Keeps DECISION trimming and GITHUB_ENV quoting fixes

The script writes JSON to stdout and debug messages to stderr, so no
extraction step is needed: capturing stdout alone yields the JSON.
---
 .github/workflows/ai-code-reviewer.yml | 31 +++++---------------------
 1 file changed, 5 insertions(+), 26 deletions(-)

diff --git a/.github/workflows/ai-code-reviewer.yml b/.github/workflows/ai-code-reviewer.yml
index 5c44119..4b1172f 100644
--- a/.github/workflows/ai-code-reviewer.yml
+++ b/.github/workflows/ai-code-reviewer.yml
@@ -49,7 +49,7 @@ jobs:
 
           # Temporarily disable errexit to capture errors properly
           set +e
-          AI_RESPONSE=$(cat diff.txt | bash ai-reviewer.sh 2>&1)
+          AI_RESPONSE=$(cat diff.txt | bash ai-reviewer.sh)
           EXIT_CODE=$?
           set -e
 
@@ -76,29 +76,8 @@ jobs:
             exit 1
           fi
 
-          # Extract only the JSON part (find the last valid JSON object in output)
-          # This handles debug output being mixed with the JSON response
-          # Try to find JSON by looking for lines starting with { and ending with }
-          AI_JSON=$(echo "$AI_RESPONSE" | awk '/^{$/,/^}$/{print}' | tail -n +1)
-
-          # If that didn't work, try extracting just the last line that looks like JSON
-          if [ -z "$AI_JSON" ] || ! echo "$AI_JSON" | jq . >/dev/null 2>&1; then
-            # Try to extract the last complete JSON object using perl
-            AI_JSON=$(echo "$AI_RESPONSE" | perl -0777 -ne 'print $1 if /(\{(?:[^{}]|(?R))*\})[^\{]*$/s' | tail -c 1000000)
-          fi
-
-          if [ -z "$AI_JSON" ]; then
-            echo "⚠️ Could not extract JSON from AI response."
-            if [ "$DEBUG_MODE" = "true" ]; then
-              echo "=== DEBUG: Full response (first 3000 chars) ==="
-              echo "$AI_RESPONSE" | head -c 3000
-              echo "=== END DEBUG ==="
-            fi
-            exit 1
-          fi
-
           # Parse JSON response
-          if ! echo "$AI_JSON" | jq . >/dev/null 2>&1; then
+          if ! echo "$AI_RESPONSE" | jq . >/dev/null 2>&1; then
             echo "⚠️ AI response is not valid JSON. Cannot process review."
 
             # Log raw response for debugging (redact sensitive info)
@@ -124,9 +103,9 @@ jobs:
           fi
 
           # Extract fields from JSON
-          REVIEW=$(echo "$AI_JSON" | jq -r '.review // "No review provided"')
-          DECISION=$(echo "$AI_JSON" | jq -r '.fail_pass_workflow // "uncertain"' | tr -d '\n\r' | xargs)
-          LABELS=$(echo "$AI_JSON" | jq -r '.labels_added[]? // empty')
+          REVIEW=$(echo "$AI_RESPONSE" | jq -r '.review // "No review provided"')
+          DECISION=$(echo "$AI_RESPONSE" | jq -r '.fail_pass_workflow // "uncertain"' | tr -d '\n\r' | xargs)
+          LABELS=$(echo "$AI_RESPONSE" | jq -r '.labels_added[]? // empty')
 
           # Post the review as PR comment
           echo "$REVIEW" | gh pr comment ${{ github.event.pull_request.number }} --repo ${{ github.repository }} -F -