From 57e2d7ad4c7d3b483fb82c14546955c02f73bc94 Mon Sep 17 00:00:00 2001 From: LearningCircuit <185559241+LearningCircuit@users.noreply.github.com> Date: Sun, 16 Nov 2025 13:00:36 +0100 Subject: [PATCH 1/5] fix: increase MAX_DIFF_SIZE default from 800KB to 5MB Allows larger PRs to be reviewed without manual configuration. The 800KB limit was too restrictive for many real-world PRs. --- ai-reviewer.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai-reviewer.sh b/ai-reviewer.sh index f932cb4..a32202a 100644 --- a/ai-reviewer.sh +++ b/ai-reviewer.sh @@ -29,7 +29,7 @@ fi AI_MODEL="${AI_MODEL:-moonshotai/kimi-k2-thinking}" AI_TEMPERATURE="${AI_TEMPERATURE:-0.1}" AI_MAX_TOKENS="${AI_MAX_TOKENS:-64000}" -MAX_DIFF_SIZE="${MAX_DIFF_SIZE:-800000}" # 800KB default limit (~200K tokens, matching model context size) +MAX_DIFF_SIZE="${MAX_DIFF_SIZE:-5000000}" # 5MB default limit (allows large PRs while preventing excessive API usage) EXCLUDE_FILE_PATTERNS="${EXCLUDE_FILE_PATTERNS:-*.lock,*.min.js,*.min.css,package-lock.json,yarn.lock}" # Read diff content from stdin From 6461e8d0cb3acdafb801dd690f51b144c01ac576 Mon Sep 17 00:00:00 2001 From: LearningCircuit <185559241+LearningCircuit@users.noreply.github.com> Date: Sun, 16 Nov 2025 13:12:30 +0100 Subject: [PATCH 2/5] feat: add environment variables to control context inclusion Adds the following environment variables to reduce token usage: - INCLUDE_PREVIOUS_REVIEWS (default: true) - INCLUDE_HUMAN_COMMENTS (default: true) - INCLUDE_CHECK_RUNS (default: true) - INCLUDE_LABELS (default: true) - INCLUDE_PR_DESCRIPTION (default: true) - INCLUDE_COMMIT_MESSAGES (default: true) Set any to 'false' to exclude that context and reduce token count. This helps handle large PRs that would otherwise exceed model context limits. --- ai-reviewer.sh | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/ai-reviewer.sh b/ai-reviewer.sh index a32202a..92b2e8c 100644 --- a/ai-reviewer.sh +++ b/ai-reviewer.sh @@ -32,6 +32,14 @@ AI_MAX_TOKENS="${AI_MAX_TOKENS:-64000}" MAX_DIFF_SIZE="${MAX_DIFF_SIZE:-5000000}" # 5MB default limit (allows large PRs while preventing excessive API usage) EXCLUDE_FILE_PATTERNS="${EXCLUDE_FILE_PATTERNS:-*.lock,*.min.js,*.min.css,package-lock.json,yarn.lock}" +# Context inclusion options (set to 'false' to disable, reduces token usage) +INCLUDE_PREVIOUS_REVIEWS="${INCLUDE_PREVIOUS_REVIEWS:-true}" +INCLUDE_HUMAN_COMMENTS="${INCLUDE_HUMAN_COMMENTS:-true}" +INCLUDE_CHECK_RUNS="${INCLUDE_CHECK_RUNS:-true}" +INCLUDE_LABELS="${INCLUDE_LABELS:-true}" +INCLUDE_PR_DESCRIPTION="${INCLUDE_PR_DESCRIPTION:-true}" +INCLUDE_COMMIT_MESSAGES="${INCLUDE_COMMIT_MESSAGES:-true}" + # Read diff content from stdin DIFF_CONTENT=$(cat) @@ -64,7 +72,7 @@ fi # Fetch previous AI review (only the most recent one) for context PREVIOUS_REVIEWS="" -if [ -n "$PR_NUMBER" ] && [ -n "$REPO_FULL_NAME" ] && [ -n "$GITHUB_TOKEN" ]; then +if [ "$INCLUDE_PREVIOUS_REVIEWS" = "true" ] && [ -n "$PR_NUMBER" ] && [ -n "$REPO_FULL_NAME" ] && [ -n "$GITHUB_TOKEN" ]; then # Fetch only the most recent AI review comment PREVIOUS_REVIEWS=$(gh api "repos/$REPO_FULL_NAME/issues/$PR_NUMBER/comments" \ --jq '[.[] | select(.body | startswith("## AI Code Review"))] | last | if . then "### Previous AI Review (" + .created_at + "):\n" + .body + "\n---\n" else "" end' 2>/dev/null | head -c 10000 || echo "") @@ -72,7 +80,7 @@ fi # Fetch human comments for context HUMAN_COMMENTS="" -if [ -n "$PR_NUMBER" ] && [ -n "$REPO_FULL_NAME" ] && [ -n "$GITHUB_TOKEN" ]; then +if [ "$INCLUDE_HUMAN_COMMENTS" = "true" ] && [ -n "$PR_NUMBER" ] && [ -n "$REPO_FULL_NAME" ] && [ -n "$GITHUB_TOKEN" ]; then # Fetch comments from humans (not the bot) HUMAN_COMMENTS=$(gh api "repos/$REPO_FULL_NAME/issues/$PR_NUMBER/comments" \ --jq '[.[] | select(.body | startswith("## AI Code Review") | not)] | map("**" + .user.login + "** (" + .created_at + "):\n" + .body) | join("\n\n---\n\n")' 2>/dev/null | head -c 20000 || echo "") @@ -80,7 +88,7 @@ fi # Fetch GitHub Actions check runs status (if PR_NUMBER and REPO_FULL_NAME are set) CHECK_RUNS_STATUS="" -if [ -n "$PR_NUMBER" ] && [ -n "$REPO_FULL_NAME" ] && [ -n "$GITHUB_TOKEN" ]; then +if [ "$INCLUDE_CHECK_RUNS" = "true" ] && [ -n "$PR_NUMBER" ] && [ -n "$REPO_FULL_NAME" ] && [ -n "$GITHUB_TOKEN" ]; then # Get the head SHA of the PR HEAD_SHA=$(gh api "repos/$REPO_FULL_NAME/pulls/$PR_NUMBER" --jq '.head.sha' 2>/dev/null || echo "") @@ -93,7 +101,7 @@ fi # Fetch available repository labels (if PR_NUMBER and REPO_FULL_NAME are set) AVAILABLE_LABELS="" -if [ -n "$PR_NUMBER" ] && [ -n "$REPO_FULL_NAME" ] && [ -n "$GITHUB_TOKEN" ]; then +if [ "$INCLUDE_LABELS" = "true" ] && [ -n "$PR_NUMBER" ] && [ -n "$REPO_FULL_NAME" ] && [ -n "$GITHUB_TOKEN" ]; then # Fetch all labels from the repository if [ "$DEBUG_MODE" = "true" ]; then echo "🔍 Fetching available labels from repository..." >&2 @@ -113,7 +121,7 @@ fi # Fetch PR title and description PR_DESCRIPTION="" -if [ -n "$PR_NUMBER" ] && [ -n "$REPO_FULL_NAME" ] && [ -n "$GITHUB_TOKEN" ]; then +if [ "$INCLUDE_PR_DESCRIPTION" = "true" ] && [ -n "$PR_NUMBER" ] && [ -n "$REPO_FULL_NAME" ] && [ -n "$GITHUB_TOKEN" ]; then if [ "$DEBUG_MODE" = "true" ]; then echo "🔍 Fetching PR title and description..." >&2 fi @@ -127,7 +135,7 @@ fi # Fetch commit messages (limit to 15 most recent, exclude merges) COMMIT_MESSAGES="" -if [ -n "$PR_NUMBER" ] && [ -n "$REPO_FULL_NAME" ] && [ -n "$GITHUB_TOKEN" ]; then +if [ "$INCLUDE_COMMIT_MESSAGES" = "true" ] && [ -n "$PR_NUMBER" ] && [ -n "$REPO_FULL_NAME" ] && [ -n "$GITHUB_TOKEN" ]; then if [ "$DEBUG_MODE" = "true" ]; then echo "🔍 Fetching commit messages..." >&2 fi From b79f0322dc42a0cec206d15f5ca7cde85e73399c Mon Sep 17 00:00:00 2001 From: LearningCircuit <185559241+LearningCircuit@users.noreply.github.com> Date: Sun, 16 Nov 2025 13:22:53 +0100 Subject: [PATCH 3/5] fix: extract JSON from mixed debug output in workflow When DEBUG_MODE is enabled, the workflow captures both debug output (stderr) and JSON output (stdout) together. This caused JSON parsing to fail because it tried to parse the entire mixed output. Now the workflow extracts just the JSON object from the mixed output before attempting to parse it. --- .github/workflows/ai-code-reviewer.yml | 29 ++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ai-code-reviewer.yml b/.github/workflows/ai-code-reviewer.yml index 44c5c54..0253f64 100644 --- a/.github/workflows/ai-code-reviewer.yml +++ b/.github/workflows/ai-code-reviewer.yml @@ -76,8 +76,29 @@ jobs: exit 1 fi + # Extract only the JSON part (find the last valid JSON object in output) + # This handles debug output being mixed with the JSON response + # Try to find JSON by looking for lines starting with { and ending with } + AI_JSON=$(echo "$AI_RESPONSE" | awk '/^{$/,/^}$/{print}' | tail -n +1) + + # If that didn't work, try extracting just the last line that looks like JSON + if [ -z "$AI_JSON" ] || ! echo "$AI_JSON" | jq . >/dev/null 2>&1; then + # Try to extract the last complete JSON object using perl + AI_JSON=$(echo "$AI_RESPONSE" | perl -0777 -ne 'print $1 if /(\{(?:[^{}]|(?R))*\})[^\{]*$/s' | tail -c 1000000) + fi + + if [ -z "$AI_JSON" ]; then + echo "⚠️ Could not extract JSON from AI response." + if [ "$DEBUG_MODE" = "true" ]; then + echo "=== DEBUG: Full response (first 3000 chars) ===" + echo "$AI_RESPONSE" | head -c 3000 + echo "=== END DEBUG ===" + fi + exit 1 + fi + # Parse JSON response - if ! echo "$AI_RESPONSE" | jq . >/dev/null 2>&1; then + if ! echo "$AI_JSON" | jq . >/dev/null 2>&1; then echo "⚠️ AI response is not valid JSON. Cannot process review." # Log raw response for debugging (redact sensitive info) @@ -103,9 +124,9 @@ jobs: fi # Extract fields from JSON - REVIEW=$(echo "$AI_RESPONSE" | jq -r '.review // "No review provided"') - DECISION=$(echo "$AI_RESPONSE" | jq -r '.fail_pass_workflow // "uncertain"') - LABELS=$(echo "$AI_RESPONSE" | jq -r '.labels_added[]? // empty') + REVIEW=$(echo "$AI_JSON" | jq -r '.review // "No review provided"') + DECISION=$(echo "$AI_JSON" | jq -r '.fail_pass_workflow // "uncertain"') + LABELS=$(echo "$AI_JSON" | jq -r '.labels_added[]? // empty') # Post the review as PR comment echo "$REVIEW" | gh pr comment ${{ github.event.pull_request.number }} --repo ${{ github.repository }} -F - From 01dae60d5a8a77cba0f546c486640114d8f94e5c Mon Sep 17 00:00:00 2001 From: LearningCircuit <185559241+LearningCircuit@users.noreply.github.com> Date: Sun, 16 Nov 2025 13:45:42 +0100 Subject: [PATCH 4/5] fix: quote GITHUB_ENV and trim DECISION value Fixes 'Unable to process file command env' error by: 1. Adding quotes around $GITHUB_ENV variable 2. Trimming whitespace/newlines from DECISION value before setting it --- .github/workflows/ai-code-reviewer.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ai-code-reviewer.yml b/.github/workflows/ai-code-reviewer.yml index 0253f64..5c44119 100644 --- a/.github/workflows/ai-code-reviewer.yml +++ b/.github/workflows/ai-code-reviewer.yml @@ -125,7 +125,7 @@ jobs: # Extract fields from JSON REVIEW=$(echo "$AI_JSON" | jq -r '.review // "No review provided"') - DECISION=$(echo "$AI_JSON" | jq -r '.fail_pass_workflow // "uncertain"') + DECISION=$(echo "$AI_JSON" | jq -r '.fail_pass_workflow // "uncertain"' | tr -d '\n\r' | xargs) LABELS=$(echo "$AI_JSON" | jq -r '.labels_added[]? // empty') # Post the review as PR comment @@ -151,7 +151,7 @@ jobs: echo "AI decision: $DECISION" # Store the decision for later use - echo "DECISION=$DECISION" >> $GITHUB_ENV + echo "DECISION=$DECISION" >> "$GITHUB_ENV" # Remove the ai_code_review label to make re-triggering easier echo "Removing ai_code_review label to allow easy re-triggering" From fab5597f77cf6ca8d2c575b9160a1a3ad5d3334c Mon Sep 17 00:00:00 2001 From: LearningCircuit <185559241+LearningCircuit@users.noreply.github.com> Date: Sun, 16 Nov 2025 14:36:05 +0100 Subject: [PATCH 5/5] revert: remove complex JSON extraction logic Reverts the fragile awk/perl JSON extraction that was causing failures. Changes: - Removed AI_JSON extraction logic - Removed 2>&1 to prevent stderr mixing with JSON output - Debug messages now go to workflow logs naturally - Keeps DECISION trimming and GITHUB_ENV quoting fixes The script outputs JSON to stdout and debug to stderr, so they don't need complex extraction - just capture stdout for JSON. --- .github/workflows/ai-code-reviewer.yml | 31 +++++--------------------- 1 file changed, 5 insertions(+), 26 deletions(-) diff --git a/.github/workflows/ai-code-reviewer.yml b/.github/workflows/ai-code-reviewer.yml index 5c44119..4b1172f 100644 --- a/.github/workflows/ai-code-reviewer.yml +++ b/.github/workflows/ai-code-reviewer.yml @@ -49,7 +49,7 @@ jobs: # Temporarily disable errexit to capture errors properly set +e - AI_RESPONSE=$(cat diff.txt | bash ai-reviewer.sh 2>&1) + AI_RESPONSE=$(cat diff.txt | bash ai-reviewer.sh) EXIT_CODE=$? set -e @@ -76,29 +76,8 @@ jobs: exit 1 fi - # Extract only the JSON part (find the last valid JSON object in output) - # This handles debug output being mixed with the JSON response - # Try to find JSON by looking for lines starting with { and ending with } - AI_JSON=$(echo "$AI_RESPONSE" | awk '/^{$/,/^}$/{print}' | tail -n +1) - - # If that didn't work, try extracting just the last line that looks like JSON - if [ -z "$AI_JSON" ] || ! echo "$AI_JSON" | jq . >/dev/null 2>&1; then - # Try to extract the last complete JSON object using perl - AI_JSON=$(echo "$AI_RESPONSE" | perl -0777 -ne 'print $1 if /(\{(?:[^{}]|(?R))*\})[^\{]*$/s' | tail -c 1000000) - fi - - if [ -z "$AI_JSON" ]; then - echo "⚠️ Could not extract JSON from AI response." - if [ "$DEBUG_MODE" = "true" ]; then - echo "=== DEBUG: Full response (first 3000 chars) ===" - echo "$AI_RESPONSE" | head -c 3000 - echo "=== END DEBUG ===" - fi - exit 1 - fi - # Parse JSON response - if ! echo "$AI_JSON" | jq . >/dev/null 2>&1; then + if ! echo "$AI_RESPONSE" | jq . >/dev/null 2>&1; then echo "⚠️ AI response is not valid JSON. Cannot process review." # Log raw response for debugging (redact sensitive info) @@ -124,9 +103,9 @@ jobs: fi # Extract fields from JSON - REVIEW=$(echo "$AI_JSON" | jq -r '.review // "No review provided"') - DECISION=$(echo "$AI_JSON" | jq -r '.fail_pass_workflow // "uncertain"' | tr -d '\n\r' | xargs) - LABELS=$(echo "$AI_JSON" | jq -r '.labels_added[]? // empty') + REVIEW=$(echo "$AI_RESPONSE" | jq -r '.review // "No review provided"') + DECISION=$(echo "$AI_RESPONSE" | jq -r '.fail_pass_workflow // "uncertain"' | tr -d '\n\r' | xargs) + LABELS=$(echo "$AI_RESPONSE" | jq -r '.labels_added[]? // empty') # Post the review as PR comment echo "$REVIEW" | gh pr comment ${{ github.event.pull_request.number }} --repo ${{ github.repository }} -F -