action/entrypoint.py at main · VecGrep/action · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
"""
VecGrep GitHub Action entrypoint.

Reads inputs from INPUT_* environment variables (set by action.yml),
runs the requested vecgrep operation, writes outputs, and exits with
the appropriate code.
"""

from __future__ import annotations

import json
import os
import sys
import urllib.request
import urllib.error
from pathlib import Path


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _env(key: str, default: str = "") -> str:
    return os.environ.get(key, default).strip()


def _set_output(name: str, value: str) -> None:
    """Write a GitHub Actions step output."""
    output_file = os.environ.get("GITHUB_OUTPUT")
    if output_file:
        with open(output_file, "a") as f:
            delimiter = "EOF_VECGREP"
            f.write(f"{name}<<{delimiter}\n{value}\n{delimiter}\n")
    else:
        print(f"::set-output name={name}::{value}")


def _log(msg: str) -> None:
    print(msg, flush=True)


def _fail(msg: str) -> None:
    print(f"::error::{msg}", flush=True)
    sys.exit(1)


def _resolve_path(raw: str) -> str:
    """Resolve path relative to GITHUB_WORKSPACE if not absolute."""
    workspace = os.environ.get("GITHUB_WORKSPACE", "")
    p = Path(raw)
    if not p.is_absolute() and workspace:
        p = Path(workspace) / p
    return str(p.resolve())


# ---------------------------------------------------------------------------
# GitHub API helpers
# ---------------------------------------------------------------------------

def _get_pr_number() -> int | None:
    event_path = os.environ.get("GITHUB_EVENT_PATH", "")
    if not event_path or not Path(event_path).exists():
        return None
    try:
        event = json.loads(Path(event_path).read_text())
        return event.get("pull_request", {}).get("number")
    except (json.JSONDecodeError, OSError):
        return None


def _github_get(token: str, url: str) -> dict | list:
    req = urllib.request.Request(
        url,
        headers={
            "Authorization": f"Bearer {token}",
            "Accept": "application/vnd.github+json",
            "X-GitHub-Api-Version": "2022-11-28",
        },
    )
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())


def _get_pr_changed_files(token: str, repo: str, pr_number: int) -> list[dict]:
    """Return the list of files changed in a PR with their patches."""
    url = f"https://api.github.com/repos/{repo}/pulls/{pr_number}/files?per_page=100"
    return _github_get(token, url)  # type: ignore


def _post_pr_comment(token: str, repo: str, pr_number: int, body: str) -> None:
    url = f"https://api.github.com/repos/{repo}/issues/{pr_number}/comments"
    payload = json.dumps({"body": body}).encode()
    req = urllib.request.Request(
        url,
        data=payload,
        headers={
            "Authorization": f"Bearer {token}",
            "Accept": "application/vnd.github+json",
            "Content-Type": "application/json",
            "X-GitHub-Api-Version": "2022-11-28",
        },
        method="POST",
    )
    try:
        with urllib.request.urlopen(req) as resp:
            _log(f"PR comment posted (HTTP {resp.status}).")
    except urllib.error.HTTPError as e:
        _log(f"Warning: failed to post PR comment — HTTP {e.code}: {e.reason}")


def _extract_added_lines(patch: str) -> str:
    """Extract only the added lines from a unified diff patch."""
    lines = []
    for line in patch.split("\n"):
        if line.startswith("+") and not line.startswith("+++"):
            lines.append(line[1:])
    return "\n".join(lines)


# ---------------------------------------------------------------------------
# VecGrep operations
# ---------------------------------------------------------------------------

def _do_index(path: str) -> str:
    from vecgrep.server import _do_index as do_index  # type: ignore
    return do_index(path, force=False)


def _do_search(path: str, query: str, top_k: int, min_score: float) -> list[dict]:
    """
    Run a semantic search and return results filtered by min_score.
    Each result: {rank, file, start_line, end_line, score, content}
    """
    from vecgrep.server import _do_index, _get_store  # type: ignore
    from vecgrep.embedder import embed  # type: ignore

    _do_index(path, force=False)

    query_vec = embed([query])[0]

    with _get_store(path) as store:
        raw = store.search(query_vec, top_k=top_k)

    results = []
    for i, row in enumerate(raw, start=1):
        score = float(row.get("score", 0.0))
        if score < min_score:
            continue
        results.append({
            "rank": i,
            "file": row.get("file_path", ""),
            "start_line": row.get("start_line", 0),
            "end_line": row.get("end_line", 0),
            "score": round(score, 4),
            "content": row.get("content", ""),
        })
    return results


def _do_pr_analyze(
    path: str,
    token: str,
    repo: str,
    pr_number: int,
    top_k: int,
    min_score: float,
) -> dict[str, list[dict]]:
    """
    Analyze a PR by:
    1. Fetching the list of changed files and their diffs from the GitHub API.
    2. Extracting added/modified lines from each file's patch.
    3. Using that content as a semantic query to find related code in the codebase.
    4. Returning a map of {changed_filename: [related_results]} for files that
       have meaningful related code elsewhere in the codebase.

    Files with trivial changes (<30 chars of new content) are skipped.
    Results that point back to the changed file itself are excluded.
    """
    _log("Fetching PR changed files...")
    try:
        changed_files = _get_pr_changed_files(token, repo, pr_number)
    except urllib.error.HTTPError as e:
        _fail(f"Failed to fetch PR files — HTTP {e.code}: {e.reason}")

    # Only analyse files that have a patch (excludes binary files, renames with no edits)
    files_with_patch = [
        f for f in changed_files
        if f.get("patch") and f.get("status") in ("added", "modified", "renamed")
    ]

    if not files_with_patch:
        _log("No changed files with content to analyse.")
        return {}

    _log(f"Indexing codebase at {path}...")
    _do_index(path)

    findings: dict[str, list[dict]] = {}

    # Limit to 10 files per run to keep CI times reasonable
    for file_info in files_with_patch[:10]:
        filename = file_info["filename"]
        patch = file_info.get("patch", "")
        added_content = _extract_added_lines(patch)

        if len(added_content.strip()) < 30:
            _log(f"Skipping {filename} — trivial change.")
            continue

        # Truncate to 600 chars to keep embedding meaningful
        query_content = added_content[:600]
        _log(f"Searching for code related to changes in {filename}...")

        results = _do_search(path, query_content, top_k=top_k, min_score=min_score)

        # Filter out results from the changed file itself
        workspace = os.environ.get("GITHUB_WORKSPACE", "")
        related = [
            r for r in results
            if not r["file"].endswith(filename)
        ]

        if related:
            findings[filename] = related

    return findings


def _do_duplicate_detection(path: str, min_score: float, top_k: int) -> list[dict]:
    """
    Find semantically similar code chunks within the codebase.
    Returns pairs with score above min_score.

    Retrieves all indexed chunks with their vectors via the underlying LanceDB
    table (store._table.to_arrow()), then for each chunk searches for its
    nearest neighbours and collects pairs that exceed min_score.
    """
    import numpy as np
    from vecgrep.server import _do_index, _get_store  # type: ignore

    _do_index(path, force=False)

    # Pull all rows including the vector column from the LanceDB table
    with _get_store(path) as store:
        arrow_table = store._table.to_arrow()

    file_paths = arrow_table.column("file_path").to_pylist()
    start_lines = arrow_table.column("start_line").to_pylist()
    vectors = arrow_table.column("vector").to_pylist()

    pairs = []
    seen: set[tuple] = set()

    for file_a, line_a, vec_raw in zip(file_paths, start_lines, vectors):
        vec = np.array(vec_raw, dtype=np.float32)

        with _get_store(path) as store:
            neighbours = store.search(vec, top_k=top_k + 1)

        for neighbour in neighbours:
            file_b = neighbour.get("file_path", "")
            line_b = neighbour.get("start_line", 0)
            score = float(neighbour.get("score", 0.0))

            if file_a == file_b and line_a == line_b:
                continue
            if score < min_score:
                continue

            key = tuple(sorted([(file_a, line_a), (file_b, line_b)]))
            if key in seen:
                continue
            seen.add(key)

            pairs.append({
                "file_a": file_a,
                "start_line_a": line_a,
                "file_b": file_b,
                "start_line_b": line_b,
                "score": round(score, 4),
            })

    pairs.sort(key=lambda x: x["score"], reverse=True)
    return pairs


# ---------------------------------------------------------------------------
# Output formatters
# ---------------------------------------------------------------------------

def _format_results_markdown(results: list[dict], header: str) -> str:
    if not results:
        return f"### {header}\n\nNo results found above the score threshold."

    lines = [f"### {header}", ""]
    for r in results:
        lines.append(
            f"**[{r['rank']}]** `{r['file']}:{r['start_line']}-{r['end_line']}` "
            f"(score: {r['score']})"
        )
        lines.append("```")
        lines.append(r["content"].strip())
        lines.append("```")
        lines.append("")
    return "\n".join(lines)


def _format_pr_analysis_markdown(findings: dict[str, list[dict]]) -> str:
    if not findings:
        return (
            "## VecGrep PR Analysis\n\n"
            "No semantically related code found in the codebase for the changes in this PR."
        )

    lines = [
        "## VecGrep PR Analysis",
        "",
        "The following files changed in this PR have semantically related code elsewhere "
        "in the codebase. Review these to avoid duplication or ensure consistency.",
        "",
    ]

    for changed_file, results in findings.items():
        lines.append(f"---")
        lines.append(f"### Changes in `{changed_file}`")
        lines.append("")
        lines.append(f"Related code found ({len(results)} match(es)):")
        lines.append("")
        for r in results:
            lines.append(
                f"**[{r['rank']}]** `{r['file']}:{r['start_line']}-{r['end_line']}` "
                f"(score: {r['score']})"
            )
            lines.append("<details><summary>View snippet</summary>")
            lines.append("")
            lines.append("```python")
            lines.append(r["content"].strip())
            lines.append("```")
            lines.append("</details>")
            lines.append("")

    lines.append("---")
    lines.append(
        "_Generated by [VecGrep Action](https://github.com/VecGrep/action)_"
    )
    return "\n".join(lines)


def _format_duplicates_markdown(pairs: list[dict], header: str) -> str:
    if not pairs:
        return f"### {header}\n\nNo duplicate logic detected above the score threshold."

    lines = [f"### {header}", ""]
    for i, p in enumerate(pairs, start=1):
        lines.append(
            f"**[{i}]** `{p['file_a']}:{p['start_line_a']}` "
            f"vs `{p['file_b']}:{p['start_line_b']}` "
            f"(score: {p['score']})"
        )
        lines.append("")
    return "\n".join(lines)


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main() -> None:
    mode             = _env("INPUT_MODE", "search")
    query            = _env("INPUT_QUERY")
    raw_path         = _env("INPUT_PATH", ".")
    top_k            = int(_env("INPUT_TOP_K", "8"))
    min_score        = float(_env("INPUT_MIN_SCORE", "0.7"))
    fail_on_match    = _env("INPUT_FAIL_ON_MATCH", "false").lower() == "true"
    fail_on_no_match = _env("INPUT_FAIL_ON_NO_MATCH", "false").lower() == "true"
    comment_header   = _env("INPUT_COMMENT_HEADER", "VecGrep Semantic Search Results")
    github_token     = _env("INPUT_GITHUB_TOKEN")
    repo             = _env("GITHUB_REPOSITORY")

    path = _resolve_path(raw_path)
    _log(f"VecGrep action | mode={mode} | path={path}")

    # ------------------------------------------------------------------
    # index
    # ------------------------------------------------------------------
    if mode == "index":
        stats = _do_index(path)
        _log(stats)
        _set_output("index_stats", stats)
        return

    # ------------------------------------------------------------------
    # analyze — PR diff analysis with automatic comment
    # ------------------------------------------------------------------
    if mode == "analyze":
        if not github_token:
            _fail("Input 'github_token' is required for mode: analyze")

        pr_number = _get_pr_number()
        if not pr_number:
            _fail("Could not determine PR number. Ensure this runs on a pull_request event.")
        if not repo:
            _fail("GITHUB_REPOSITORY is not set.")

        findings = _do_pr_analyze(path, github_token, repo, pr_number, top_k, min_score)
        total_matches = sum(len(v) for v in findings.items())

        _set_output("results", json.dumps(findings, indent=2))
        _set_output("match_count", str(len(findings)))

        body = _format_pr_analysis_markdown(findings)
        _log(body)
        _post_pr_comment(github_token, repo, pr_number, body)
        return

    # ------------------------------------------------------------------
    # search / validate / comment
    # ------------------------------------------------------------------
    if mode in ("search", "validate", "comment"):
        if not query:
            _fail("Input 'query' is required for mode: " + mode)

        results = _do_search(path, query, top_k, min_score)
        match_count = len(results)

        _set_output("results", json.dumps(results, indent=2))
        _set_output("match_count", str(match_count))

        if mode == "search":
            if results:
                for r in results:
                    _log(f"[{r['rank']}] {r['file']}:{r['start_line']}-{r['end_line']} (score: {r['score']})")
                    _log(r["content"].strip())
                    _log("")
            else:
                _log("No results found above the score threshold.")

        if mode == "comment":
            pr_number = _get_pr_number()
            if not github_token:
                _log("Warning: github_token not provided — skipping PR comment.")
            elif not pr_number:
                _log("Warning: could not determine PR number — skipping PR comment.")
            elif not repo:
                _log("Warning: GITHUB_REPOSITORY not set — skipping PR comment.")
            else:
                body = _format_results_markdown(results, comment_header)
                _post_pr_comment(github_token, repo, pr_number, body)

        if fail_on_match and match_count > 0:
            _fail(f"Found {match_count} match(es) for query '{query}' (fail_on_match=true).")
        if fail_on_no_match and match_count == 0:
            _fail(f"No matches found for query '{query}' (fail_on_no_match=true).")
        return

    # ------------------------------------------------------------------
    # duplicate
    # ------------------------------------------------------------------
    if mode == "duplicate":
        pairs = _do_duplicate_detection(path, min_score, top_k)
        match_count = len(pairs)

        _set_output("results", json.dumps(pairs, indent=2))
        _set_output("match_count", str(match_count))

        if pairs:
            _log(f"Found {match_count} potential duplicate(s):")
            for p in pairs:
                _log(
                    f"  {p['file_a']}:{p['start_line_a']} <-> "
                    f"{p['file_b']}:{p['start_line_b']}  (score: {p['score']})"
                )
        else:
            _log("No duplicate logic detected above the score threshold.")

        if fail_on_match and match_count > 0:
            _fail(f"Found {match_count} duplicate pair(s) (fail_on_match=true).")
        return

    _fail(
        f"Unknown mode: '{mode}'. "
        "Valid modes: index, search, validate, comment, duplicate, analyze."
    )


if __name__ == "__main__":
    main()