From 240968931ded3e3ede3a618e115caae6106d4a1e Mon Sep 17 00:00:00 2001
From: gupsammy
Date: Wed, 4 Mar 2026 18:06:22 +0530
Subject: [PATCH 1/5] =?UTF-8?q?feat(cli):=20v2.0=20=E2=80=94=20batch=20mod?=
 =?UTF-8?q?e,=20NDJSON=20output,=20retry,=20rate=20limiting?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add `ezycopy batch` subcommand for multi-URL extraction pipelines:
- Concurrent worker pool with configurable parallelism (--concurrency)
- Per-domain rate limiting to avoid overwhelming servers
- Exponential backoff retry for transient failures
- NDJSON streaming output with inline errors
- Content-Type pre-check to skip non-HTML (PDFs, images)
- HTTP response body size limit (--max-body-size)
- Browser pool: single Chrome instance reused across batch URLs
- Signal handling: graceful Ctrl-C, force exit on second
- Structured JSON error output on stderr
- --json flag for root command structured output
- --skip-existing for batch resume after partial failure
- 19 unit tests covering rate limiter, retry, HTTP hardening

Restructures codebase: main.go -> cmd/ package (Cobra best practice).
Root command is fully backward-compatible with v0.4.0.

Co-Authored-By: Claude Sonnet 4.6 --- .gitignore | 4 + batch/ratelimit.go | 62 ++++++ batch/ratelimit_test.go | 110 ++++++++++ batch/retry.go | 41 ++++ batch/retry_test.go | 98 +++++++++ cmd/batch.go | 444 ++++++++++++++++++++++++++++++++++++++++ cmd/batch_test.go | 54 +++++ cmd/root.go | 236 +++++++++++++++++++++ cmd/types.go | 61 ++++++ extractor/browser.go | 5 +- extractor/http.go | 73 ++++++- extractor/http_test.go | 150 ++++++++++++++ extractor/pool.go | 102 +++++++++ go.mod | 1 + go.sum | 2 + main.go | 110 +--------- 16 files changed, 1435 insertions(+), 118 deletions(-) create mode 100644 batch/ratelimit.go create mode 100644 batch/ratelimit_test.go create mode 100644 batch/retry.go create mode 100644 batch/retry_test.go create mode 100644 cmd/batch.go create mode 100644 cmd/batch_test.go create mode 100644 cmd/root.go create mode 100644 cmd/types.go create mode 100644 extractor/http_test.go create mode 100644 extractor/pool.go diff --git a/.gitignore b/.gitignore index 9665a8f..902ee36 100644 --- a/.gitignore +++ b/.gitignore @@ -50,8 +50,12 @@ debug .env.local # Extension artifacts +*.zip extensions/**/*.zip extensions/**/assets/ # Video project videos/ + +# Agent counselors output +agents/ diff --git a/batch/ratelimit.go b/batch/ratelimit.go new file mode 100644 index 0000000..97d9dc3 --- /dev/null +++ b/batch/ratelimit.go @@ -0,0 +1,62 @@ +package batch + +import ( + "context" + "net/url" + "sync" + "time" +) + +// DomainLimiter enforces a minimum delay between requests to the same domain. +type DomainLimiter struct { + mu sync.Mutex + minDelay time.Duration + last map[string]time.Time +} + +// NewDomainLimiter creates a rate limiter with the given minimum delay per domain. +func NewDomainLimiter(minDelay time.Duration) *DomainLimiter { + return &DomainLimiter{ + minDelay: minDelay, + last: make(map[string]time.Time), + } +} + +// Wait blocks until it's safe to make a request to the given URL's domain. 
+// It reserves the slot before returning, so concurrent callers are serialized per domain. +func (d *DomainLimiter) Wait(ctx context.Context, rawURL string) error { + domain := extractDomain(rawURL) + + d.mu.Lock() + lastReq, ok := d.last[domain] + now := time.Now() + + if ok { + elapsed := now.Sub(lastReq) + if elapsed < d.minDelay { + wait := d.minDelay - elapsed + // Reserve the slot before releasing the lock + d.last[domain] = now.Add(wait) + d.mu.Unlock() + + select { + case <-time.After(wait): + return nil + case <-ctx.Done(): + return ctx.Err() + } + } + } + + d.last[domain] = now + d.mu.Unlock() + return nil +} + +func extractDomain(rawURL string) string { + u, err := url.Parse(rawURL) + if err != nil { + return rawURL + } + return u.Hostname() +} diff --git a/batch/ratelimit_test.go b/batch/ratelimit_test.go new file mode 100644 index 0000000..ed190ec --- /dev/null +++ b/batch/ratelimit_test.go @@ -0,0 +1,110 @@ +package batch + +import ( + "context" + "sync" + "testing" + "time" +) + +func TestDomainLimiter_EnforcesDelay(t *testing.T) { + limiter := NewDomainLimiter(100 * time.Millisecond) + ctx := context.Background() + + start := time.Now() + + // First request should not wait + if err := limiter.Wait(ctx, "https://example.com/a"); err != nil { + t.Fatal(err) + } + + // Second request to same domain should wait ~100ms + if err := limiter.Wait(ctx, "https://example.com/b"); err != nil { + t.Fatal(err) + } + + elapsed := time.Since(start) + if elapsed < 90*time.Millisecond { + t.Errorf("expected >= 100ms delay, got %v", elapsed) + } +} + +func TestDomainLimiter_DifferentDomains(t *testing.T) { + limiter := NewDomainLimiter(200 * time.Millisecond) + ctx := context.Background() + + start := time.Now() + + // Two different domains should not block each other + if err := limiter.Wait(ctx, "https://a.com/page"); err != nil { + t.Fatal(err) + } + if err := limiter.Wait(ctx, "https://b.com/page"); err != nil { + t.Fatal(err) + } + + elapsed := time.Since(start) 
+ if elapsed > 50*time.Millisecond { + t.Errorf("different domains should not wait, got %v", elapsed) + } +} + +func TestDomainLimiter_ContextCancellation(t *testing.T) { + limiter := NewDomainLimiter(5 * time.Second) + ctx, cancel := context.WithCancel(context.Background()) + + // First request to establish the domain + if err := limiter.Wait(ctx, "https://example.com/a"); err != nil { + t.Fatal(err) + } + + // Cancel before second request completes + go func() { + time.Sleep(50 * time.Millisecond) + cancel() + }() + + err := limiter.Wait(ctx, "https://example.com/b") + if err == nil { + t.Error("expected context cancellation error") + } +} + +func TestDomainLimiter_ConcurrentSameDomain(t *testing.T) { + limiter := NewDomainLimiter(50 * time.Millisecond) + ctx := context.Background() + + start := time.Now() + var wg sync.WaitGroup + for i := 0; i < 3; i++ { + wg.Add(1) + go func() { + defer wg.Done() + limiter.Wait(ctx, "https://same.com/page") + }() + } + wg.Wait() + + elapsed := time.Since(start) + // 3 requests with 50ms delay = at least ~100ms (first is free, second waits, third waits) + if elapsed < 80*time.Millisecond { + t.Errorf("expected >= 100ms for 3 concurrent same-domain requests, got %v", elapsed) + } +} + +func TestExtractDomain(t *testing.T) { + tests := []struct { + url string + domain string + }{ + {"https://example.com/path", "example.com"}, + {"http://sub.example.com:8080/page", "sub.example.com"}, + {"not-a-url", ""}, + } + for _, tt := range tests { + got := extractDomain(tt.url) + if got != tt.domain { + t.Errorf("extractDomain(%q) = %q, want %q", tt.url, got, tt.domain) + } + } +} diff --git a/batch/retry.go b/batch/retry.go new file mode 100644 index 0000000..0fe2177 --- /dev/null +++ b/batch/retry.go @@ -0,0 +1,41 @@ +package batch + +import ( + "context" + "time" +) + +// RetryConfig controls exponential backoff retry behavior. 
+type RetryConfig struct { + MaxAttempts int // Total attempts (1 = no retry) + InitDelay time.Duration // Delay before first retry; doubles each attempt +} + +// Do executes fn with retries on error. It uses exponential backoff and +// respects context cancellation. Only retries if shouldRetry returns true. +func (rc *RetryConfig) Do(ctx context.Context, fn func() error, shouldRetry func(error) bool) error { + var lastErr error + delay := rc.InitDelay + + for attempt := 0; attempt < rc.MaxAttempts; attempt++ { + lastErr = fn() + if lastErr == nil { + return nil + } + + if !shouldRetry(lastErr) { + return lastErr + } + + // Don't sleep after the last attempt + if attempt < rc.MaxAttempts-1 { + select { + case <-time.After(delay): + delay *= 2 + case <-ctx.Done(): + return ctx.Err() + } + } + } + return lastErr +} diff --git a/batch/retry_test.go b/batch/retry_test.go new file mode 100644 index 0000000..52975a9 --- /dev/null +++ b/batch/retry_test.go @@ -0,0 +1,98 @@ +package batch + +import ( + "context" + "errors" + "testing" + "time" +) + +func TestRetryConfig_NoRetryOnSuccess(t *testing.T) { + rc := &RetryConfig{MaxAttempts: 3, InitDelay: 10 * time.Millisecond} + calls := 0 + + err := rc.Do(context.Background(), func() error { + calls++ + return nil + }, func(error) bool { return true }) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if calls != 1 { + t.Errorf("expected 1 call, got %d", calls) + } +} + +func TestRetryConfig_RetriesOnTransientError(t *testing.T) { + rc := &RetryConfig{MaxAttempts: 3, InitDelay: 10 * time.Millisecond} + calls := 0 + + err := rc.Do(context.Background(), func() error { + calls++ + if calls < 3 { + return errors.New("transient") + } + return nil + }, func(error) bool { return true }) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if calls != 3 { + t.Errorf("expected 3 calls, got %d", calls) + } +} + +func TestRetryConfig_StopsOnNonRetryable(t *testing.T) { + rc := &RetryConfig{MaxAttempts: 5, 
InitDelay: 10 * time.Millisecond} + calls := 0 + permanent := errors.New("permanent") + + err := rc.Do(context.Background(), func() error { + calls++ + return permanent + }, func(err error) bool { return err.Error() != "permanent" }) + + if err != permanent { + t.Fatalf("expected permanent error, got %v", err) + } + if calls != 1 { + t.Errorf("expected 1 call (no retry), got %d", calls) + } +} + +func TestRetryConfig_ExponentialBackoff(t *testing.T) { + rc := &RetryConfig{MaxAttempts: 3, InitDelay: 50 * time.Millisecond} + calls := 0 + + start := time.Now() + rc.Do(context.Background(), func() error { + calls++ + return errors.New("fail") + }, func(error) bool { return true }) + + elapsed := time.Since(start) + // Expected: 50ms + 100ms = 150ms minimum + if elapsed < 130*time.Millisecond { + t.Errorf("expected >= 150ms for exponential backoff, got %v", elapsed) + } +} + +func TestRetryConfig_ContextCancellation(t *testing.T) { + rc := &RetryConfig{MaxAttempts: 10, InitDelay: 5 * time.Second} + ctx, cancel := context.WithCancel(context.Background()) + + go func() { + time.Sleep(50 * time.Millisecond) + cancel() + }() + + err := rc.Do(ctx, func() error { + return errors.New("fail") + }, func(error) bool { return true }) + + if err == nil { + t.Error("expected error from cancelled context") + } +} diff --git a/cmd/batch.go b/cmd/batch.go new file mode 100644 index 0000000..61182ad --- /dev/null +++ b/cmd/batch.go @@ -0,0 +1,444 @@ +package cmd + +import ( + "bufio" + "context" + "encoding/json" + "errors" + "fmt" + "os" + "path/filepath" + "regexp" + "strings" + "sync" + "time" + + "github.com/gupsammy/EzyCopy/batch" + "github.com/gupsammy/EzyCopy/extractor" + "github.com/gupsammy/EzyCopy/output" + "github.com/spf13/cobra" + "golang.org/x/sync/errgroup" +) + +var ( + fileFlag string + concurrency int + rateLimit time.Duration + retries int + retryDelay time.Duration + failFast bool + skipExisting bool + maxBodySize string +) + +func init() { + batchCmd := 
&cobra.Command{ + Use: "batch [url...]", + Short: "Extract multiple URLs to NDJSON", + Long: `Extract multiple URLs in parallel, streaming results as NDJSON (one JSON object per line). + +Input sources (mutually exclusive): + Positional args: ezycopy batch https://a.com https://b.com + File: ezycopy batch --file urls.txt + Stdin: cat urls.txt | ezycopy batch - + +Output is always NDJSON to stdout. Progress goes to stderr.`, + RunE: runBatch, + } + + f := batchCmd.Flags() + f.StringVarP(&fileFlag, "file", "f", "", "Read URLs from file (one per line)") + f.IntVarP(&concurrency, "concurrency", "j", 3, "Max parallel extractions") + f.DurationVar(&rateLimit, "rate-limit", 1*time.Second, "Min delay between requests to same domain") + f.IntVar(&retries, "retries", 2, "Retry count on transient failures") + f.DurationVar(&retryDelay, "retry-delay", 3*time.Second, "Base delay between retries (exponential backoff)") + f.BoolVar(&failFast, "fail-fast", false, "Stop on first error") + f.BoolVar(&skipExisting, "skip-existing", false, "Skip URLs whose output file already exists (requires -o)") + f.StringVar(&maxBodySize, "max-body-size", "10MB", "Max HTTP response body size") + + rootCmd.AddCommand(batchCmd) +} + +func runBatch(cmd *cobra.Command, args []string) error { + ctx := cmd.Context() + + // Collect URLs from mutually exclusive sources + urls, err := collectURLs(args) + if err != nil { + return err + } + if len(urls) == 0 { + return &UsageError{Msg: "no URLs provided", Hint: "ezycopy batch --help"} + } + + // Parse max body size + bodyLimit, err := parseByteSize(maxBodySize) + if err != nil { + return &UsageError{Msg: fmt.Sprintf("invalid --max-body-size: %v", err)} + } + + // Setup concurrency primitives + limiter := batch.NewDomainLimiter(rateLimit) + retryCfg := &batch.RetryConfig{ + MaxAttempts: retries + 1, // retries=2 means 3 total attempts + InitDelay: retryDelay, + } + + // Setup browser pool if needed + var pool *extractor.BrowserPool + if browserFlag { + pool, 
err = extractor.NewBrowserPool(browserWS) + if err != nil { + return fmt.Errorf("failed to start browser: %w", err) + } + defer pool.Close() + } + + // Validate output directory if set + if outputFlag != "" { + if err := os.MkdirAll(outputFlag, 0755); err != nil { + return fmt.Errorf("failed to create output directory: %w", err) + } + } + + // Results channel — buffered to avoid blocking workers + results := make(chan BatchResult, len(urls)) + + // Writer goroutine — single writer to stdout, no interleaving + var writerWg sync.WaitGroup + writerWg.Add(1) + var okCount, errCount int + var errCodes []string + go func() { + defer writerWg.Done() + enc := json.NewEncoder(os.Stdout) + enc.SetEscapeHTML(false) + for r := range results { + enc.Encode(r) + if r.Status == "ok" { + okCount++ + } else { + errCount++ + errCodes = append(errCodes, r.Error) + } + } + }() + + // Worker pool + g, gctx := errgroup.WithContext(ctx) + g.SetLimit(concurrency) + + for i, u := range urls { + seq := i + rawURL := u + + g.Go(func() error { + result := fetchAndExtract(gctx, seq, rawURL, limiter, retryCfg, pool, bodyLimit) + results <- result + + // Write to file if output dir is set and extraction succeeded + if outputFlag != "" && result.Status == "ok" && result.Title != nil { + writeResultToFile(result) + } + + if !quiet { + status := "ok" + if result.Status == "error" { + status = result.Error + } + fmt.Fprintf(os.Stderr, "[%d/%d] %s (%s)\n", seq+1, len(urls), rawURL, status) + } + + if failFast && result.Status == "error" { + return fmt.Errorf("failed: %s", rawURL) + } + return nil + }) + } + + // Wait for all workers, then close results channel + _ = g.Wait() + close(results) + writerWg.Wait() + + // Summary to stderr + if !quiet { + total := okCount + errCount + if errCount > 0 { + unique := uniqueStrings(errCodes) + fmt.Fprintf(os.Stderr, "%d URLs: %d ok, %d failed (%s)\n", total, okCount, errCount, strings.Join(unique, ", ")) + } else { + fmt.Fprintf(os.Stderr, "%d URLs: %d ok\n", 
total, okCount) + } + } + + // Exit code + if errCount == len(urls) { + return &batchError{code: ExitTotalFailure} + } + if errCount > 0 { + return &batchError{code: ExitPartial} + } + return nil +} + +func fetchAndExtract(ctx context.Context, seq int, rawURL string, limiter *batch.DomainLimiter, retryCfg *batch.RetryConfig, pool *extractor.BrowserPool, maxBody int64) BatchResult { + start := time.Now() + result := BatchResult{ + Seq: seq, + OriginalURL: rawURL, + Type: resolveType(), + ExtractedAt: time.Now().UTC().Format(time.RFC3339), + } + + var pageResult *extractor.PageResult + var extractErr error + + err := retryCfg.Do(ctx, func() error { + if err := limiter.Wait(ctx, rawURL); err != nil { + return err + } + + if pool != nil { + pageResult, extractErr = pool.FetchPage(ctx, rawURL, timeout) + } else { + pageResult, extractErr = extractor.FetchPageHTTP(ctx, rawURL, timeout, maxBody) + } + return extractErr + }, isRetryable) + + if err != nil { + result.Status = "error" + result.DurationMs = time.Since(start).Milliseconds() + var ee *extractor.ExtractionError + if errors.As(err, &ee) { + result.Error = ee.Code + result.Message = ee.Message + } else { + result.Error = "fetch_failed" + result.Message = err.Error() + } + return result + } + + // Extract article + article, err := extractor.ExtractArticle(pageResult.HTML, pageResult.URL) + if err != nil { + result.Status = "error" + result.Error = "extract_failed" + result.Message = err.Error() + result.FinalURL = pageResult.URL + result.ContentType = pageResult.ContentType + result.DurationMs = time.Since(start).Milliseconds() + return result + } + + // Convert to markdown + includeImages := !noImages + markdown, err := extractor.FormatArticle(article, includeImages) + if err != nil { + result.Status = "error" + result.Error = "format_failed" + result.Message = err.Error() + result.FinalURL = pageResult.URL + result.ContentType = pageResult.ContentType + result.DurationMs = time.Since(start).Milliseconds() + 
return result + } + + title := article.Title + result.Status = "ok" + result.FinalURL = pageResult.URL + result.Title = &title + result.ContentType = pageResult.ContentType + result.Markdown = markdown + result.DurationMs = time.Since(start).Milliseconds() + return result +} + +// isRetryable returns true for transient errors worth retrying. +func isRetryable(err error) bool { + var ee *extractor.ExtractionError + if errors.As(err, &ee) { + // Don't retry content-type or body-size errors + switch ee.Code { + case "unsupported_content_type", "body_too_large": + return false + } + // Retry 5xx errors + if strings.HasPrefix(ee.Code, "http_5") { + return true + } + } + // Retry generic fetch errors (timeouts, connection resets) + return true +} + +func collectURLs(args []string) ([]string, error) { + hasArgs := len(args) > 0 + hasFile := fileFlag != "" + + // Args and --file are explicitly provided and mutually exclusive + if hasArgs && hasFile { + return nil, &UsageError{ + Msg: "cannot use both positional args and --file", + Hint: "ezycopy batch --help", + } + } + + if hasArgs { + // Special case: "ezycopy batch -" means stdin + if len(args) == 1 && args[0] == "-" { + return readURLsFromReader(os.Stdin) + } + return args, nil + } + + if hasFile { + f, err := os.Open(fileFlag) + if err != nil { + return nil, fmt.Errorf("failed to open URL file: %w", err) + } + defer f.Close() + return readURLsFromReader(f) + } + + // No args and no --file: try stdin if it's a pipe + if isPipeInput() { + return readURLsFromReader(os.Stdin) + } + + return nil, nil +} + +func readURLsFromReader(r *os.File) ([]string, error) { + var urls []string + scanner := bufio.NewScanner(r) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line == "" || strings.HasPrefix(line, "#") { + continue + } + urls = append(urls, line) + } + return urls, scanner.Err() +} + +func isPipeInput() bool { + info, err := os.Stdin.Stat() + if err != nil { + return false + } + return 
info.Mode()&os.ModeCharDevice == 0 +} + +func writeResultToFile(result BatchResult) { + if result.Title == nil || *result.Title == "" { + return + } + + // Use output package's resolve path for consistent naming + filePath, err := output.ResolveOutputPath(outputFlag, *result.Title) + if err != nil { + return + } + + if skipExisting { + if _, err := os.Stat(filePath); err == nil { + return // File exists, skip + } + } + + // Handle collisions by appending -2, -3, etc. + filePath = resolveCollision(filePath) + + _ = output.WriteToFile(filePath, result.Markdown) +} + +func resolveCollision(path string) string { + if _, err := os.Stat(path); os.IsNotExist(err) { + return path + } + + ext := filepath.Ext(path) + base := strings.TrimSuffix(path, ext) + + for i := 2; i < 1000; i++ { + candidate := fmt.Sprintf("%s-%d%s", base, i, ext) + if _, err := os.Stat(candidate); os.IsNotExist(err) { + return candidate + } + } + return path +} + +func parseByteSize(s string) (int64, error) { + s = strings.TrimSpace(strings.ToUpper(s)) + if s == "0" { + return 0, nil + } + + // Check longest suffixes first to avoid "MB" matching "B" + suffixes := []struct { + suffix string + mult int64 + }{ + {"GB", 1024 * 1024 * 1024}, + {"MB", 1024 * 1024}, + {"KB", 1024}, + {"B", 1}, + } + + for _, entry := range suffixes { + if strings.HasSuffix(s, entry.suffix) { + numStr := strings.TrimSuffix(s, entry.suffix) + var n int64 + _, err := fmt.Sscanf(numStr, "%d", &n) + if err != nil { + return 0, fmt.Errorf("cannot parse %q", s) + } + return n * entry.mult, nil + } + } + + // Plain number = bytes + var n int64 + _, err := fmt.Sscanf(s, "%d", &n) + return n, err +} + +func uniqueStrings(ss []string) []string { + seen := make(map[string]bool) + var result []string + for _, s := range ss { + if !seen[s] { + seen[s] = true + result = append(result, s) + } + } + return result +} + +// sanitizeFilenameForBatch creates a filesystem-safe filename from a title. 
+var unsafeCharsRe = regexp.MustCompile(`[^a-zA-Z0-9]+`) + +func sanitizeFilenameForBatch(title string) string { + if len(title) > 50 { + title = title[:50] + } + safe := unsafeCharsRe.ReplaceAllString(title, "-") + safe = strings.Trim(safe, "-") + if safe == "" { + safe = "untitled" + } + return safe + ".md" +} + +// batchError carries an exit code without a user-visible message. +type batchError struct { + code int +} + +func (e *batchError) Error() string { + return fmt.Sprintf("batch completed with exit code %d", e.code) +} diff --git a/cmd/batch_test.go b/cmd/batch_test.go new file mode 100644 index 0000000..25b8414 --- /dev/null +++ b/cmd/batch_test.go @@ -0,0 +1,54 @@ +package cmd + +import ( + "testing" +) + +func TestParseByteSize(t *testing.T) { + tests := []struct { + input string + want int64 + err bool + }{ + {"10MB", 10 * 1024 * 1024, false}, + {"1GB", 1024 * 1024 * 1024, false}, + {"512KB", 512 * 1024, false}, + {"100B", 100, false}, + {"0", 0, false}, + {"1024", 1024, false}, + {"invalid", 0, true}, + } + for _, tt := range tests { + got, err := parseByteSize(tt.input) + if tt.err { + if err == nil { + t.Errorf("parseByteSize(%q) expected error", tt.input) + } + continue + } + if err != nil { + t.Errorf("parseByteSize(%q) unexpected error: %v", tt.input, err) + continue + } + if got != tt.want { + t.Errorf("parseByteSize(%q) = %d, want %d", tt.input, got, tt.want) + } + } +} + +func TestUniqueStrings(t *testing.T) { + input := []string{"a", "b", "a", "c", "b"} + got := uniqueStrings(input) + if len(got) != 3 || got[0] != "a" || got[1] != "b" || got[2] != "c" { + t.Errorf("uniqueStrings(%v) = %v, want [a b c]", input, got) + } +} + +func TestResolveCollision(t *testing.T) { + // Non-existent file — should return as-is + path := "/tmp/ezycopy-test-nonexistent-file-abc123.md" + got := resolveCollision(path) + if got != path { + t.Errorf("resolveCollision(%q) = %q, want same path", path, got) + } +} diff --git a/cmd/root.go b/cmd/root.go new file mode 
100644 index 0000000..a663c88 --- /dev/null +++ b/cmd/root.go @@ -0,0 +1,236 @@ +package cmd + +import ( + "context" + "encoding/json" + "fmt" + "net/url" + "os" + "os/signal" + "time" + + "github.com/gupsammy/EzyCopy/extractor" + "github.com/gupsammy/EzyCopy/output" + "github.com/spf13/cobra" +) + +var ( + Version = "2.0.0" + + // Persistent flags (shared across root + subcommands) + outputFlag string + noImages bool + timeout time.Duration + browserFlag bool + browserWS string + typeFlag string + jsonFlag bool + quiet bool + verbose bool + noColor bool + + // Root-only flags + clipboardFlag bool +) + +var rootCmd = &cobra.Command{ + Use: "ezycopy ", + Short: "Extract web content as markdown", + Long: `EzyCopy extracts article content from web pages and converts it to markdown. + +By default, uses fast HTTP fetch. Use --browser for JS-heavy sites (Twitter, SPAs) +or authenticated content (uses your Chrome profile). + +Content is printed to stdout. Use -c to copy to clipboard, -o to save to a file. 
+Use --json for structured JSON output.`, + Args: cobra.ExactArgs(1), + Version: Version, + RunE: runRoot, + + SilenceUsage: true, + SilenceErrors: true, +} + +func init() { + // Persistent flags — available to root and all subcommands + pf := rootCmd.PersistentFlags() + pf.StringVarP(&outputFlag, "output", "o", "", "Save to file (directory auto-generates name)") + pf.BoolVar(&noImages, "no-images", false, "Strip image links from output") + pf.DurationVarP(&timeout, "timeout", "t", 30*time.Second, "Page load timeout") + pf.BoolVar(&browserFlag, "browser", false, "Use Chrome browser (for JS-heavy or authenticated sites)") + pf.StringVar(&browserWS, "browser-ws", "", "Connect to existing Chrome via DevTools WebSocket URL") + pf.StringVar(&typeFlag, "type", "", "Content type hint: article, github") + pf.BoolVar(&jsonFlag, "json", false, "Emit JSON output instead of raw markdown") + pf.BoolVarP(&quiet, "quiet", "q", false, "Suppress progress messages") + pf.BoolVarP(&verbose, "verbose", "v", false, "Debug output to stderr") + pf.BoolVar(&noColor, "no-color", false, "Disable ANSI colors") + + // Root-only flags + rootCmd.Flags().BoolVarP(&clipboardFlag, "clipboard", "c", false, "Copy output to clipboard") + + // browser-ws implies browser + rootCmd.PersistentPreRunE = func(cmd *cobra.Command, args []string) error { + if browserWS != "" { + browserFlag = true + } + // Respect NO_COLOR env + if os.Getenv("NO_COLOR") != "" { + noColor = true + } + return nil + } +} + +// Execute sets up signal handling and runs the root command. 
+func Execute() { + ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt) + defer cancel() + + // Second Ctrl-C forces exit + go func() { + <-ctx.Done() + // Context cancelled by first signal; wait for second + sig := make(chan os.Signal, 1) + signal.Notify(sig, os.Interrupt) + <-sig + os.Exit(130) + }() + + rootCmd.SetContext(ctx) + + if err := rootCmd.Execute(); err != nil { + // Check for batch error with explicit exit code + if be, ok := err.(*batchError); ok { + os.Exit(be.code) + } + exitCode := ExitPartial + if isUsageError(err) { + exitCode = ExitInvalidUsage + } + writeFatalError(err, exitCode) + os.Exit(exitCode) + } +} + +func runRoot(cmd *cobra.Command, args []string) error { + start := time.Now() + inputURL := args[0] + + if _, err := url.ParseRequestURI(inputURL); err != nil { + return &UsageError{Msg: fmt.Sprintf("invalid URL: %s", inputURL)} + } + + // Fetch page + var pageResult *extractor.PageResult + var err error + if browserFlag { + if !quiet { + fmt.Fprintln(os.Stderr, "Fetching page (browser)...") + } + pageResult, err = extractor.FetchPage(inputURL, timeout) + } else { + if !quiet { + fmt.Fprintln(os.Stderr, "Fetching page...") + } + pageResult, err = extractor.FetchPageHTTP(cmd.Context(), inputURL, timeout, 0) + } + if err != nil { + return fmt.Errorf("failed to fetch page: %w", err) + } + + // Extract article + if !quiet { + fmt.Fprintln(os.Stderr, "Extracting content...") + } + article, err := extractor.ExtractArticle(pageResult.HTML, pageResult.URL) + if err != nil { + return fmt.Errorf("failed to extract content: %w", err) + } + + // Convert to markdown + includeImages := !noImages + markdown, err := extractor.FormatArticle(article, includeImages) + if err != nil { + return fmt.Errorf("failed to convert to markdown: %w", err) + } + + durationMs := time.Since(start).Milliseconds() + + // JSON output + if jsonFlag { + result := SingleResult{ + OriginalURL: inputURL, + FinalURL: pageResult.URL, + Title: article.Title, + 
Author: article.Byline, + Type: resolveType(), + ContentType: pageResult.ContentType, + Markdown: markdown, + DurationMs: durationMs, + ExtractedAt: time.Now().UTC().Format(time.RFC3339), + } + enc := json.NewEncoder(os.Stdout) + enc.SetEscapeHTML(false) + return enc.Encode(result) + } + + // Default: raw markdown to stdout + fmt.Println(markdown) + + // Copy to clipboard if requested + if clipboardFlag { + if err := output.CopyToClipboard(markdown); err != nil { + fmt.Fprintf(os.Stderr, "Warning: failed to copy to clipboard: %v\n", err) + } else if !quiet { + fmt.Fprintln(os.Stderr, "Copied to clipboard!") + } + } + + // Save to file if requested + if outputFlag != "" { + filePath, err := output.ResolveOutputPath(outputFlag, article.Title) + if err != nil { + return fmt.Errorf("failed to resolve output path: %w", err) + } + if err := output.WriteToFile(filePath, markdown); err != nil { + return fmt.Errorf("failed to write file: %w", err) + } + if !quiet { + fmt.Fprintf(os.Stderr, "Saved to: %s\n", filePath) + } + } + + return nil +} + +func resolveType() string { + if typeFlag != "" { + return typeFlag + } + return "article" +} + +func writeFatalError(err error, exitCode int) { + fe := FatalError{ + Error: "runtime_error", + Message: err.Error(), + } + if exitCode == ExitInvalidUsage { + fe.Error = "invalid_usage" + fe.Hint = "ezycopy --help" + } + // Only write structured JSON if stderr is not a TTY or noColor is set + if noColor || !isTTY(os.Stderr) { + json.NewEncoder(os.Stderr).Encode(fe) + } else { + fmt.Fprintf(os.Stderr, "Error: %s\n", err.Error()) + } +} + +func isTTY(f *os.File) bool { + info, err := f.Stat() + if err != nil { + return false + } + return info.Mode()&os.ModeCharDevice != 0 +} diff --git a/cmd/types.go b/cmd/types.go new file mode 100644 index 0000000..466516d --- /dev/null +++ b/cmd/types.go @@ -0,0 +1,61 @@ +package cmd + +// Exit codes +const ( + ExitOK = 0 + ExitPartial = 1 // One or more URLs failed (batch: partial; root: extraction 
failed) + ExitInvalidUsage = 2 // Bad flag, missing arg, conflicting input sources + ExitTotalFailure = 3 // All URLs failed (batch only) +) + +// SingleResult is the JSON output for the root command with --json. +type SingleResult struct { + OriginalURL string `json:"original_url"` + FinalURL string `json:"final_url"` + Title string `json:"title"` + Author string `json:"author,omitempty"` + Type string `json:"type"` + ContentType string `json:"content_type"` + Markdown string `json:"markdown"` + DurationMs int64 `json:"duration_ms"` + ExtractedAt string `json:"extracted_at"` +} + +// BatchResult is one line of NDJSON output from the batch subcommand. +type BatchResult struct { + Seq int `json:"seq"` + OriginalURL string `json:"original_url"` + FinalURL string `json:"final_url,omitempty"` + Title *string `json:"title"` // null on error + Type string `json:"type"` + ContentType string `json:"content_type,omitempty"` + Status string `json:"status"` // "ok" or "error" + Error string `json:"error,omitempty"` + Message string `json:"message,omitempty"` + Markdown string `json:"markdown,omitempty"` + DurationMs int64 `json:"duration_ms"` + ExtractedAt string `json:"extracted_at"` +} + +// FatalError is the structured error written to stderr for usage/config errors. +type FatalError struct { + Error string `json:"error"` + Message string `json:"message"` + Hint string `json:"hint,omitempty"` +} + +// UsageError indicates invalid CLI usage (exit code 2). 
+type UsageError struct { + Msg string + Hint string +} + +func (e *UsageError) Error() string { + return e.Msg +} + +func isUsageError(err error) bool { + _, ok := err.(*UsageError) + return ok +} + diff --git a/extractor/browser.go b/extractor/browser.go index ef51f29..b7949bb 100644 --- a/extractor/browser.go +++ b/extractor/browser.go @@ -13,8 +13,9 @@ import ( // PageResult contains the fetched page data type PageResult struct { - HTML string - URL string // Final URL after redirects + HTML string + URL string // Final URL after redirects + ContentType string // HTTP Content-Type header } // getDefaultChromeProfile returns the path to Chrome's default user data directory diff --git a/extractor/http.go b/extractor/http.go index bf3be0d..060ffe3 100644 --- a/extractor/http.go +++ b/extractor/http.go @@ -1,24 +1,34 @@ package extractor import ( + "context" "fmt" "io" "net/http" + "strings" "time" ) -// FetchPageHTTP fetches a page using simple HTTP (no JavaScript execution) -func FetchPageHTTP(url string, timeout time.Duration) (*PageResult, error) { +// supportedContentTypes lists MIME types we can meaningfully extract. +var supportedContentTypes = []string{ + "text/html", + "application/xhtml+xml", + "text/plain", + "text/xml", +} + +// FetchPageHTTP fetches a page using simple HTTP (no JavaScript execution). +// Pass maxBodySize=0 to disable the body size limit. 
+func FetchPageHTTP(ctx context.Context, url string, timeout time.Duration, maxBodySize int64) (*PageResult, error) { client := &http.Client{ Timeout: timeout, } - req, err := http.NewRequest("GET", url, nil) + req, err := http.NewRequestWithContext(ctx, "GET", url, nil) if err != nil { return nil, fmt.Errorf("failed to create request: %w", err) } - // Set User-Agent to avoid bot blocks req.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") @@ -29,16 +39,63 @@ func FetchPageHTTP(url string, timeout time.Duration) (*PageResult, error) { defer resp.Body.Close() if resp.StatusCode != http.StatusOK { - return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status) + return nil, &ExtractionError{ + Code: fmt.Sprintf("http_%d", resp.StatusCode), + Message: fmt.Sprintf("HTTP %d: %s", resp.StatusCode, resp.Status), + } + } + + // Content-Type pre-check + ct := resp.Header.Get("Content-Type") + if ct != "" && !isHTMLContentType(ct) { + return nil, &ExtractionError{ + Code: "unsupported_content_type", + Message: fmt.Sprintf("Content-Type %s is not supported", ct), + } } - body, err := io.ReadAll(resp.Body) + // Read body with optional size limit + var reader io.Reader = resp.Body + if maxBodySize > 0 { + reader = io.LimitReader(resp.Body, maxBodySize+1) + } + + body, err := io.ReadAll(reader) if err != nil { return nil, fmt.Errorf("failed to read response: %w", err) } + if maxBodySize > 0 && int64(len(body)) > maxBodySize { + return nil, &ExtractionError{ + Code: "body_too_large", + Message: fmt.Sprintf("response body exceeds %d bytes", maxBodySize), + } + } + return &PageResult{ - HTML: string(body), - URL: resp.Request.URL.String(), // Final URL after redirects + HTML: string(body), + URL: resp.Request.URL.String(), + ContentType: ct, }, nil } + +// ExtractionError wraps per-URL 
errors with a machine-readable code. +type ExtractionError struct { + Code string + Message string +} + +func (e *ExtractionError) Error() string { + return fmt.Sprintf("%s: %s", e.Code, e.Message) +} + +// isHTMLContentType checks if the Content-Type is one we can extract from. +func isHTMLContentType(ct string) bool { + ct = strings.ToLower(ct) + for _, supported := range supportedContentTypes { + if strings.HasPrefix(ct, supported) { + return true + } + } + return false +} diff --git a/extractor/http_test.go b/extractor/http_test.go new file mode 100644 index 0000000..2aef1f6 --- /dev/null +++ b/extractor/http_test.go @@ -0,0 +1,150 @@ +package extractor + +import ( + "context" + "errors" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" +) + +func TestFetchPageHTTP_Success(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + w.Write([]byte("Hello")) + })) + defer srv.Close() + + result, err := FetchPageHTTP(context.Background(), srv.URL, 5*time.Second, 0) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if !strings.Contains(result.HTML, "Hello") { + t.Errorf("expected HTML to contain 'Hello', got %q", result.HTML) + } + if result.ContentType != "text/html" { + t.Errorf("expected content-type 'text/html', got %q", result.ContentType) + } +} + +func TestFetchPageHTTP_RejectsNonHTML(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/pdf") + w.Write([]byte("%PDF-1.4")) + })) + defer srv.Close() + + _, err := FetchPageHTTP(context.Background(), srv.URL, 5*time.Second, 0) + if err == nil { + t.Fatal("expected error for non-HTML content type") + } + + var ee *ExtractionError + if !errors.As(err, &ee) { + t.Fatalf("expected ExtractionError, got %T: %v", err, err) + } + if ee.Code != "unsupported_content_type" { + t.Errorf("expected 
code 'unsupported_content_type', got %q", ee.Code) + } +} + +func TestFetchPageHTTP_BodySizeLimit(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + w.Write([]byte(strings.Repeat("x", 1000))) + })) + defer srv.Close() + + _, err := FetchPageHTTP(context.Background(), srv.URL, 5*time.Second, 500) + if err == nil { + t.Fatal("expected error for body exceeding limit") + } + + var ee *ExtractionError + if !errors.As(err, &ee) { + t.Fatalf("expected ExtractionError, got %T: %v", err, err) + } + if ee.Code != "body_too_large" { + t.Errorf("expected code 'body_too_large', got %q", ee.Code) + } +} + +func TestFetchPageHTTP_HTTPError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer srv.Close() + + _, err := FetchPageHTTP(context.Background(), srv.URL, 5*time.Second, 0) + if err == nil { + t.Fatal("expected error for 500 response") + } + + var ee *ExtractionError + if !errors.As(err, &ee) { + t.Fatalf("expected ExtractionError, got %T: %v", err, err) + } + if ee.Code != "http_500" { + t.Errorf("expected code 'http_500', got %q", ee.Code) + } +} + +func TestFetchPageHTTP_ContextCancellation(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + time.Sleep(5 * time.Second) + })) + defer srv.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond) + defer cancel() + + _, err := FetchPageHTTP(ctx, srv.URL, 10*time.Second, 0) + if err == nil { + t.Fatal("expected error from cancelled context") + } +} + +func TestFetchPageHTTP_FollowsRedirects(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path == "/" { + http.Redirect(w, r, "/final", http.StatusMovedPermanently) + return + } + 
w.Header().Set("Content-Type", "text/html") + w.Write([]byte("Final")) + })) + defer srv.Close() + + result, err := FetchPageHTTP(context.Background(), srv.URL, 5*time.Second, 0) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if !strings.HasSuffix(result.URL, "/final") { + t.Errorf("expected final URL to end with '/final', got %q", result.URL) + } +} + +func TestIsHTMLContentType(t *testing.T) { + tests := []struct { + ct string + want bool + }{ + {"text/html", true}, + {"text/html; charset=utf-8", true}, + {"application/xhtml+xml", true}, + {"text/plain", true}, + {"text/xml", true}, + {"application/pdf", false}, + {"image/png", false}, + {"application/json", false}, + {"", false}, + } + for _, tt := range tests { + got := isHTMLContentType(tt.ct) + if got != tt.want { + t.Errorf("isHTMLContentType(%q) = %v, want %v", tt.ct, got, tt.want) + } + } +} diff --git a/extractor/pool.go b/extractor/pool.go new file mode 100644 index 0000000..b696f88 --- /dev/null +++ b/extractor/pool.go @@ -0,0 +1,102 @@ +package extractor + +import ( + "context" + "fmt" + "time" + + "github.com/go-rod/rod" + "github.com/go-rod/rod/lib/launcher" + "github.com/go-rod/rod/lib/proto" +) + +// BrowserPool manages a shared Chrome instance for batch extraction. +// Unlike FetchPage (root command), this uses an ephemeral profile — +// no user data, safer for concurrent tab access. +type BrowserPool struct { + browser *rod.Browser + launcher *launcher.Launcher // nil when using external Chrome via --browser-ws +} + +// NewBrowserPool creates a browser pool. If wsURL is non-empty, it connects +// to an existing Chrome instance. Otherwise, it launches an ephemeral headless Chrome. 
+func NewBrowserPool(wsURL string) (*BrowserPool, error) { + pool := &BrowserPool{} + + if wsURL != "" { + browser := rod.New().ControlURL(wsURL) + if err := browser.Connect(); err != nil { + return nil, fmt.Errorf("failed to connect to Chrome at %s: %w", wsURL, err) + } + pool.browser = browser + } else { + l := launcher.New().Headless(true).Set("disable-gpu").Set("no-sandbox") + controlURL, err := l.Launch() + if err != nil { + return nil, fmt.Errorf("failed to launch Chrome: %w", err) + } + pool.launcher = l + + browser := rod.New().ControlURL(controlURL) + if err := browser.Connect(); err != nil { + l.Cleanup() + return nil, fmt.Errorf("failed to connect to Chrome: %w", err) + } + pool.browser = browser + } + + return pool, nil +} + +// FetchPage opens a new tab, navigates to the URL, extracts HTML, and closes the tab. +// Safe for concurrent use — each call gets its own tab. +func (bp *BrowserPool) FetchPage(ctx context.Context, url string, timeout time.Duration) (*PageResult, error) { + page, err := bp.browser.Page(proto.TargetCreateTarget{URL: "about:blank"}) + if err != nil { + return nil, fmt.Errorf("failed to create tab: %w", err) + } + defer func() { + _ = page.Close() + }() + + page = page.Context(ctx).Timeout(timeout) + + if err := page.Navigate(url); err != nil { + return nil, fmt.Errorf("failed to navigate to %s: %w", url, err) + } + + if err := page.WaitLoad(); err != nil { + return nil, fmt.Errorf("page load timeout: %w", err) + } + + // WaitStable replaces the old time.Sleep(2s) — waits for DOM to stabilize + if err := page.WaitStable(300 * time.Millisecond); err != nil { + // Non-fatal: page may be interactive enough + } + + info, err := page.Info() + if err != nil { + return nil, fmt.Errorf("failed to get page info: %w", err) + } + + html, err := page.HTML() + if err != nil { + return nil, fmt.Errorf("failed to extract HTML: %w", err) + } + + return &PageResult{ + HTML: html, + URL: info.URL, + ContentType: "text/html", + }, nil +} + +// Close 
shuts down the browser. Only call after all FetchPage calls have returned. +func (bp *BrowserPool) Close() { + if bp.browser != nil { + _ = bp.browser.Close() + } + if bp.launcher != nil { + bp.launcher.Cleanup() + } +} diff --git a/go.mod b/go.mod index 597fe8e..eae3d58 100644 --- a/go.mod +++ b/go.mod @@ -24,5 +24,6 @@ require ( github.com/ysmood/gson v0.7.3 // indirect github.com/ysmood/leakless v0.9.0 // indirect golang.org/x/net v0.47.0 // indirect + golang.org/x/sync v0.19.0 // indirect golang.org/x/text v0.31.0 // indirect ) diff --git a/go.sum b/go.sum index f4689f8..6838fad 100644 --- a/go.sum +++ b/go.sum @@ -87,6 +87,8 @@ golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= +golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= diff --git a/main.go b/main.go index ff79a5f..66bc373 100644 --- a/main.go +++ b/main.go @@ -1,113 +1,7 @@ package main -import ( - "fmt" - "net/url" - "os" - "time" - - "github.com/gupsammy/EzyCopy/extractor" - "github.com/gupsammy/EzyCopy/output" - "github.com/spf13/cobra" -) - -var ( - version = "0.4.0" - - outputFlag string - noImages bool - timeout time.Duration - browserFlag bool - clipboardFlag bool -) +import "github.com/gupsammy/EzyCopy/cmd" func main() { - rootCmd := &cobra.Command{ - Use: "ezycopy ", - Short: "Extract web content as 
markdown", - Long: `EzyCopy extracts article content from web pages and converts it to markdown. - -By default, uses fast HTTP fetch. Use --browser for JS-heavy sites (Twitter, SPAs) -or authenticated content (uses your Chrome profile). - -Content is printed to stdout. Use -c to copy to clipboard, -o to save to a file.`, - Args: cobra.ExactArgs(1), - Version: version, - RunE: run, - } - - rootCmd.Flags().StringVarP(&outputFlag, "output", "o", "", "Save to file (directory auto-generates name)") - rootCmd.Flags().BoolVar(&noImages, "no-images", false, "Strip image links from output") - rootCmd.Flags().DurationVarP(&timeout, "timeout", "t", 30*time.Second, "Page load timeout") - rootCmd.Flags().BoolVar(&browserFlag, "browser", false, "Use Chrome browser (for JS-heavy or authenticated sites)") - rootCmd.Flags().BoolVarP(&clipboardFlag, "clipboard", "c", false, "Copy output to clipboard") - - if err := rootCmd.Execute(); err != nil { - os.Exit(2) - } -} - -func run(cmd *cobra.Command, args []string) error { - inputURL := args[0] - - // Validate URL - if _, err := url.ParseRequestURI(inputURL); err != nil { - return fmt.Errorf("invalid URL: %s", inputURL) - } - - // Fetch page - var pageResult *extractor.PageResult - var err error - if browserFlag { - fmt.Fprintln(os.Stderr, "Fetching page (browser)...") - pageResult, err = extractor.FetchPage(inputURL, timeout) - } else { - fmt.Fprintln(os.Stderr, "Fetching page...") - pageResult, err = extractor.FetchPageHTTP(inputURL, timeout) - } - if err != nil { - return fmt.Errorf("failed to fetch page: %w", err) - } - - // Extract article - fmt.Fprintln(os.Stderr, "Extracting content...") - article, err := extractor.ExtractArticle(pageResult.HTML, pageResult.URL) - if err != nil { - return fmt.Errorf("failed to extract content: %w", err) - } - - // Convert to markdown - includeImages := !noImages - markdown, err := extractor.FormatArticle(article, includeImages) - if err != nil { - return fmt.Errorf("failed to convert to 
markdown: %w", err) - } - - // Output to stdout - fmt.Println(markdown) - - // Copy to clipboard if requested - if clipboardFlag { - if err := output.CopyToClipboard(markdown); err != nil { - fmt.Fprintf(os.Stderr, "Warning: failed to copy to clipboard: %v\n", err) - } else { - fmt.Fprintln(os.Stderr, "Copied to clipboard!") - } - } - - // Save to file if requested - if outputFlag != "" { - filePath, err := output.ResolveOutputPath(outputFlag, article.Title) - if err != nil { - return fmt.Errorf("failed to resolve output path: %w", err) - } - - if err := output.WriteToFile(filePath, markdown); err != nil { - return fmt.Errorf("failed to write file: %w", err) - } - - fmt.Fprintf(os.Stderr, "Saved to: %s\n", filePath) - } - - return nil + cmd.Execute() } From e0ffbd03758be8094b33a5dcfb1f637ec178c0fe Mon Sep 17 00:00:00 2001 From: gupsammy Date: Wed, 4 Mar 2026 18:46:23 +0530 Subject: [PATCH 2/5] =?UTF-8?q?fix:=20address=20PR=20review=20=E2=80=94=20?= =?UTF-8?q?browser-ws,=20fail-fast,=20retries=20validation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Root command now uses BrowserPool when --browser-ws is provided instead of always launching local Chrome profile - Batch --fail-fast stops scheduling new jobs once context is cancelled - Negative --retries value is rejected with a clean usage error Co-Authored-By: Claude Opus 4.6 --- cmd/batch.go | 10 ++++++++++ cmd/root.go | 12 +++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/cmd/batch.go b/cmd/batch.go index 61182ad..392a23f 100644 --- a/cmd/batch.go +++ b/cmd/batch.go @@ -77,6 +77,11 @@ func runBatch(cmd *cobra.Command, args []string) error { return &UsageError{Msg: fmt.Sprintf("invalid --max-body-size: %v", err)} } + // Validate retries + if retries < 0 { + return &UsageError{Msg: "--retries must be non-negative"} + } + // Setup concurrency primitives limiter := batch.NewDomainLimiter(rateLimit) retryCfg := &batch.RetryConfig{ @@ -132,6 
+137,11 @@ func runBatch(cmd *cobra.Command, args []string) error { seq := i rawURL := u + // With --fail-fast, stop scheduling new jobs once context is cancelled + if failFast && gctx.Err() != nil { + break + } + g.Go(func() error { result := fetchAndExtract(gctx, seq, rawURL, limiter, retryCfg, pool, bodyLimit) results <- result diff --git a/cmd/root.go b/cmd/root.go index a663c88..10fe6de 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -127,7 +127,17 @@ func runRoot(cmd *cobra.Command, args []string) error { if !quiet { fmt.Fprintln(os.Stderr, "Fetching page (browser)...") } - pageResult, err = extractor.FetchPage(inputURL, timeout) + if browserWS != "" { + // Use BrowserPool to connect to existing Chrome via DevTools WebSocket + pool, poolErr := extractor.NewBrowserPool(browserWS) + if poolErr != nil { + return fmt.Errorf("failed to connect to browser: %w", poolErr) + } + defer pool.Close() + pageResult, err = pool.FetchPage(cmd.Context(), inputURL, timeout) + } else { + pageResult, err = extractor.FetchPage(inputURL, timeout) + } } else { if !quiet { fmt.Fprintln(os.Stderr, "Fetching page...") From 2c830ea8ae097a14852d8354941afb701f7fea80 Mon Sep 17 00:00:00 2001 From: gupsammy Date: Wed, 4 Mar 2026 19:13:43 +0530 Subject: [PATCH 3/5] fix(ci): add allowed tools for code-review plugin to post comments The code-review plugin needs explicit --allowedTools for the inline comment MCP tool and gh CLI commands. Without this, all tool calls were denied (37 permission denials) and no review comments posted. Also added --comment flag to the prompt so the plugin actually posts its findings rather than just printing to the action log. 
Co-Authored-By: Claude Opus 4.6 --- .github/workflows/claude-code-review.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/claude-code-review.yml b/.github/workflows/claude-code-review.yml index 21cb993..75cc2a1 100644 --- a/.github/workflows/claude-code-review.yml +++ b/.github/workflows/claude-code-review.yml @@ -38,7 +38,7 @@ jobs: claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} plugin_marketplaces: 'https://github.com/anthropics/claude-code.git' plugins: 'code-review@claude-code-plugins' - prompt: '/code-review:code-review ${{ github.repository }}/pull/${{ github.event.pull_request.number }}' - # See https://github.com/anthropics/claude-code-action/blob/main/docs/usage.md - # or https://code.claude.com/docs/en/cli-reference for available options + prompt: '/code-review:code-review --comment ${{ github.repository }}/pull/${{ github.event.pull_request.number }}' + claude_args: | + --allowedTools "mcp__github_inline_comment__create_inline_comment,Bash(gh issue view:*),Bash(gh search:*),Bash(gh issue list:*),Bash(gh pr comment:*),Bash(gh pr diff:*),Bash(gh pr view:*),Bash(gh pr list:*)" From 615d7ab807e9da74de40b24159584bfaf22faa5d Mon Sep 17 00:00:00 2001 From: gupsammy Date: Wed, 4 Mar 2026 19:46:43 +0530 Subject: [PATCH 4/5] =?UTF-8?q?fix(batch):=20address=20code=20review=20?= =?UTF-8?q?=E2=80=94=20extractDomain=20and=20file=20write=20race?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - extractDomain: return rawURL when hostname is empty (scheme-less URLs were all bucketed under "" in the rate limiter) - Move writeResultToFile into serialized writer goroutine to eliminate TOCTOU race on resolveCollision from concurrent workers - Add junk/ to .gitignore Co-Authored-By: Claude Opus 4.6 --- .gitignore | 3 +++ batch/ratelimit.go | 2 +- batch/ratelimit_test.go | 3 ++- cmd/batch.go | 9 ++++----- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git 
a/.gitignore b/.gitignore index 902ee36..6f39f1d 100644 --- a/.gitignore +++ b/.gitignore @@ -59,3 +59,6 @@ videos/ # Agent counselors output agents/ + +# Junk / temp files +junk/ diff --git a/batch/ratelimit.go b/batch/ratelimit.go index 97d9dc3..1a88a93 100644 --- a/batch/ratelimit.go +++ b/batch/ratelimit.go @@ -55,7 +55,7 @@ func (d *DomainLimiter) Wait(ctx context.Context, rawURL string) error { func extractDomain(rawURL string) string { u, err := url.Parse(rawURL) - if err != nil { + if err != nil || u.Hostname() == "" { return rawURL } return u.Hostname() diff --git a/batch/ratelimit_test.go b/batch/ratelimit_test.go index ed190ec..5973ba7 100644 --- a/batch/ratelimit_test.go +++ b/batch/ratelimit_test.go @@ -99,7 +99,8 @@ func TestExtractDomain(t *testing.T) { }{ {"https://example.com/path", "example.com"}, {"http://sub.example.com:8080/page", "sub.example.com"}, - {"not-a-url", ""}, + {"not-a-url", "not-a-url"}, + {"example.com/page", "example.com/page"}, } for _, tt := range tests { got := extractDomain(tt.url) diff --git a/cmd/batch.go b/cmd/batch.go index 392a23f..745651d 100644 --- a/cmd/batch.go +++ b/cmd/batch.go @@ -122,6 +122,10 @@ func runBatch(cmd *cobra.Command, args []string) error { enc.Encode(r) if r.Status == "ok" { okCount++ + // Write file in serialized writer goroutine to avoid TOCTOU race + if outputFlag != "" && r.Title != nil { + writeResultToFile(r) + } } else { errCount++ errCodes = append(errCodes, r.Error) @@ -146,11 +150,6 @@ func runBatch(cmd *cobra.Command, args []string) error { result := fetchAndExtract(gctx, seq, rawURL, limiter, retryCfg, pool, bodyLimit) results <- result - // Write to file if output dir is set and extraction succeeded - if outputFlag != "" && result.Status == "ok" && result.Title != nil { - writeResultToFile(result) - } - if !quiet { status := "ok" if result.Status == "error" { From fa15444ebde85bb6354b0fdf77925df77e0b1f73 Mon Sep 17 00:00:00 2001 From: gupsammy Date: Thu, 5 Mar 2026 11:09:47 +0530 
Subject: [PATCH 5/5] fix(batch): don't retry 4xx HTTP errors in isRetryable ExtractionError with http_4xx codes fell through the errors.As block and hit the outer `return true`, causing 404/403/401 to be retried with exponential backoff. Added `return false` at end of the errors.As block so only explicit 5xx and generic non-HTTP errors are retried. Co-Authored-By: Claude Sonnet 4.6 --- cmd/batch.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmd/batch.go b/cmd/batch.go index 745651d..5c458c7 100644 --- a/cmd/batch.go +++ b/cmd/batch.go @@ -278,6 +278,8 @@ func isRetryable(err error) bool { if strings.HasPrefix(ee.Code, "http_5") { return true } + // Don't retry other known HTTP errors (4xx, etc.) + return false } // Retry generic fetch errors (timeouts, connection resets) return true