From 240968931ded3e3ede3a618e115caae6106d4a1e Mon Sep 17 00:00:00 2001
From: gupsammy <samarthgupta1911@gmail.com>
Date: Wed, 4 Mar 2026 18:06:22 +0530
Subject: [PATCH 1/5] =?UTF-8?q?feat(cli):=20v2.0=20=E2=80=94=20batch=20mod?=
 =?UTF-8?q?e,=20NDJSON=20output,=20retry,=20rate=20limiting?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add `ezycopy batch` subcommand for multi-URL extraction pipelines:
- Concurrent worker pool with configurable parallelism (--concurrency)
- Per-domain rate limiting to avoid overwhelming servers
- Exponential backoff retry for transient failures
- NDJSON streaming output with inline errors
- Content-Type pre-check to skip non-HTML (PDFs, images)
- HTTP response body size limit (--max-body-size)
- Browser pool: single Chrome instance reused across batch URLs
- Signal handling: graceful shutdown on first Ctrl-C, forced exit on second
- Structured JSON error output on stderr
- --json flag for root command structured output
- --skip-existing for batch resume after partial failure
- 19 unit tests covering rate limiter, retry, HTTP hardening

Restructures codebase: main.go -> cmd/ package (Cobra best practice).
Root command is fully backward-compatible with v0.4.0.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .gitignore              |   4 +
 batch/ratelimit.go      |  62 ++++++
 batch/ratelimit_test.go | 110 ++++++++++
 batch/retry.go          |  41 ++++
 batch/retry_test.go     |  98 +++++++++
 cmd/batch.go            | 444 ++++++++++++++++++++++++++++++++++++++++
 cmd/batch_test.go       |  54 +++++
 cmd/root.go             | 236 +++++++++++++++++++++
 cmd/types.go            |  61 ++++++
 extractor/browser.go    |   5 +-
 extractor/http.go       |  73 ++++++-
 extractor/http_test.go  | 150 ++++++++++++++
 extractor/pool.go       | 102 +++++++++
 go.mod                  |   1 +
 go.sum                  |   2 +
 main.go                 | 110 +---------
 16 files changed, 1435 insertions(+), 118 deletions(-)
 create mode 100644 batch/ratelimit.go
 create mode 100644 batch/ratelimit_test.go
 create mode 100644 batch/retry.go
 create mode 100644 batch/retry_test.go
 create mode 100644 cmd/batch.go
 create mode 100644 cmd/batch_test.go
 create mode 100644 cmd/root.go
 create mode 100644 cmd/types.go
 create mode 100644 extractor/http_test.go
 create mode 100644 extractor/pool.go

diff --git a/.gitignore b/.gitignore
index 9665a8f..902ee36 100644
--- a/.gitignore
+++ b/.gitignore
@@ -50,8 +50,12 @@ debug
 .env.local
 
 # Extension artifacts
+*.zip
 extensions/**/*.zip
 extensions/**/assets/
 
 # Video project
 videos/
+
+# Agent counselors output
+agents/
diff --git a/batch/ratelimit.go b/batch/ratelimit.go
new file mode 100644
index 0000000..97d9dc3
--- /dev/null
+++ b/batch/ratelimit.go
@@ -0,0 +1,62 @@
+package batch
+
+import (
+	"context"
+	"net/url"
+	"sync"
+	"time"
+)
+
+// DomainLimiter enforces a minimum delay between requests to the same domain.
+type DomainLimiter struct {
+	mu       sync.Mutex
+	minDelay time.Duration
+	last     map[string]time.Time
+}
+
+// NewDomainLimiter creates a rate limiter with the given minimum delay per domain.
+func NewDomainLimiter(minDelay time.Duration) *DomainLimiter {
+	return &DomainLimiter{
+		minDelay: minDelay,
+		last:     make(map[string]time.Time),
+	}
+}
+
+// Wait blocks until it's safe to make a request to the given URL's domain.
+// Each caller claims the next free slot (previous slot + minDelay) under the
+// lock and then sleeps until it unlocked, so same-domain callers are serialized.
+// It returns ctx.Err() if ctx is done first; the claimed slot stays reserved.
+func (d *DomainLimiter) Wait(ctx context.Context, rawURL string) error {
+	domain := extractDomain(rawURL)
+
+	d.mu.Lock()
+	now := time.Now()
+	slot := now // first request to this domain, or minDelay has already elapsed
+	if prev, seen := d.last[domain]; seen && now.Sub(prev) < d.minDelay {
+		slot = prev.Add(d.minDelay)
+	}
+	d.last[domain] = slot
+	d.mu.Unlock()
+
+	if !slot.After(now) {
+		return nil
+	}
+	// Unlike time.After, an explicit timer can be stopped when ctx wins, so a
+	// cancelled wait does not leave a pending timer behind for up to minDelay.
+	timer := time.NewTimer(slot.Sub(now))
+	defer timer.Stop()
+	select {
+	case <-timer.C:
+		return nil
+	case <-ctx.Done():
+		return ctx.Err()
+	}
+}
+
+// extractDomain yields the URL's hostname (no port), or rawURL if unparseable.
+func extractDomain(rawURL string) string {
+	if parsed, err := url.Parse(rawURL); err == nil {
+		return parsed.Hostname()
+	}
+	return rawURL
+}
diff --git a/batch/ratelimit_test.go b/batch/ratelimit_test.go
new file mode 100644
index 0000000..ed190ec
--- /dev/null
+++ b/batch/ratelimit_test.go
@@ -0,0 +1,110 @@
+package batch
+
+import (
+	"context"
+	"sync"
+	"testing"
+	"time"
+)
+
+func TestDomainLimiter_EnforcesDelay(t *testing.T) {
+	limiter := NewDomainLimiter(100 * time.Millisecond)
+	ctx := context.Background()
+
+	start := time.Now()
+
+	// First request should not wait
+	if err := limiter.Wait(ctx, "https://example.com/a"); err != nil {
+		t.Fatal(err)
+	}
+
+	// Second request to same domain should wait ~100ms
+	if err := limiter.Wait(ctx, "https://example.com/b"); err != nil {
+		t.Fatal(err)
+	}
+
+	elapsed := time.Since(start)
+	if elapsed < 90*time.Millisecond {
+		t.Errorf("expected >= 100ms delay, got %v", elapsed)
+	}
+}
+
+func TestDomainLimiter_DifferentDomains(t *testing.T) {
+	limiter := NewDomainLimiter(200 * time.Millisecond)
+	ctx := context.Background()
+
+	start := time.Now()
+
+	// Two different domains should not block each other
+	if err := limiter.Wait(ctx, "https://a.com/page"); err != nil {
+		t.Fatal(err)
+	}
+	if err := limiter.Wait(ctx, "https://b.com/page"); err != nil {
+		t.Fatal(err)
+	}
+
+	elapsed := time.Since(start)
+	if elapsed > 50*time.Millisecond {
+		t.Errorf("different domains should not wait, got %v", elapsed)
+	}
+}
+
+func TestDomainLimiter_ContextCancellation(t *testing.T) {
+	limiter := NewDomainLimiter(5 * time.Second)
+	ctx, cancel := context.WithCancel(context.Background())
+
+	// First request to establish the domain
+	if err := limiter.Wait(ctx, "https://example.com/a"); err != nil {
+		t.Fatal(err)
+	}
+
+	// Cancel before second request completes
+	go func() {
+		time.Sleep(50 * time.Millisecond)
+		cancel()
+	}()
+
+	err := limiter.Wait(ctx, "https://example.com/b")
+	if err == nil {
+		t.Error("expected context cancellation error")
+	}
+}
+
+func TestDomainLimiter_ConcurrentSameDomain(t *testing.T) {
+	limiter := NewDomainLimiter(50 * time.Millisecond)
+	ctx := context.Background()
+
+	start := time.Now()
+	var wg sync.WaitGroup
+	for i := 0; i < 3; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			limiter.Wait(ctx, "https://same.com/page")
+		}()
+	}
+	wg.Wait()
+
+	elapsed := time.Since(start)
+	// 3 requests with 50ms delay = at least ~100ms (first is free, second waits, third waits)
+	if elapsed < 80*time.Millisecond {
+		t.Errorf("expected >= 100ms for 3 concurrent same-domain requests, got %v", elapsed)
+	}
+}
+
+func TestExtractDomain(t *testing.T) {
+	tests := []struct {
+		url    string
+		domain string
+	}{
+		{"https://example.com/path", "example.com"},
+		{"http://sub.example.com:8080/page", "sub.example.com"},
+		{"not-a-url", ""},
+	}
+	for _, tt := range tests {
+		got := extractDomain(tt.url)
+		if got != tt.domain {
+			t.Errorf("extractDomain(%q) = %q, want %q", tt.url, got, tt.domain)
+		}
+	}
+}
diff --git a/batch/retry.go b/batch/retry.go
new file mode 100644
index 0000000..0fe2177
--- /dev/null
+++ b/batch/retry.go
@@ -0,0 +1,41 @@
+package batch
+
+import (
+	"context"
+	"time"
+)
+
+// RetryConfig controls exponential backoff retry behavior.
+type RetryConfig struct {
+	MaxAttempts int           // Total attempts (1 = no retry; values < 1 are treated as 1)
+	InitDelay   time.Duration // Delay before first retry; doubles each attempt
+}
+
+// Do executes fn with retries on error. It uses exponential backoff and
+// respects context cancellation. Only retries if shouldRetry returns true.
+// fn always runs at least once (MaxAttempts < 1 counts as 1), so a nil result
+// always means fn succeeded; cancellation during backoff returns ctx.Err().
+func (rc *RetryConfig) Do(ctx context.Context, fn func() error, shouldRetry func(error) bool) error {
+	attempts := rc.MaxAttempts
+	if attempts < 1 {
+		attempts = 1
+	}
+	delay := rc.InitDelay
+
+	for attempt := 1; ; attempt++ {
+		err := fn()
+		if err == nil || attempt >= attempts || !shouldRetry(err) {
+			return err
+		}
+
+		// Back off before the next attempt; stop the timer if ctx wins.
+		timer := time.NewTimer(delay)
+		select {
+		case <-timer.C:
+			delay *= 2
+		case <-ctx.Done():
+			timer.Stop()
+			return ctx.Err()
+		}
+	}
+}
diff --git a/batch/retry_test.go b/batch/retry_test.go
new file mode 100644
index 0000000..52975a9
--- /dev/null
+++ b/batch/retry_test.go
@@ -0,0 +1,98 @@
+package batch
+
+import (
+	"context"
+	"errors"
+	"testing"
+	"time"
+)
+
+func TestRetryConfig_NoRetryOnSuccess(t *testing.T) {
+	rc := &RetryConfig{MaxAttempts: 3, InitDelay: 10 * time.Millisecond}
+	calls := 0
+
+	err := rc.Do(context.Background(), func() error {
+		calls++
+		return nil
+	}, func(error) bool { return true })
+
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if calls != 1 {
+		t.Errorf("expected 1 call, got %d", calls)
+	}
+}
+
+func TestRetryConfig_RetriesOnTransientError(t *testing.T) {
+	rc := &RetryConfig{MaxAttempts: 3, InitDelay: 10 * time.Millisecond}
+	calls := 0
+
+	err := rc.Do(context.Background(), func() error {
+		calls++
+		if calls < 3 {
+			return errors.New("transient")
+		}
+		return nil
+	}, func(error) bool { return true })
+
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if calls != 3 {
+		t.Errorf("expected 3 calls, got %d", calls)
+	}
+}
+
+func TestRetryConfig_StopsOnNonRetryable(t *testing.T) {
+	rc := &RetryConfig{MaxAttempts: 5, InitDelay: 10 * time.Millisecond}
+	calls := 0
+	permanent := errors.New("permanent")
+
+	err := rc.Do(context.Background(), func() error {
+		calls++
+		return permanent
+	}, func(err error) bool { return err.Error() != "permanent" })
+
+	if err != permanent {
+		t.Fatalf("expected permanent error, got %v", err)
+	}
+	if calls != 1 {
+		t.Errorf("expected 1 call (no retry), got %d", calls)
+	}
+}
+
+func TestRetryConfig_ExponentialBackoff(t *testing.T) {
+	rc := &RetryConfig{MaxAttempts: 3, InitDelay: 50 * time.Millisecond}
+	calls := 0
+
+	start := time.Now()
+	err := rc.Do(context.Background(), func() error {
+		calls++
+		return errors.New("fail")
+	}, func(error) bool { return true })
+	elapsed := time.Since(start)
+
+	// All 3 attempts fail, with backoff sleeps of 50ms + 100ms = 150ms minimum
+	if err == nil || calls != 3 || elapsed < 130*time.Millisecond {
+		t.Errorf("expected error after 3 calls and >= 150ms backoff, got err=%v calls=%d elapsed=%v", err, calls, elapsed)
+	}
+}
+
+func TestRetryConfig_ContextCancellation(t *testing.T) {
+	rc := &RetryConfig{MaxAttempts: 10, InitDelay: 5 * time.Second}
+	ctx, cancel := context.WithCancel(context.Background())
+
+	go func() {
+		time.Sleep(50 * time.Millisecond)
+		cancel()
+	}()
+
+	err := rc.Do(ctx, func() error {
+		return errors.New("fail")
+	}, func(error) bool { return true })
+
+	if err == nil {
+		t.Error("expected error from cancelled context")
+	}
+}
diff --git a/cmd/batch.go b/cmd/batch.go
new file mode 100644
index 0000000..61182ad
--- /dev/null
+++ b/cmd/batch.go
@@ -0,0 +1,444 @@
+package cmd
+
+import (
+	"bufio"
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"regexp"
+	"strings"
+	"sync"
+	"time"
+
+	"github.com/gupsammy/EzyCopy/batch"
+	"github.com/gupsammy/EzyCopy/extractor"
+	"github.com/gupsammy/EzyCopy/output"
+	"github.com/spf13/cobra"
+	"golang.org/x/sync/errgroup"
+)
+
+var (
+	fileFlag     string
+	concurrency  int
+	rateLimit    time.Duration
+	retries      int
+	retryDelay   time.Duration
+	failFast     bool
+	skipExisting bool
+	maxBodySize  string
+)
+
+func init() {
+	batchCmd := &cobra.Command{
+		Use:   "batch [url...]",
+		Short: "Extract multiple URLs to NDJSON",
+		Long: `Extract multiple URLs in parallel, streaming results as NDJSON (one JSON object per line).
+
+Input sources (mutually exclusive):
+  Positional args:  ezycopy batch https://a.com https://b.com
+  File:             ezycopy batch --file urls.txt
+  Stdin:            cat urls.txt | ezycopy batch -
+
+Output is always NDJSON to stdout. Progress goes to stderr.`,
+		RunE: runBatch,
+	}
+
+	f := batchCmd.Flags()
+	f.StringVarP(&fileFlag, "file", "f", "", "Read URLs from file (one per line)")
+	f.IntVarP(&concurrency, "concurrency", "j", 3, "Max parallel extractions")
+	f.DurationVar(&rateLimit, "rate-limit", 1*time.Second, "Min delay between requests to same domain")
+	f.IntVar(&retries, "retries", 2, "Retry count on transient failures")
+	f.DurationVar(&retryDelay, "retry-delay", 3*time.Second, "Base delay between retries (exponential backoff)")
+	f.BoolVar(&failFast, "fail-fast", false, "Stop on first error")
+	f.BoolVar(&skipExisting, "skip-existing", false, "Skip URLs whose output file already exists (requires -o)")
+	f.StringVar(&maxBodySize, "max-body-size", "10MB", "Max HTTP response body size")
+
+	rootCmd.AddCommand(batchCmd)
+}
+
+// runBatch implements "ezycopy batch": it validates the input, extracts every
+// URL on a bounded worker pool, streams one NDJSON object per URL to stdout and
+// returns a batchError carrying ExitPartial/ExitTotalFailure if any/all fail.
+func runBatch(cmd *cobra.Command, args []string) error {
+	ctx := cmd.Context()
+	urls, err := collectURLs(args)
+	if err != nil {
+		return err
+	}
+	if len(urls) == 0 {
+		return &UsageError{Msg: "no URLs provided", Hint: "ezycopy batch --help"}
+	}
+
+	// Reject flag values that would hang or crash the run: SetLimit(0) blocks
+	// every g.Go call forever, and negative retries mean zero fetch attempts.
+	switch {
+	case concurrency < 1:
+		return &UsageError{Msg: "--concurrency must be at least 1", Hint: "ezycopy batch --help"}
+	case retries < 0:
+		return &UsageError{Msg: "--retries must not be negative", Hint: "ezycopy batch --help"}
+	case skipExisting && outputFlag == "":
+		return &UsageError{Msg: "--skip-existing requires -o", Hint: "ezycopy batch --help"}
+	}
+	bodyLimit, err := parseByteSize(maxBodySize)
+	if err != nil {
+		return &UsageError{Msg: fmt.Sprintf("invalid --max-body-size: %v", err)}
+	}
+
+	// Setup concurrency primitives
+	limiter := batch.NewDomainLimiter(rateLimit)
+	retryCfg := &batch.RetryConfig{
+		MaxAttempts: retries + 1, // retries=2 means 3 total attempts
+		InitDelay:   retryDelay,
+	}
+
+	// Setup browser pool if needed
+	var pool *extractor.BrowserPool
+	if browserFlag {
+		pool, err = extractor.NewBrowserPool(browserWS)
+		if err != nil {
+			return fmt.Errorf("failed to start browser: %w", err)
+		}
+		defer pool.Close()
+	}
+
+	// Validate output directory if set
+	if outputFlag != "" {
+		if err := os.MkdirAll(outputFlag, 0755); err != nil {
+			return fmt.Errorf("failed to create output directory: %w", err)
+		}
+	}
+
+	// Results channel — buffered to avoid blocking workers
+	results := make(chan BatchResult, len(urls))
+	// Writer goroutine — single writer to stdout, no interleaving
+	var writerWg sync.WaitGroup
+	writerWg.Add(1)
+	var okCount, errCount int
+	var errCodes []string
+	go func() {
+		defer writerWg.Done()
+		enc := json.NewEncoder(os.Stdout)
+		enc.SetEscapeHTML(false)
+		for r := range results {
+			enc.Encode(r)
+			if r.Status == "ok" {
+				okCount++
+			} else {
+				errCount++
+				errCodes = append(errCodes, r.Error)
+			}
+		}
+	}()
+
+	g, gctx := errgroup.WithContext(ctx) // worker pool, cancelled on the first returned error
+	g.SetLimit(concurrency)
+	for i, u := range urls {
+		seq, rawURL := i, u // per-iteration copies for the closure
+		g.Go(func() error {
+			result := fetchAndExtract(gctx, seq, rawURL, limiter, retryCfg, pool, bodyLimit)
+			results <- result
+			// Write to file if output dir is set and extraction succeeded
+			if outputFlag != "" && result.Status == "ok" && result.Title != nil {
+				writeResultToFile(result)
+			}
+			if !quiet {
+				status := "ok"
+				if result.Status == "error" {
+					status = result.Error
+				}
+				fmt.Fprintf(os.Stderr, "[%d/%d] %s (%s)\n", seq+1, len(urls), rawURL, status)
+			}
+			if failFast && result.Status == "error" {
+				return fmt.Errorf("failed: %s", rawURL)
+			}
+			return nil
+		})
+	}
+
+	// Wait for all workers, then close results channel
+	_ = g.Wait()
+	close(results)
+	writerWg.Wait()
+
+	// Summary to stderr
+	if !quiet {
+		total := okCount + errCount
+		if errCount > 0 {
+			fmt.Fprintf(os.Stderr, "%d URLs: %d ok, %d failed (%s)\n", total, okCount, errCount, strings.Join(uniqueStrings(errCodes), ", "))
+		} else {
+			fmt.Fprintf(os.Stderr, "%d URLs: %d ok\n", total, okCount)
+		}
+	}
+
+	if errCount == len(urls) { // exit code: every URL failed
+		return &batchError{code: ExitTotalFailure}
+	}
+	if errCount > 0 {
+		return &batchError{code: ExitPartial}
+	}
+	return nil
+}
+
+// fetchAndExtract runs the whole pipeline for one URL: rate-limited fetch with
+// retries, article extraction, markdown conversion. It always returns a
+// BatchResult; Status is "ok", or "error" with Error/Message naming the failed
+// stage. A non-nil pool fetches via the browser, otherwise via plain HTTP.
+func fetchAndExtract(ctx context.Context, seq int, rawURL string, limiter *batch.DomainLimiter, retryCfg *batch.RetryConfig, pool *extractor.BrowserPool, maxBody int64) BatchResult {
+	start := time.Now()
+	result := BatchResult{
+		Seq:         seq,
+		OriginalURL: rawURL,
+		Type:        resolveType(),
+		ExtractedAt: time.Now().UTC().Format(time.RFC3339),
+	}
+
+	// fail finalizes result as an error; page (when already fetched) supplies
+	// the final URL and content type so post-fetch failures still report them.
+	fail := func(code, msg string, page *extractor.PageResult) BatchResult {
+		result.Status = "error"
+		result.Error = code
+		result.Message = msg
+		if page != nil {
+			result.FinalURL = page.URL
+			result.ContentType = page.ContentType
+		}
+		result.DurationMs = time.Since(start).Milliseconds()
+		return result
+	}
+
+	var pageResult *extractor.PageResult
+	err := retryCfg.Do(ctx, func() error {
+		if err := limiter.Wait(ctx, rawURL); err != nil {
+			return err
+		}
+		var fetchErr error
+		if pool != nil {
+			pageResult, fetchErr = pool.FetchPage(ctx, rawURL, timeout)
+		} else {
+			pageResult, fetchErr = extractor.FetchPageHTTP(ctx, rawURL, timeout, maxBody)
+		}
+		return fetchErr
+	}, isRetryable)
+	if err != nil {
+		var ee *extractor.ExtractionError
+		if errors.As(err, &ee) {
+			return fail(ee.Code, ee.Message, nil)
+		}
+		return fail("fetch_failed", err.Error(), nil)
+	}
+	if pageResult == nil {
+		// Success without a page (e.g. a retry config that made no attempt) would panic below.
+		return fail("fetch_failed", "fetch returned no page", nil)
+	}
+
+	// Extract article
+	article, err := extractor.ExtractArticle(pageResult.HTML, pageResult.URL)
+	if err != nil {
+		return fail("extract_failed", err.Error(), pageResult)
+	}
+
+	// Convert to markdown
+	markdown, err := extractor.FormatArticle(article, !noImages)
+	if err != nil {
+		return fail("format_failed", err.Error(), pageResult)
+	}
+
+	title := article.Title
+	result.Status = "ok"
+	result.FinalURL = pageResult.URL
+	result.Title = &title
+	result.ContentType = pageResult.ContentType
+	result.Markdown = markdown
+	result.DurationMs = time.Since(start).Milliseconds()
+	return result
+}
+
+// isRetryable returns true for transient errors worth retrying: 5xx, 408 and
+// 429 responses plus generic fetch errors (timeouts, connection resets).
+// Other 4xx codes (404, 403, ...) are permanent; retrying them only adds delay.
+func isRetryable(err error) bool {
+	var ee *extractor.ExtractionError
+	if errors.As(err, &ee) {
+		switch {
+		case ee.Code == "unsupported_content_type", ee.Code == "body_too_large":
+			return false
+		case ee.Code == "http_408", ee.Code == "http_429":
+			return true
+		case strings.HasPrefix(ee.Code, "http_4"): // assumes codes look like "http_404" — confirm in extractor
+			return false
+		}
+	}
+	return true
+}
+
+// collectURLs resolves the batch input from exactly one source, tried in
+// this order: positional args (a lone "-" means stdin), --file, then stdin
+// when it is piped. Supplying both positional args and --file is a usage
+// error. It returns (nil, nil) when no source is available, leaving the
+// "no URLs provided" decision to the caller.
+func collectURLs(args []string) ([]string, error) {
+	switch hasArgs, hasFile := len(args) > 0, fileFlag != ""; {
+	case hasArgs && hasFile:
+		// Args and --file are explicitly provided and mutually exclusive
+		return nil, &UsageError{
+			Msg:  "cannot use both positional args and --file",
+			Hint: "ezycopy batch --help",
+		}
+
+	case hasArgs:
+		// Special case: "ezycopy batch -" means stdin
+		if len(args) == 1 && args[0] == "-" {
+			return readURLsFromReader(os.Stdin)
+		}
+		return args, nil
+
+	case hasFile:
+		f, err := os.Open(fileFlag)
+		if err != nil {
+			return nil, fmt.Errorf("failed to open URL file: %w", err)
+		}
+		defer f.Close()
+		return readURLsFromReader(f)
+
+	case isPipeInput():
+		// No args and no --file: try stdin if it's a pipe
+		return readURLsFromReader(os.Stdin)
+	}
+
+	return nil, nil
+}
+
+// readURLsFromReader collects one URL per line, ignoring blanks and "#" comments.
+func readURLsFromReader(r *os.File) ([]string, error) {
+	var urls []string
+	lines := bufio.NewScanner(r)
+	for lines.Scan() {
+		entry := strings.TrimSpace(lines.Text())
+		if entry != "" && !strings.HasPrefix(entry, "#") {
+			urls = append(urls, entry)
+		}
+	}
+	return urls, lines.Err()
+}
+
+// isPipeInput reports whether stdin is not a character device (pipe or redirect).
+func isPipeInput() bool {
+	if info, err := os.Stdin.Stat(); err == nil {
+		return info.Mode()&os.ModeCharDevice == 0
+	}
+	return false
+}
+
+// writeResultToFile saves a result's markdown under the -o path, named after
+// its title (untitled results are skipped); failures are warned about on stderr.
+func writeResultToFile(result BatchResult) {
+	if result.Title == nil || *result.Title == "" {
+		return
+	}
+	// Use output package's resolve path for consistent naming
+	filePath, err := output.ResolveOutputPath(outputFlag, *result.Title)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "Warning: cannot resolve output path for %s: %v\n", result.OriginalURL, err)
+		return
+	}
+	if skipExisting {
+		if _, err := os.Stat(filePath); err == nil {
+			return // File exists, skip
+		}
+	}
+	filePath = resolveCollision(filePath) // appends -2, -3, ... on collision
+	if err := output.WriteToFile(filePath, result.Markdown); err != nil {
+		fmt.Fprintf(os.Stderr, "Warning: failed to write %s: %v\n", filePath, err)
+	}
+}
+
+// resolveCollision returns path when free, else the first unused "<stem>-N<ext>" (N = 2..999).
+func resolveCollision(path string) string {
+	taken := func(p string) bool { _, err := os.Stat(p); return !os.IsNotExist(err) }
+	if !taken(path) {
+		return path
+	}
+	ext := filepath.Ext(path)
+	stem := strings.TrimSuffix(path, ext)
+	for n := 2; n < 1000; n++ {
+		if alt := fmt.Sprintf("%s-%d%s", stem, n, ext); !taken(alt) {
+			return alt
+		}
+	}
+	// Every numbered slot is occupied: fall back to the original path.
+	return path
+}
+
+// parseByteSize converts a size such as "10MB", "512KB", "100B" or "1024"
+// (plain bytes) into a byte count; units are case-insensitive powers of 1024.
+// The number must be all digits: "1.5MB", "10XMB" and "-1" are errors rather
+// than being silently truncated by Sscanf("%d"), and int64 overflow is rejected.
+func parseByteSize(s string) (int64, error) {
+	const maxInt64 = 1<<63 - 1
+	digits, mult := strings.TrimSpace(strings.ToUpper(s)), int64(1)
+	// Check longest suffixes first to avoid "MB" matching "B"
+	units := []struct {
+		suffix string
+		mult   int64
+	}{{"GB", 1 << 30}, {"MB", 1 << 20}, {"KB", 1 << 10}, {"B", 1}}
+	for _, unit := range units {
+		if strings.HasSuffix(digits, unit.suffix) {
+			digits, mult = strings.TrimSpace(strings.TrimSuffix(digits, unit.suffix)), unit.mult
+			break
+		}
+	}
+	if digits == "" {
+		return 0, fmt.Errorf("cannot parse %q", s)
+	}
+	var n int64
+	for _, c := range digits {
+		d := int64(c - '0')
+		if c < '0' || c > '9' || n > (maxInt64-d)/10 {
+			return 0, fmt.Errorf("cannot parse %q", s)
+		}
+		n = n*10 + d
+	}
+	if n > maxInt64/mult {
+		return 0, fmt.Errorf("cannot parse %q", s) // n*mult would overflow
+	}
+	return n * mult, nil
+}
+
+// uniqueStrings returns ss without duplicates, keeping first-occurrence order.
+func uniqueStrings(ss []string) []string {
+	uniq, known := []string(nil), make(map[string]struct{}, len(ss))
+	for _, s := range ss {
+		if _, dup := known[s]; !dup {
+			known[s] = struct{}{}
+			uniq = append(uniq, s)
+		}
+	}
+	return uniq
+}
+
+// unsafeCharsRe matches each run of characters other than ASCII letters and digits.
+var unsafeCharsRe = regexp.MustCompile(`[^a-zA-Z0-9]+`)
+
+// sanitizeFilenameForBatch builds a filesystem-safe ".md" name from the first 50 bytes of title.
+func sanitizeFilenameForBatch(title string) string {
+	if len(title) > 50 {
+		title = title[:50]
+	}
+	name := strings.Trim(unsafeCharsRe.ReplaceAllString(title, "-"), "-")
+	if name == "" {
+		name = "untitled"
+	}
+	return name + ".md"
+}
+
+// batchError carries an exit code without a user-visible message.
+type batchError struct {
+	code int
+}
+
+func (e *batchError) Error() string {
+	return fmt.Sprintf("batch completed with exit code %d", e.code)
+}
diff --git a/cmd/batch_test.go b/cmd/batch_test.go
new file mode 100644
index 0000000..25b8414
--- /dev/null
+++ b/cmd/batch_test.go
@@ -0,0 +1,54 @@
+package cmd
+
+import (
+	"testing"
+)
+
+func TestParseByteSize(t *testing.T) {
+	tests := []struct {
+		input string
+		want  int64
+		err   bool
+	}{
+		{"10MB", 10 * 1024 * 1024, false},
+		{"1GB", 1024 * 1024 * 1024, false},
+		{"512KB", 512 * 1024, false},
+		{"100B", 100, false},
+		{"0", 0, false},
+		{"1024", 1024, false},
+		{"invalid", 0, true},
+	}
+	for _, tt := range tests {
+		got, err := parseByteSize(tt.input)
+		if tt.err {
+			if err == nil {
+				t.Errorf("parseByteSize(%q) expected error", tt.input)
+			}
+			continue
+		}
+		if err != nil {
+			t.Errorf("parseByteSize(%q) unexpected error: %v", tt.input, err)
+			continue
+		}
+		if got != tt.want {
+			t.Errorf("parseByteSize(%q) = %d, want %d", tt.input, got, tt.want)
+		}
+	}
+}
+
+func TestUniqueStrings(t *testing.T) {
+	input := []string{"a", "b", "a", "c", "b"}
+	got := uniqueStrings(input)
+	if len(got) != 3 || got[0] != "a" || got[1] != "b" || got[2] != "c" {
+		t.Errorf("uniqueStrings(%v) = %v, want [a b c]", input, got)
+	}
+}
+
+func TestResolveCollision(t *testing.T) {
+	// Non-existent file in a fresh per-test temp dir — should return as-is
+	path := t.TempDir() + "/ezycopy-test-nonexistent-file-abc123.md"
+	got := resolveCollision(path)
+	if got != path {
+		t.Errorf("resolveCollision(%q) = %q, want same path", path, got)
+	}
+}
diff --git a/cmd/root.go b/cmd/root.go
new file mode 100644
index 0000000..a663c88
--- /dev/null
+++ b/cmd/root.go
@@ -0,0 +1,236 @@
+package cmd
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/url"
+	"os"
+	"os/signal"
+	"time"
+
+	"github.com/gupsammy/EzyCopy/extractor"
+	"github.com/gupsammy/EzyCopy/output"
+	"github.com/spf13/cobra"
+)
+
+var (
+	Version = "2.0.0"
+
+	// Persistent flags (shared across root + subcommands)
+	outputFlag  string
+	noImages    bool
+	timeout     time.Duration
+	browserFlag bool
+	browserWS   string
+	typeFlag    string
+	jsonFlag    bool
+	quiet       bool
+	verbose     bool
+	noColor     bool
+
+	// Root-only flags
+	clipboardFlag bool
+)
+
+var rootCmd = &cobra.Command{
+	Use:   "ezycopy <url>",
+	Short: "Extract web content as markdown",
+	Long: `EzyCopy extracts article content from web pages and converts it to markdown.
+
+By default, uses fast HTTP fetch. Use --browser for JS-heavy sites (Twitter, SPAs)
+or authenticated content (uses your Chrome profile).
+
+Content is printed to stdout. Use -c to copy to clipboard, -o to save to a file.
+Use --json for structured JSON output.`,
+	Args:    cobra.ExactArgs(1),
+	Version: Version,
+	RunE:    runRoot,
+
+	SilenceUsage:  true,
+	SilenceErrors: true,
+}
+
+func init() {
+	// Persistent flags — available to root and all subcommands
+	pf := rootCmd.PersistentFlags()
+	pf.StringVarP(&outputFlag, "output", "o", "", "Save to file (directory auto-generates name)")
+	pf.BoolVar(&noImages, "no-images", false, "Strip image links from output")
+	pf.DurationVarP(&timeout, "timeout", "t", 30*time.Second, "Page load timeout")
+	pf.BoolVar(&browserFlag, "browser", false, "Use Chrome browser (for JS-heavy or authenticated sites)")
+	pf.StringVar(&browserWS, "browser-ws", "", "Connect to existing Chrome via DevTools WebSocket URL")
+	pf.StringVar(&typeFlag, "type", "", "Content type hint: article, github")
+	pf.BoolVar(&jsonFlag, "json", false, "Emit JSON output instead of raw markdown")
+	pf.BoolVarP(&quiet, "quiet", "q", false, "Suppress progress messages")
+	pf.BoolVarP(&verbose, "verbose", "v", false, "Debug output to stderr")
+	pf.BoolVar(&noColor, "no-color", false, "Disable ANSI colors")
+
+	// Root-only flags
+	rootCmd.Flags().BoolVarP(&clipboardFlag, "clipboard", "c", false, "Copy output to clipboard")
+
+	// browser-ws implies browser
+	rootCmd.PersistentPreRunE = func(cmd *cobra.Command, args []string) error {
+		if browserWS != "" {
+			browserFlag = true
+		}
+		// Respect NO_COLOR env
+		if os.Getenv("NO_COLOR") != "" {
+			noColor = true
+		}
+		return nil
+	}
+}
+
+// Execute runs the root command under a Ctrl-C-aware context and exits non-zero on error.
+func Execute() {
+	ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
+	defer cancel()
+
+	// First Ctrl-C cancels ctx; a second one forces exit with status 130
+	go func() {
+		<-ctx.Done()
+		// ctx is done (first Ctrl-C, or cancel() on return); now wait for another interrupt
+		sig := make(chan os.Signal, 1)
+		signal.Notify(sig, os.Interrupt)
+		<-sig
+		os.Exit(130)
+	}()
+
+	rootCmd.SetContext(ctx)
+
+	if err := rootCmd.Execute(); err != nil {
+		// A batchError carries its own exit code and is exited on without printing a message
+		if be, ok := err.(*batchError); ok {
+			os.Exit(be.code)
+		}
+		exitCode := ExitPartial
+		if isUsageError(err) {
+			exitCode = ExitInvalidUsage
+		}
+		writeFatalError(err, exitCode)
+		os.Exit(exitCode)
+	}
+}
+
+func runRoot(cmd *cobra.Command, args []string) error {
+	start := time.Now()
+	inputURL := args[0]
+
+	if _, err := url.ParseRequestURI(inputURL); err != nil {
+		return &UsageError{Msg: fmt.Sprintf("invalid URL: %s", inputURL)}
+	}
+
+	// Fetch page
+	var pageResult *extractor.PageResult
+	var err error
+	if browserFlag {
+		if !quiet {
+			fmt.Fprintln(os.Stderr, "Fetching page (browser)...")
+		}
+		pageResult, err = extractor.FetchPage(inputURL, timeout)
+	} else {
+		if !quiet {
+			fmt.Fprintln(os.Stderr, "Fetching page...")
+		}
+		pageResult, err = extractor.FetchPageHTTP(cmd.Context(), inputURL, timeout, 0)
+	}
+	if err != nil {
+		return fmt.Errorf("failed to fetch page: %w", err)
+	}
+
+	// Extract article
+	if !quiet {
+		fmt.Fprintln(os.Stderr, "Extracting content...")
+	}
+	article, err := extractor.ExtractArticle(pageResult.HTML, pageResult.URL)
+	if err != nil {
+		return fmt.Errorf("failed to extract content: %w", err)
+	}
+
+	// Convert to markdown
+	includeImages := !noImages
+	markdown, err := extractor.FormatArticle(article, includeImages)
+	if err != nil {
+		return fmt.Errorf("failed to convert to markdown: %w", err)
+	}
+
+	durationMs := time.Since(start).Milliseconds()
+
+	// JSON output
+	if jsonFlag {
+		result := SingleResult{
+			OriginalURL: inputURL,
+			FinalURL:    pageResult.URL,
+			Title:       article.Title,
+			Author:      article.Byline,
+			Type:        resolveType(),
+			ContentType: pageResult.ContentType,
+			Markdown:    markdown,
+			DurationMs:  durationMs,
+			ExtractedAt: time.Now().UTC().Format(time.RFC3339),
+		}
+		enc := json.NewEncoder(os.Stdout)
+		enc.SetEscapeHTML(false)
+		return enc.Encode(result)
+	}
+
+	// Default: raw markdown to stdout
+	fmt.Println(markdown)
+
+	// Copy to clipboard if requested
+	if clipboardFlag {
+		if err := output.CopyToClipboard(markdown); err != nil {
+			fmt.Fprintf(os.Stderr, "Warning: failed to copy to clipboard: %v\n", err)
+		} else if !quiet {
+			fmt.Fprintln(os.Stderr, "Copied to clipboard!")
+		}
+	}
+
+	// Save to file if requested
+	if outputFlag != "" {
+		filePath, err := output.ResolveOutputPath(outputFlag, article.Title)
+		if err != nil {
+			return fmt.Errorf("failed to resolve output path: %w", err)
+		}
+		if err := output.WriteToFile(filePath, markdown); err != nil {
+			return fmt.Errorf("failed to write file: %w", err)
+		}
+		if !quiet {
+			fmt.Fprintf(os.Stderr, "Saved to: %s\n", filePath)
+		}
+	}
+
+	return nil
+}
+
+func resolveType() string {
+	if typeFlag == "" {
+		return "article" // default when --type is not given
+	}
+	return typeFlag
+}
+
+// writeFatalError reports err on stderr: JSON when stderr is not a TTY or noColor is
+// set, else a plain "Error: ..." line. A UsageError's own Hint wins over the default.
+func writeFatalError(err error, exitCode int) {
+	fe := FatalError{Error: "runtime_error", Message: err.Error()}
+	if exitCode == ExitInvalidUsage {
+		fe.Error, fe.Hint = "invalid_usage", "ezycopy --help"
+		if ue, ok := err.(*UsageError); ok && ue.Hint != "" {
+			fe.Hint = ue.Hint // e.g. "ezycopy batch --help" for batch errors
+		}
+	}
+	if noColor || !isTTY(os.Stderr) {
+		json.NewEncoder(os.Stderr).Encode(fe)
+	} else {
+		fmt.Fprintf(os.Stderr, "Error: %s\n", err.Error())
+	}
+}
+
+// isTTY reports whether f is a character device; a failed Stat counts as not a TTY.
+func isTTY(f *os.File) bool {
+	if info, err := f.Stat(); err == nil {
+		return info.Mode()&os.ModeCharDevice != 0
+	}
+	return false
+}
diff --git a/cmd/types.go b/cmd/types.go
new file mode 100644
index 0000000..466516d
--- /dev/null
+++ b/cmd/types.go
@@ -0,0 +1,61 @@
+package cmd
+
+// Exit codes
+const (
+	ExitOK           = 0
+	ExitPartial      = 1 // One or more URLs failed (batch: partial; root: extraction failed)
+	ExitInvalidUsage = 2 // Bad flag, missing arg, conflicting input sources
+	ExitTotalFailure = 3 // All URLs failed (batch only)
+)
+
+// SingleResult is the JSON output for the root command with --json.
+type SingleResult struct {
+	OriginalURL string `json:"original_url"`
+	FinalURL    string `json:"final_url"`
+	Title       string `json:"title"`
+	Author      string `json:"author,omitempty"`
+	Type        string `json:"type"`
+	ContentType string `json:"content_type"`
+	Markdown    string `json:"markdown"`
+	DurationMs  int64  `json:"duration_ms"`
+	ExtractedAt string `json:"extracted_at"`
+}
+
+// BatchResult is one line of NDJSON output from the batch subcommand.
+type BatchResult struct {
+	Seq         int     `json:"seq"`
+	OriginalURL string  `json:"original_url"`
+	FinalURL    string  `json:"final_url,omitempty"`
+	Title       *string `json:"title"` // null on error
+	Type        string  `json:"type"`
+	ContentType string  `json:"content_type,omitempty"`
+	Status      string  `json:"status"` // "ok" or "error"
+	Error       string  `json:"error,omitempty"`
+	Message     string  `json:"message,omitempty"`
+	Markdown    string  `json:"markdown,omitempty"`
+	DurationMs  int64   `json:"duration_ms"`
+	ExtractedAt string  `json:"extracted_at"`
+}
+
+// FatalError is the structured error written to stderr for usage/config errors.
+type FatalError struct {
+	Error   string `json:"error"`
+	Message string `json:"message"`
+	Hint    string `json:"hint,omitempty"`
+}
+
+// UsageError indicates invalid CLI usage (exit code 2).
+type UsageError struct {
+	Msg  string
+	Hint string
+}
+
+func (e *UsageError) Error() string {
+	return e.Msg
+}
+
+func isUsageError(err error) bool {
+	_, ok := err.(*UsageError)
+	return ok
+}
+
diff --git a/extractor/browser.go b/extractor/browser.go
index ef51f29..b7949bb 100644
--- a/extractor/browser.go
+++ b/extractor/browser.go
@@ -13,8 +13,9 @@ import (
 
 // PageResult contains the fetched page data
 type PageResult struct {
-	HTML string
-	URL  string // Final URL after redirects
+	HTML        string
+	URL         string // Final URL after redirects
+	ContentType string // Content-Type response header (FetchPageHTTP); BrowserPool.FetchPage always reports "text/html"
 }
 
 // getDefaultChromeProfile returns the path to Chrome's default user data directory
diff --git a/extractor/http.go b/extractor/http.go
index bf3be0d..060ffe3 100644
--- a/extractor/http.go
+++ b/extractor/http.go
@@ -1,24 +1,34 @@
 package extractor
 
 import (
+	"context"
 	"fmt"
 	"io"
 	"net/http"
+	"strings"
 	"time"
 )
 
-// FetchPageHTTP fetches a page using simple HTTP (no JavaScript execution)
-func FetchPageHTTP(url string, timeout time.Duration) (*PageResult, error) {
+// supportedContentTypes lists the MIME types isHTMLContentType accepts; FetchPageHTTP rejects any other non-empty Content-Type.
+var supportedContentTypes = []string{
+	"text/html",
+	"application/xhtml+xml",
+	"text/plain",
+	"text/xml",
+}
+
+// FetchPageHTTP fetches a page using simple HTTP (no JavaScript execution).
+// Pass maxBodySize=0 to disable the body size limit.
+func FetchPageHTTP(ctx context.Context, url string, timeout time.Duration, maxBodySize int64) (*PageResult, error) {
 	client := &http.Client{
 		Timeout: timeout,
 	}
 
-	req, err := http.NewRequest("GET", url, nil)
+	req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
 	if err != nil {
 		return nil, fmt.Errorf("failed to create request: %w", err)
 	}
 
-	// Set User-Agent to avoid bot blocks
 	req.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
 	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
 
@@ -29,16 +39,63 @@ func FetchPageHTTP(url string, timeout time.Duration) (*PageResult, error) {
 	defer resp.Body.Close()
 
 	if resp.StatusCode != http.StatusOK {
-		return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status)
+		return nil, &ExtractionError{
+			Code:    fmt.Sprintf("http_%d", resp.StatusCode),
+			Message: fmt.Sprintf("HTTP %d: %s", resp.StatusCode, resp.Status),
+		}
+	}
+
+	// Content-Type pre-check
+	ct := resp.Header.Get("Content-Type")
+	if ct != "" && !isHTMLContentType(ct) {
+		return nil, &ExtractionError{
+			Code:    "unsupported_content_type",
+			Message: fmt.Sprintf("Content-Type %s is not supported", ct),
+		}
 	}
 
-	body, err := io.ReadAll(resp.Body)
+	// Read body with optional size limit
+	var reader io.Reader = resp.Body
+	if maxBodySize > 0 {
+		reader = io.LimitReader(resp.Body, maxBodySize+1)
+	}
+
+	body, err := io.ReadAll(reader)
 	if err != nil {
 		return nil, fmt.Errorf("failed to read response: %w", err)
 	}
 
+	if maxBodySize > 0 && int64(len(body)) > maxBodySize {
+		return nil, &ExtractionError{
+			Code:    "body_too_large",
+			Message: fmt.Sprintf("response body exceeds %d bytes", maxBodySize),
+		}
+	}
+
 	return &PageResult{
-		HTML: string(body),
-		URL:  resp.Request.URL.String(), // Final URL after redirects
+		HTML:        string(body),
+		URL:         resp.Request.URL.String(),
+		ContentType: ct,
 	}, nil
 }
+
+// ExtractionError wraps per-URL errors with a machine-readable code.
+type ExtractionError struct {
+	Code    string // machine-readable, e.g. "http_500", "unsupported_content_type", "body_too_large"
+	Message string // human-readable detail
+}
+
+func (e *ExtractionError) Error() string {
+	return e.Code + ": " + e.Message // renders as "<code>: <message>"
+}
+
+// isHTMLContentType reports whether ct's media type (case-insensitive, parameters ignored) is in supportedContentTypes.
+func isHTMLContentType(ct string) bool {
+	mediaType, _, _ := strings.Cut(strings.ToLower(ct), ";") // drop "; charset=..." and any other parameters
+	for _, supported := range supportedContentTypes {
+		if strings.TrimSpace(mediaType) == supported { // exact match, so "text/htmlx" no longer passes as "text/html"
+			return true
+		}
+	}
+	return false
+}
diff --git a/extractor/http_test.go b/extractor/http_test.go
new file mode 100644
index 0000000..2aef1f6
--- /dev/null
+++ b/extractor/http_test.go
@@ -0,0 +1,150 @@
+package extractor
+
+import (
+	"context"
+	"errors"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+	"time"
+)
+
+func TestFetchPageHTTP_Success(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		w.Header().Set("Content-Type", "text/html")
+		w.Write([]byte("<html><body>Hello</body></html>"))
+	}))
+	defer server.Close()
+
+	page, fetchErr := FetchPageHTTP(context.Background(), server.URL, 5*time.Second, 0)
+	if fetchErr != nil {
+		t.Fatalf("unexpected error: %v", fetchErr)
+	}
+	if html := page.HTML; !strings.Contains(html, "Hello") {
+		t.Errorf("expected HTML to contain 'Hello', got %q", html)
+	}
+	if got := page.ContentType; got != "text/html" {
+		t.Errorf("expected content-type 'text/html', got %q", got)
+	}
+}
+
+func TestFetchPageHTTP_RejectsNonHTML(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		w.Header().Set("Content-Type", "application/pdf")
+		w.Write([]byte("%PDF-1.4"))
+	}))
+	defer server.Close()
+
+	_, fetchErr := FetchPageHTTP(context.Background(), server.URL, 5*time.Second, 0)
+	if fetchErr == nil {
+		t.Fatal("expected error for non-HTML content type")
+	}
+
+	var extractErr *ExtractionError
+	if matched := errors.As(fetchErr, &extractErr); !matched {
+		t.Fatalf("expected ExtractionError, got %T: %v", fetchErr, fetchErr)
+	}
+	if code := extractErr.Code; code != "unsupported_content_type" {
+		t.Errorf("expected code 'unsupported_content_type', got %q", code)
+	}
+}
+
+func TestFetchPageHTTP_BodySizeLimit(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		w.Header().Set("Content-Type", "text/html")
+		w.Write([]byte(strings.Repeat("x", 1000))) // 1000-byte body against the 500-byte limit below
+	}))
+	defer server.Close()
+
+	_, fetchErr := FetchPageHTTP(context.Background(), server.URL, 5*time.Second, 500)
+	if fetchErr == nil {
+		t.Fatal("expected error for body exceeding limit")
+	}
+
+	var extractErr *ExtractionError
+	if matched := errors.As(fetchErr, &extractErr); !matched {
+		t.Fatalf("expected ExtractionError, got %T: %v", fetchErr, fetchErr)
+	}
+	if code := extractErr.Code; code != "body_too_large" {
+		t.Errorf("expected code 'body_too_large', got %q", code)
+	}
+}
+
+func TestFetchPageHTTP_HTTPError(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusInternalServerError)
+	}))
+	defer server.Close()
+
+	_, fetchErr := FetchPageHTTP(context.Background(), server.URL, 5*time.Second, 0)
+	if fetchErr == nil {
+		t.Fatal("expected error for 500 response")
+	}
+
+	var extractErr *ExtractionError
+	if matched := errors.As(fetchErr, &extractErr); !matched {
+		t.Fatalf("expected ExtractionError, got %T: %v", fetchErr, fetchErr)
+	}
+	if code := extractErr.Code; code != "http_500" {
+		t.Errorf("expected code 'http_500', got %q", code)
+	}
+}
+
+// TestFetchPageHTTP_ContextCancellation checks that an expired ctx aborts a fetch whose server never answers.
+func TestFetchPageHTTP_ContextCancellation(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		<-r.Context().Done() // hold the response until the client gives up; a fixed sleep would stall srv.Close() for its full length
+	}))
+	defer srv.Close()
+
+	ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
+	defer cancel()
+	_, err := FetchPageHTTP(ctx, srv.URL, 10*time.Second, 0)
+	if err == nil {
+		t.Fatal("expected error from cancelled context")
+	}
+}
+
+func TestFetchPageHTTP_FollowsRedirects(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path != "/" {
+			w.Header().Set("Content-Type", "text/html")
+			w.Write([]byte("<html>Final</html>"))
+			return
+		}
+		http.Redirect(w, r, "/final", http.StatusMovedPermanently)
+	}))
+	defer server.Close()
+
+	page, fetchErr := FetchPageHTTP(context.Background(), server.URL, 5*time.Second, 0)
+	if fetchErr != nil {
+		t.Fatalf("unexpected error: %v", fetchErr)
+	}
+	if finalURL := page.URL; !strings.HasSuffix(finalURL, "/final") {
+		t.Errorf("expected final URL to end with '/final', got %q", finalURL)
+	}
+}
+
+func TestIsHTMLContentType(t *testing.T) {
+	type testCase struct {
+		ct   string
+		want bool
+	}
+	// A charset parameter must not change the verdict; an empty value is rejected.
+	for _, tc := range []testCase{
+		{"text/html", true},
+		{"text/html; charset=utf-8", true},
+		{"application/xhtml+xml", true},
+		{"text/plain", true},
+		{"text/xml", true},
+		{"application/pdf", false},
+		{"image/png", false},
+		{"application/json", false},
+		{"", false},
+	} {
+		if got := isHTMLContentType(tc.ct); got != tc.want {
+			t.Errorf("isHTMLContentType(%q) = %v, want %v", tc.ct, got, tc.want)
+		}
+	}
+}
diff --git a/extractor/pool.go b/extractor/pool.go
new file mode 100644
index 0000000..b696f88
--- /dev/null
+++ b/extractor/pool.go
@@ -0,0 +1,102 @@
+package extractor
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/go-rod/rod"
+	"github.com/go-rod/rod/lib/launcher"
+	"github.com/go-rod/rod/lib/proto"
+)
+
+// BrowserPool holds one Chrome connection shared by every FetchPage call on it.
+// Unlike the package-level FetchPage (root command), a launched pool sets no
+// user profile on the launcher; an attached pool uses whatever Chrome it is given.
+type BrowserPool struct {
+	browser  *rod.Browser       // connection through which FetchPage opens its tabs
+	launcher *launcher.Launcher // nil when using external Chrome via --browser-ws
+}
+
+// NewBrowserPool creates a browser pool. If wsURL is non-empty, it connects
+// to an existing Chrome instance. Otherwise, it launches an ephemeral headless Chrome.
+func NewBrowserPool(wsURL string) (*BrowserPool, error) {
+	if wsURL != "" {
+		browser := rod.New().ControlURL(wsURL)
+		if err := browser.Connect(); err != nil {
+			return nil, fmt.Errorf("failed to connect to Chrome at %s: %w", wsURL, err)
+		}
+		// No launcher is recorded: this pool did not start the process.
+		return &BrowserPool{browser: browser}, nil
+	}
+
+	// NOTE(review): no-sandbox turns off Chrome's sandbox; confirm it is required.
+	l := launcher.New().Headless(true).Set("disable-gpu").Set("no-sandbox")
+	controlURL, err := l.Launch()
+	if err != nil {
+		return nil, fmt.Errorf("failed to launch Chrome: %w", err)
+	}
+
+	browser := rod.New().ControlURL(controlURL)
+	if err := browser.Connect(); err != nil {
+		// Cleanup waits for the process to exit, so kill Chrome first;
+		// otherwise this error path blocks while the browser keeps running.
+		l.Kill()
+		l.Cleanup()
+		return nil, fmt.Errorf("failed to connect to Chrome: %w", err)
+	}
+	return &BrowserPool{browser: browser, launcher: l}, nil
+}
+
+// FetchPage opens a new tab, navigates to the URL, extracts HTML, and closes the tab.
+// Safe for concurrent use — each call gets its own tab.
+func (bp *BrowserPool) FetchPage(ctx context.Context, url string, timeout time.Duration) (*PageResult, error) {
+	tab, err := bp.browser.Page(proto.TargetCreateTarget{URL: "about:blank"})
+	if err != nil {
+		return nil, fmt.Errorf("failed to create tab: %w", err)
+	}
+	// Close via the original handle: the derived page below carries ctx and the
+	// timeout, and once either has expired a Close issued through it fails,
+	// leaving the tab open in the shared browser.
+	defer func() { _ = tab.Close() }()
+
+	page := tab.Context(ctx).Timeout(timeout)
+
+	if err := page.Navigate(url); err != nil {
+		return nil, fmt.Errorf("failed to navigate to %s: %w", url, err)
+	}
+
+	if err := page.WaitLoad(); err != nil {
+		return nil, fmt.Errorf("page load timeout: %w", err)
+	}
+
+	// Give the DOM a moment to settle (replaces the old fixed 2s sleep). The
+	// error is ignored on purpose: the page may be usable without being stable.
+	_ = page.WaitStable(300 * time.Millisecond)
+
+	info, err := page.Info()
+	if err != nil {
+		return nil, fmt.Errorf("failed to get page info: %w", err)
+	}
+
+	html, err := page.HTML()
+	if err != nil {
+		return nil, fmt.Errorf("failed to extract HTML: %w", err)
+	}
+
+	return &PageResult{
+		HTML:        html,
+		URL:         info.URL,
+		ContentType: "text/html",
+	}, nil
+}
+
+// Close shuts down the browser. Only call after all FetchPage calls have returned.
+func (bp *BrowserPool) Close() {
+	if bp.browser != nil {
+		_ = bp.browser.Close() // error ignored; NOTE(review): presumably this also shuts down an external Chrome attached via wsURL — confirm that is intended
+	}
+	if bp.launcher != nil {
+		bp.launcher.Cleanup() // only set when this pool launched Chrome itself; runs after the browser was asked to close
+	}
+}
diff --git a/go.mod b/go.mod
index 597fe8e..eae3d58 100644
--- a/go.mod
+++ b/go.mod
@@ -24,5 +24,6 @@ require (
 	github.com/ysmood/gson v0.7.3 // indirect
 	github.com/ysmood/leakless v0.9.0 // indirect
 	golang.org/x/net v0.47.0 // indirect
+	golang.org/x/sync v0.19.0
 	golang.org/x/text v0.31.0 // indirect
 )
diff --git a/go.sum b/go.sum
index f4689f8..6838fad 100644
--- a/go.sum
+++ b/go.sum
@@ -87,6 +87,8 @@ golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
 golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
 golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
 golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
+golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
diff --git a/main.go b/main.go
index ff79a5f..66bc373 100644
--- a/main.go
+++ b/main.go
@@ -1,113 +1,7 @@
 package main
 
-import (
-	"fmt"
-	"net/url"
-	"os"
-	"time"
-
-	"github.com/gupsammy/EzyCopy/extractor"
-	"github.com/gupsammy/EzyCopy/output"
-	"github.com/spf13/cobra"
-)
-
-var (
-	version = "0.4.0"
-
-	outputFlag    string
-	noImages      bool
-	timeout       time.Duration
-	browserFlag   bool
-	clipboardFlag bool
-)
+import "github.com/gupsammy/EzyCopy/cmd"
 
 func main() {
-	rootCmd := &cobra.Command{
-		Use:   "ezycopy <url>",
-		Short: "Extract web content as markdown",
-		Long: `EzyCopy extracts article content from web pages and converts it to markdown.
-
-By default, uses fast HTTP fetch. Use --browser for JS-heavy sites (Twitter, SPAs)
-or authenticated content (uses your Chrome profile).
-
-Content is printed to stdout. Use -c to copy to clipboard, -o to save to a file.`,
-		Args:    cobra.ExactArgs(1),
-		Version: version,
-		RunE:    run,
-	}
-
-	rootCmd.Flags().StringVarP(&outputFlag, "output", "o", "", "Save to file (directory auto-generates name)")
-	rootCmd.Flags().BoolVar(&noImages, "no-images", false, "Strip image links from output")
-	rootCmd.Flags().DurationVarP(&timeout, "timeout", "t", 30*time.Second, "Page load timeout")
-	rootCmd.Flags().BoolVar(&browserFlag, "browser", false, "Use Chrome browser (for JS-heavy or authenticated sites)")
-	rootCmd.Flags().BoolVarP(&clipboardFlag, "clipboard", "c", false, "Copy output to clipboard")
-
-	if err := rootCmd.Execute(); err != nil {
-		os.Exit(2)
-	}
-}
-
-func run(cmd *cobra.Command, args []string) error {
-	inputURL := args[0]
-
-	// Validate URL
-	if _, err := url.ParseRequestURI(inputURL); err != nil {
-		return fmt.Errorf("invalid URL: %s", inputURL)
-	}
-
-	// Fetch page
-	var pageResult *extractor.PageResult
-	var err error
-	if browserFlag {
-		fmt.Fprintln(os.Stderr, "Fetching page (browser)...")
-		pageResult, err = extractor.FetchPage(inputURL, timeout)
-	} else {
-		fmt.Fprintln(os.Stderr, "Fetching page...")
-		pageResult, err = extractor.FetchPageHTTP(inputURL, timeout)
-	}
-	if err != nil {
-		return fmt.Errorf("failed to fetch page: %w", err)
-	}
-
-	// Extract article
-	fmt.Fprintln(os.Stderr, "Extracting content...")
-	article, err := extractor.ExtractArticle(pageResult.HTML, pageResult.URL)
-	if err != nil {
-		return fmt.Errorf("failed to extract content: %w", err)
-	}
-
-	// Convert to markdown
-	includeImages := !noImages
-	markdown, err := extractor.FormatArticle(article, includeImages)
-	if err != nil {
-		return fmt.Errorf("failed to convert to markdown: %w", err)
-	}
-
-	// Output to stdout
-	fmt.Println(markdown)
-
-	// Copy to clipboard if requested
-	if clipboardFlag {
-		if err := output.CopyToClipboard(markdown); err != nil {
-			fmt.Fprintf(os.Stderr, "Warning: failed to copy to clipboard: %v\n", err)
-		} else {
-			fmt.Fprintln(os.Stderr, "Copied to clipboard!")
-		}
-	}
-
-	// Save to file if requested
-	if outputFlag != "" {
-		filePath, err := output.ResolveOutputPath(outputFlag, article.Title)
-		if err != nil {
-			return fmt.Errorf("failed to resolve output path: %w", err)
-		}
-
-		if err := output.WriteToFile(filePath, markdown); err != nil {
-			return fmt.Errorf("failed to write file: %w", err)
-		}
-
-		fmt.Fprintf(os.Stderr, "Saved to: %s\n", filePath)
-	}
-
-	return nil
+	cmd.Execute()
 }

From e0ffbd03758be8094b33a5dcfb1f637ec178c0fe Mon Sep 17 00:00:00 2001
From: gupsammy <samarthgupta1911@gmail.com>
Date: Wed, 4 Mar 2026 18:46:23 +0530
Subject: [PATCH 2/5] =?UTF-8?q?fix:=20address=20PR=20review=20=E2=80=94=20?=
 =?UTF-8?q?browser-ws,=20fail-fast,=20retries=20validation?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Root command now uses BrowserPool when --browser-ws is provided
  instead of always launching local Chrome profile
- Batch --fail-fast stops scheduling new jobs once context is cancelled
- Negative --retries value is rejected with a clean usage error

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 cmd/batch.go | 10 ++++++++++
 cmd/root.go  | 12 +++++++++++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/cmd/batch.go b/cmd/batch.go
index 61182ad..392a23f 100644
--- a/cmd/batch.go
+++ b/cmd/batch.go
@@ -77,6 +77,11 @@ func runBatch(cmd *cobra.Command, args []string) error {
 		return &UsageError{Msg: fmt.Sprintf("invalid --max-body-size: %v", err)}
 	}
 
+	// Validate retries
+	if retries < 0 {
+		return &UsageError{Msg: "--retries must be non-negative"}
+	}
+
 	// Setup concurrency primitives
 	limiter := batch.NewDomainLimiter(rateLimit)
 	retryCfg := &batch.RetryConfig{
@@ -132,6 +137,11 @@ func runBatch(cmd *cobra.Command, args []string) error {
 		seq := i
 		rawURL := u
 
+		// With --fail-fast, stop scheduling new jobs once context is cancelled
+		if failFast && gctx.Err() != nil {
+			break
+		}
+
 		g.Go(func() error {
 			result := fetchAndExtract(gctx, seq, rawURL, limiter, retryCfg, pool, bodyLimit)
 			results <- result
diff --git a/cmd/root.go b/cmd/root.go
index a663c88..10fe6de 100644
--- a/cmd/root.go
+++ b/cmd/root.go
@@ -127,7 +127,17 @@ func runRoot(cmd *cobra.Command, args []string) error {
 		if !quiet {
 			fmt.Fprintln(os.Stderr, "Fetching page (browser)...")
 		}
-		pageResult, err = extractor.FetchPage(inputURL, timeout)
+		if browserWS != "" {
+			// Use BrowserPool to connect to existing Chrome via DevTools WebSocket
+			pool, poolErr := extractor.NewBrowserPool(browserWS)
+			if poolErr != nil {
+				return fmt.Errorf("failed to connect to browser: %w", poolErr)
+			}
+			defer pool.Close()
+			pageResult, err = pool.FetchPage(cmd.Context(), inputURL, timeout)
+		} else {
+			pageResult, err = extractor.FetchPage(inputURL, timeout)
+		}
 	} else {
 		if !quiet {
 			fmt.Fprintln(os.Stderr, "Fetching page...")

From 2c830ea8ae097a14852d8354941afb701f7fea80 Mon Sep 17 00:00:00 2001
From: gupsammy <samarthgupta1911@gmail.com>
Date: Wed, 4 Mar 2026 19:13:43 +0530
Subject: [PATCH 3/5] fix(ci): add allowed tools for code-review plugin to post
 comments

The code-review plugin needs explicit --allowedTools for the inline
comment MCP tool and gh CLI commands. Without this, all tool calls
were denied (37 permission denials) and no review comments posted.

Also added --comment flag to the prompt so the plugin actually posts
its findings rather than just printing to the action log.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/claude-code-review.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/claude-code-review.yml b/.github/workflows/claude-code-review.yml
index 21cb993..75cc2a1 100644
--- a/.github/workflows/claude-code-review.yml
+++ b/.github/workflows/claude-code-review.yml
@@ -38,7 +38,7 @@ jobs:
           claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
           plugin_marketplaces: 'https://github.com/anthropics/claude-code.git'
           plugins: 'code-review@claude-code-plugins'
-          prompt: '/code-review:code-review ${{ github.repository }}/pull/${{ github.event.pull_request.number }}'
-          # See https://github.com/anthropics/claude-code-action/blob/main/docs/usage.md
-          # or https://code.claude.com/docs/en/cli-reference for available options
+          prompt: '/code-review:code-review --comment ${{ github.repository }}/pull/${{ github.event.pull_request.number }}'
+          claude_args: |
+            --allowedTools "mcp__github_inline_comment__create_inline_comment,Bash(gh issue view:*),Bash(gh search:*),Bash(gh issue list:*),Bash(gh pr comment:*),Bash(gh pr diff:*),Bash(gh pr view:*),Bash(gh pr list:*)"
 

From 615d7ab807e9da74de40b24159584bfaf22faa5d Mon Sep 17 00:00:00 2001
From: gupsammy <samarthgupta1911@gmail.com>
Date: Wed, 4 Mar 2026 19:46:43 +0530
Subject: [PATCH 4/5] =?UTF-8?q?fix(batch):=20address=20code=20review=20?=
 =?UTF-8?q?=E2=80=94=20extractDomain=20and=20file=20write=20race?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- extractDomain: return rawURL when hostname is empty (scheme-less URLs
  were all bucketed under "" in the rate limiter)
- Move writeResultToFile into serialized writer goroutine to eliminate
  TOCTOU race on resolveCollision from concurrent workers
- Add junk/ to .gitignore

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .gitignore              | 3 +++
 batch/ratelimit.go      | 2 +-
 batch/ratelimit_test.go | 3 ++-
 cmd/batch.go            | 9 ++++-----
 4 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/.gitignore b/.gitignore
index 902ee36..6f39f1d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -59,3 +59,6 @@ videos/
 
 # Agent counselors output
 agents/
+
+# Junk / temp files
+junk/
diff --git a/batch/ratelimit.go b/batch/ratelimit.go
index 97d9dc3..1a88a93 100644
--- a/batch/ratelimit.go
+++ b/batch/ratelimit.go
@@ -55,7 +55,7 @@ func (d *DomainLimiter) Wait(ctx context.Context, rawURL string) error {
 
 func extractDomain(rawURL string) string {
 	u, err := url.Parse(rawURL)
-	if err != nil {
+	if err != nil || u.Hostname() == "" {
 		return rawURL
 	}
 	return u.Hostname()
diff --git a/batch/ratelimit_test.go b/batch/ratelimit_test.go
index ed190ec..5973ba7 100644
--- a/batch/ratelimit_test.go
+++ b/batch/ratelimit_test.go
@@ -99,7 +99,8 @@ func TestExtractDomain(t *testing.T) {
 	}{
 		{"https://example.com/path", "example.com"},
 		{"http://sub.example.com:8080/page", "sub.example.com"},
-		{"not-a-url", ""},
+		{"not-a-url", "not-a-url"},
+		{"example.com/page", "example.com/page"},
 	}
 	for _, tt := range tests {
 		got := extractDomain(tt.url)
diff --git a/cmd/batch.go b/cmd/batch.go
index 392a23f..745651d 100644
--- a/cmd/batch.go
+++ b/cmd/batch.go
@@ -122,6 +122,10 @@ func runBatch(cmd *cobra.Command, args []string) error {
 			enc.Encode(r)
 			if r.Status == "ok" {
 				okCount++
+				// Write file in serialized writer goroutine to avoid TOCTOU race
+				if outputFlag != "" && r.Title != nil {
+					writeResultToFile(r)
+				}
 			} else {
 				errCount++
 				errCodes = append(errCodes, r.Error)
@@ -146,11 +150,6 @@ func runBatch(cmd *cobra.Command, args []string) error {
 			result := fetchAndExtract(gctx, seq, rawURL, limiter, retryCfg, pool, bodyLimit)
 			results <- result
 
-			// Write to file if output dir is set and extraction succeeded
-			if outputFlag != "" && result.Status == "ok" && result.Title != nil {
-				writeResultToFile(result)
-			}
-
 			if !quiet {
 				status := "ok"
 				if result.Status == "error" {

From fa15444ebde85bb6354b0fdf77925df77e0b1f73 Mon Sep 17 00:00:00 2001
From: gupsammy <samarthgupta1911@gmail.com>
Date: Thu, 5 Mar 2026 11:09:47 +0530
Subject: [PATCH 5/5] fix(batch): don't retry 4xx HTTP errors in isRetryable

ExtractionError with http_4xx codes fell through the errors.As block
and hit the outer `return true`, causing 404/403/401 to be retried
with exponential backoff. Added `return false` at end of the errors.As
block so only explicit 5xx and generic non-HTTP errors are retried.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 cmd/batch.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cmd/batch.go b/cmd/batch.go
index 745651d..5c458c7 100644
--- a/cmd/batch.go
+++ b/cmd/batch.go
@@ -278,6 +278,8 @@ func isRetryable(err error) bool {
 		if strings.HasPrefix(ee.Code, "http_5") {
 			return true
 		}
+		// Don't retry other known HTTP errors (4xx, etc.)
+		return false
 	}
 	// Retry generic fetch errors (timeouts, connection resets)
 	return true