# patchright-docker/fetch.py at main · borski/patchright-docker · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#!/usr/bin/env python3
"""
General-purpose stealth page fetcher using Patchright.

Usage:
    # Fetch page text (default)
    python3 fetch.py "https://example.com"

    # Fetch specific CSS selector
    python3 fetch.py "https://example.com" --selector "table"

    # Fetch as HTML instead of text
    python3 fetch.py "https://example.com" --html

    # JSON output (includes title, url, content)
    python3 fetch.py "https://example.com" --json

    # Take a screenshot
    python3 fetch.py "https://example.com" --screenshot /output/page.png

    # Multiple selectors
    python3 fetch.py "https://example.com" --selector "table" --selector "h1"

    # Wait longer for JS-heavy pages
    python3 fetch.py "https://example.com" --wait 10000

    # Visit a warmup URL first (bypasses some Cloudflare challenges)
    python3 fetch.py "https://example.com/protected" --warmup "https://example.com"

    # Execute JavaScript and return the result
    python3 fetch.py "https://example.com" --js "document.title"

    # Click something before extracting
    python3 fetch.py "https://example.com" --click "#load-more" --wait 2000

    # Fill a form field
    python3 fetch.py "https://example.com" --fill "#search" "my query" --click "#submit"
"""

import argparse
import json
import shutil
import sys
import tempfile

from patchright.sync_api import sync_playwright


def _build_parser():
    """Return the argparse parser describing the fetcher's command line."""
    parser = argparse.ArgumentParser(description="Stealth page fetcher via Patchright")
    parser.add_argument("url", help="URL to fetch")
    parser.add_argument(
        "--selector", action="append", help="CSS selector(s) to extract (default: body)"
    )
    parser.add_argument(
        "--html", action="store_true", help="Return HTML instead of text"
    )
    parser.add_argument(
        "--json",
        action="store_true",
        dest="json_output",
        help="JSON output with title, url, content",
    )
    parser.add_argument("--screenshot", metavar="PATH", help="Save screenshot to path")
    parser.add_argument("--full-page", action="store_true", help="Full page screenshot")
    parser.add_argument(
        "--wait", type=int, default=5000, help="Wait ms after load (default: 5000)"
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=30000,
        help="Navigation timeout ms (default: 30000)",
    )
    parser.add_argument(
        "--warmup", metavar="URL", help="Visit this URL first to warm up session"
    )
    parser.add_argument(
        "--js", metavar="EXPR", help="Execute JS expression and return result"
    )
    parser.add_argument(
        "--click",
        action="append",
        metavar="SEL",
        help="Click selector(s) before extracting",
    )
    parser.add_argument(
        "--fill",
        nargs=2,
        action="append",
        metavar=("SEL", "VALUE"),
        help="Fill input before extracting",
    )
    parser.add_argument("--user-agent", help="Override user agent string")
    return parser


def _is_challenge_title(title):
    """Return True when *title* matches the Cloudflare challenge page titles."""
    return "Attention" in title or "Just a moment" in title


def _run_session(page, args):
    """Navigate *page* as requested by *args* and print the selected output.

    Order of operations: optional warmup visit, main navigation, Cloudflare
    challenge check, ``--fill`` actions, ``--click`` actions, optional
    screenshot, then either the ``--js`` result or the extracted content.
    Exits the process with status 1 if the challenge page is still showing
    after the extra grace period.
    """
    # Warmup visit (Cloudflare bypass pattern)
    if args.warmup:
        page.goto(args.warmup, wait_until="domcontentloaded", timeout=args.timeout)
        page.wait_for_timeout(3000)

    # Main navigation
    page.goto(args.url, wait_until="domcontentloaded", timeout=args.timeout)
    page.wait_for_timeout(args.wait)

    # Check for Cloudflare challenge; give it one extra wait to resolve.
    if _is_challenge_title(page.title()):
        page.wait_for_timeout(8000)
        if _is_challenge_title(page.title()):
            print("ERROR: Cloudflare challenge did not resolve", file=sys.stderr)
            sys.exit(1)

    # Fill actions run before click actions, each followed by a short pause.
    for sel, value in args.fill or []:
        page.fill(sel, value)
        page.wait_for_timeout(500)

    for sel in args.click or []:
        page.click(sel)
        page.wait_for_timeout(1000)

    # Screenshot
    if args.screenshot:
        page.screenshot(path=args.screenshot, full_page=args.full_page)
        # A screenshot-only invocation stops here; otherwise fall through so
        # the JS/content output is produced as well.
        if not args.json_output and not args.selector and not args.js:
            print(f"Screenshot saved to {args.screenshot}")
            return

    # JS execution takes precedence over selector-based extraction.
    if args.js:
        result = page.evaluate(args.js)
        if args.json_output:
            output = {
                "title": page.title(),
                "url": page.url,
                "js_result": result,
            }
            print(json.dumps(output, indent=2, ensure_ascii=False))
        else:
            print(result)
        return

    # Content extraction: every match of every selector, in selector order.
    selectors = args.selector or ["body"]
    results = []
    for sel in selectors:
        for el in page.query_selector_all(sel):
            results.append(el.inner_html() if args.html else el.inner_text())

    content = "\n\n".join(results)

    if args.json_output:
        output = {
            "title": page.title(),
            "url": page.url,
            "selectors": selectors,
            "content": content,
        }
        print(json.dumps(output, indent=2, ensure_ascii=False))
    else:
        print(content)


def main():
    """Fetch a page in a Patchright Chromium session and print the result.

    Parses the command line, launches a persistent Chromium context backed by
    a throwaway profile directory, runs the requested navigation and actions,
    and writes the output to stdout. The browser context is always closed and
    the temporary profile directory is always removed, including when the
    run fails or exits early.

    Exits with status 1 when the Cloudflare challenge does not resolve.
    """
    args = _build_parser().parse_args()

    # mkdtemp() never cleans up after itself, so the profile directory is
    # removed explicitly in the outer ``finally`` below.
    profile_dir = tempfile.mkdtemp()
    try:
        with sync_playwright() as p:
            launch_args = ["--window-size=1440,900"]
            if args.user_agent:
                launch_args.append(f"--user-agent={args.user_agent}")

            # NOTE(review): headless=False presumably relies on a display
            # being available in the container (e.g. Xvfb) — confirm.
            ctx = p.chromium.launch_persistent_context(
                user_data_dir=profile_dir,
                headless=False,
                args=launch_args,
                viewport={"width": 1440, "height": 900},
            )
            try:
                # Reuse the context's existing page if there is one.
                page = ctx.pages[0] if ctx.pages else ctx.new_page()
                _run_session(page, args)
            finally:
                ctx.close()
    finally:
        # Best-effort removal: a cleanup problem must not mask the real
        # outcome of the fetch.
        shutil.rmtree(profile_dir, ignore_errors=True)


# Run the fetcher only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()