From 33621bc5c49affa3ab17c12f2ba240e6cd778300 Mon Sep 17 00:00:00 2001
From: GeiserX <9169332+GeiserX@users.noreply.github.com>
Date: Tue, 28 Apr 2026 17:54:53 +0200
Subject: [PATCH 1/3] Rename AGENTS.md to CLAUDE.md

---
 .coderabbit.yaml       | 2 +-
 AGENTS.md => CLAUDE.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename AGENTS.md => CLAUDE.md (93%)

diff --git a/.coderabbit.yaml b/.coderabbit.yaml
index 664e8ac..8ab9b48 100644
--- a/.coderabbit.yaml
+++ b/.coderabbit.yaml
@@ -41,5 +41,5 @@ knowledge_base:
   code_guidelines:
     enabled: true
     filePatterns:
-      - "AGENTS.md"
+      - "CLAUDE.md"
       - "CLAUDE.md"
diff --git a/AGENTS.md b/CLAUDE.md
similarity index 93%
rename from AGENTS.md
rename to CLAUDE.md
index 5c43075..e70d754 100644
--- a/AGENTS.md
+++ b/CLAUDE.md
@@ -1,4 +1,4 @@
-# AGENTS.md — Wayback-Archive
+# CLAUDE.md — Wayback-Archive
 
 ## Project Overview
 

From 038aae610bd54eede81bc382ffd95af66d02f259 Mon Sep 17 00:00:00 2001
From: GeiserX <9169332+GeiserX@users.noreply.github.com>
Date: Mon, 4 May 2026 10:09:18 +0200
Subject: [PATCH 2/3] fix: unify is_html heuristic, fix content sniffing, add tests

Address all review comments from PR #6:
- Extract shared _is_html_url() static method used by both download_file()
  and download(), preventing heuristic drift
- Fix content sniffing to .lstrip() before checking signatures, handling
  BOM/whitespace/Wayback-injected scripts before <!DOCTYPE
- Add tests for the new _is_html_url() helper
---

diff --git a/wayback_archive/downloader.py b/wayback_archive/downloader.py
--- a/wayback_archive/downloader.py
+++ b/wayback_archive/downloader.py
@@ ... @@ ... -> bool:
         url_domain = parsed.netloc.lower().lstrip("www.")
         return any(domain in url_domain for domain in squarespace_domains)
 
+    @staticmethod
+    def _is_html_url(url: str, parsed=None) -> bool:
+        """Determine if a URL likely points to an HTML page based on its path."""
+        if parsed is None:
+            parsed = urlparse(url)
+        path_lower = parsed.path.lower()
+        if not path_lower or path_lower == "/":
+            return True
+        if path_lower.endswith('.html') or path_lower.endswith('.htm'):
+            return True
+        ext = os.path.splitext(path_lower)[1]
+        if ext:
+            return False
+        non_html = {'.css', '.js', '.jpg', '.jpeg', '.png', '.gif', '.svg',
+                    '.woff', '.woff2', '.ttf', '.eot', '.otf', '.ico',
+                    '.json', '.xml', '.txt', '.pdf'}
+        return not any(path_lower.endswith(e) for e in non_html)
+
     def _is_tracker(self, url: str) -> bool:
         """Check if URL is a tracker/analytics script."""
         for pattern in self.TRACKER_PATTERNS:
@@ -519,13 +537,7 @@ def download_file(self, url: str) -> Optional[bytes]:
         # Determine if this is an HTML page (we should NOT fallback to live for HTML)
         parsed = urlparse(url)
         path_lower = parsed.path.lower()
-        is_html_page = (
-            not path_lower or
-            path_lower.endswith('.html') or
-            path_lower.endswith('.htm') or
-            (not os.path.splitext(path_lower)[1] and
-             not any(path_lower.endswith(ext) for ext in ['.css', '.js', '.jpg', '.jpeg', '.png', '.gif', '.svg', '.woff', '.woff2', '.ttf', '.eot', '.otf', '.ico', '.json', '.xml', '.pdf']))
-        )
+        is_html_page = self._is_html_url(url, parsed)
 
         # For HTML pages, try the 'if_' version first to get unwrapped content
         # This avoids the Wayback Machine interface wrapper
@@ -1977,12 +1989,12 @@ def download(self):
             # Try to detect from actual content if still unknown
             if not content_type and len(content) > 0:
-                # Check content signatures
-                if content.startswith(b'
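The heuristic that patch 2 factors out boils down to a short path-based check. The sketch below is illustrative only: it uses a free function named is_html_url rather than the WaybackDownloader static method, and it omits the helper's trailing asset-extension set (which is unreachable once any extension returns False), but it classifies URLs the same way the tests in patch 3 assert:

```python
from urllib.parse import urlparse
import os


def is_html_url(url: str) -> bool:
    """Illustrative restatement of the path-based heuristic (not the project code)."""
    path = urlparse(url).path.lower()
    if not path or path == "/":
        return True   # bare host or site root is a page
    if path.endswith(".html") or path.endswith(".htm"):
        return True   # explicit HTML extension
    if os.path.splitext(path)[1]:
        return False  # any other extension is treated as a non-HTML asset
    return True       # extensionless paths such as /about are pages


assert is_html_url("https://example.com/")
assert is_html_url("https://example.com/about")
assert is_html_url("https://example.com/page.html")
assert not is_html_url("https://example.com/style.css")
assert not is_html_url("https://example.com/data.json")
```

Per the commit message, the point of centralizing this is that download_file() and download() now share one helper instead of each carrying its own copy of the check.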
From: GeiserX <9169332+GeiserX@users.noreply.github.com>
Date: Mon, 4 May 2026 10:31:03 +0200
Subject: [PATCH 3/3] Add missing docstrings to satisfy coverage threshold

---
 tests/test_downloader.py      | 3 +++
 wayback_archive/downloader.py | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/tests/test_downloader.py b/tests/test_downloader.py
index 5eb8289..c8456c5 100644
--- a/tests/test_downloader.py
+++ b/tests/test_downloader.py
@@ -191,14 +191,17 @@ def test_is_html_url_bare_host_and_root(self):
             assert WaybackDownloader._is_html_url(url), f"{url} should be HTML"
 
     def test_is_html_url_html_extensions(self):
+        """HTML extensions are classified as HTML."""
         assert WaybackDownloader._is_html_url("https://example.com/page.html")
         assert WaybackDownloader._is_html_url("https://example.com/page.htm")
 
     def test_is_html_url_extensionless_paths(self):
+        """Extensionless paths are classified as HTML."""
         assert WaybackDownloader._is_html_url("https://example.com/about")
         assert WaybackDownloader._is_html_url("https://example.com/foo/bar")
 
     def test_is_html_url_asset_extensions(self):
+        """Known asset extensions are NOT classified as HTML."""
         for ext in [".css", ".js", ".png", ".jpg", ".gif", ".svg", ".woff2", ".pdf", ".json", ".xml"]:
             url = f"https://example.com/file{ext}"
             assert not WaybackDownloader._is_html_url(url), f"{url} should NOT be HTML"
diff --git a/wayback_archive/downloader.py b/wayback_archive/downloader.py
index c815e14..52654de 100644
--- a/wayback_archive/downloader.py
+++ b/wayback_archive/downloader.py
@@ -927,6 +927,7 @@ def _extract_css_urls(self, css: str, base_url: str) -> List[str]:
     def _rewrite_css_urls(self, css: str, base_url: str) -> str:
         """Rewrite URLs in CSS to relative paths."""
         def replace_css_url(match):
+            """Rewrite a single CSS url() match to a relative local path."""
             full_match = match.group(0)
             url_part = match.group(1)
 
@@ -1732,6 +1733,7 @@ def _process_html(self, html: str, base_url: str) -> tuple[str, List[str]]:
             # Rewrite URLs in inline styles - handle url() functions
             if "web.archive.org" in style or "/web/" in style or "url(" in style:
                 def replace_url_in_style(match):
+                    """Rewrite a single url() inside an inline style attribute."""
                     full_match = match.group(0)
                     url_part = match.group(1) if len(match.groups()) > 0 else full_match
 
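The other half of patch 2 is the content-sniffing fix: per its commit message, the signature check should run only after leading whitespace and BOM-style junk has been stripped from the downloaded bytes. A minimal sketch of that idea follows; the function name sniff_html and the signature list are assumptions for illustration, not the project's actual code, and skipping Wayback-injected scripts is left out here:

```python
def sniff_html(content: bytes) -> bool:
    """Sketch: signature-based HTML detection that tolerates leading junk."""
    body = content.lstrip()                 # drop leading ASCII whitespace first
    if body.startswith(b"\xef\xbb\xbf"):    # then a UTF-8 BOM, then whitespace again
        body = body[3:].lstrip()
    head = body[:256].lower()
    return head.startswith(b"<!doctype") or head.startswith(b"<html")


assert sniff_html(b"\xef\xbb\xbf\n  <!DOCTYPE html><html></html>")
assert sniff_html(b"\n\n<html><head></head></html>")
assert not sniff_html(b'{"content_type": "application/json"}')
```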