diff --git a/.coderabbit.yaml b/.coderabbit.yaml
index 664e8ac..8ab9b48 100644
--- a/.coderabbit.yaml
+++ b/.coderabbit.yaml
@@ -41,5 +41,5 @@ knowledge_base:
   code_guidelines:
     enabled: true
     filePatterns:
-      - "AGENTS.md"
+      - "CLAUDE.md"
       - "CLAUDE.md"
diff --git a/AGENTS.md b/CLAUDE.md
similarity index 93%
rename from AGENTS.md
rename to CLAUDE.md
index 5c43075..e70d754 100644
--- a/AGENTS.md
+++ b/CLAUDE.md
@@ -1,4 +1,4 @@
-# AGENTS.md — Wayback-Archive
+# CLAUDE.md — Wayback-Archive
 
 ## Project Overview
 
diff --git a/tests/test_downloader.py b/tests/test_downloader.py
index 562b1e3..c8456c5 100644
--- a/tests/test_downloader.py
+++ b/tests/test_downloader.py
@@ -185,3 +185,24 @@ def test_process_html_rewrites_legacy_background_attributes(self):
         # The background image must be queued for download
         assert "http://example.com/background.gif" in links_to_follow
 
+    def test_is_html_url_bare_host_and_root(self):
+        """Bare-host and root URLs are classified as HTML."""
+        for url in ["https://example.com", "https://example.com/"]:
+            assert WaybackDownloader._is_html_url(url), f"{url} should be HTML"
+
+    def test_is_html_url_html_extensions(self):
+        """HTML extensions are classified as HTML."""
+        assert WaybackDownloader._is_html_url("https://example.com/page.html")
+        assert WaybackDownloader._is_html_url("https://example.com/page.htm")
+
+    def test_is_html_url_extensionless_paths(self):
+        """Extensionless paths are classified as HTML."""
+        assert WaybackDownloader._is_html_url("https://example.com/about")
+        assert WaybackDownloader._is_html_url("https://example.com/foo/bar")
+
+    def test_is_html_url_asset_extensions(self):
+        """Known asset extensions are NOT classified as HTML."""
+        for ext in [".css", ".js", ".png", ".jpg", ".gif", ".svg", ".woff2", ".pdf", ".json", ".xml"]:
+            url = f"https://example.com/file{ext}"
+            assert not WaybackDownloader._is_html_url(url), f"{url} should NOT be HTML"
+
diff --git a/wayback_archive/__init__.py b/wayback_archive/__init__.py
index fe8b3da..bc8b8cc 100644
--- a/wayback_archive/__init__.py
+++ b/wayback_archive/__init__.py
@@ -1,5 +1,5 @@
 """Wayback-Archive - A comprehensive tool for downloading websites from Wayback Machine."""
 
-__version__ = "1.4.0"
+__version__ = "1.4.1"
 
diff --git a/wayback_archive/downloader.py b/wayback_archive/downloader.py
index 5c415ca..52654de 100644
--- a/wayback_archive/downloader.py
+++ b/wayback_archive/downloader.py
@@ -143,6 +143,24 @@ def _is_squarespace_cdn(self, url: str) -> bool:
         url_domain = parsed.netloc.lower().lstrip("www.")
         return any(domain in url_domain for domain in squarespace_domains)
 
+    @staticmethod
+    def _is_html_url(url: str, parsed=None) -> bool:
+        """Determine if a URL likely points to an HTML page based on its path."""
+        if parsed is None:
+            parsed = urlparse(url)
+        path_lower = parsed.path.lower()
+        if not path_lower or path_lower == "/":
+            return True
+        if path_lower.endswith('.html') or path_lower.endswith('.htm'):
+            return True
+        ext = os.path.splitext(path_lower)[1]
+        if ext:
+            return False
+        non_html = {'.css', '.js', '.jpg', '.jpeg', '.png', '.gif', '.svg',
+                    '.woff', '.woff2', '.ttf', '.eot', '.otf', '.ico',
+                    '.json', '.xml', '.txt', '.pdf'}
+        return not any(path_lower.endswith(e) for e in non_html)
+
     def _is_tracker(self, url: str) -> bool:
         """Check if URL is a tracker/analytics script."""
         for pattern in self.TRACKER_PATTERNS:
@@ -519,13 +537,7 @@ def download_file(self, url: str) -> Optional[bytes]:
         # Determine if this is an HTML page (we should NOT fallback to live for HTML)
         parsed = urlparse(url)
         path_lower = parsed.path.lower()
-        is_html_page = (
-            not path_lower or
-            path_lower.endswith('.html') or
-            path_lower.endswith('.htm') or
-            (not os.path.splitext(path_lower)[1] and
-             not any(path_lower.endswith(ext) for ext in ['.css', '.js', '.jpg', '.jpeg', '.png', '.gif', '.svg', '.woff', '.woff2', '.ttf', '.eot', '.otf', '.ico', '.json', '.xml', '.pdf']))
-        )
+        is_html_page = self._is_html_url(url, parsed)
 
         # For HTML pages, try the 'if_' version first to get unwrapped content
         # This avoids the Wayback Machine interface wrapper
@@ -915,6 +927,7 @@ def _extract_css_urls(self, css: str, base_url: str) -> List[str]:
 
     def _rewrite_css_urls(self, css: str, base_url: str) -> str:
         """Rewrite URLs in CSS to relative paths."""
         def replace_css_url(match):
+            """Rewrite a single CSS url() match to a relative local path."""
             full_match = match.group(0)
             url_part = match.group(1)
@@ -1720,6 +1733,7 @@ def _process_html(self, html: str, base_url: str) -> tuple[str, List[str]]:
 
             # Rewrite URLs in inline styles - handle url() functions
             if "web.archive.org" in style or "/web/" in style or "url(" in style:
                 def replace_url_in_style(match):
+                    """Rewrite a single url() inside an inline style attribute."""
                     full_match = match.group(0)
                     url_part = match.group(1) if len(match.groups()) > 0 else full_match
@@ -1977,12 +1991,12 @@ def download(self):
         # Try to detect from actual content if still unknown
         if not content_type and len(content) > 0:
-            # Check content signatures
-            if content.startswith(b'