From 33621bc5c49affa3ab17c12f2ba240e6cd778300 Mon Sep 17 00:00:00 2001
From: GeiserX <9169332+GeiserX@users.noreply.github.com>
Date: Tue, 28 Apr 2026 17:54:53 +0200
Subject: [PATCH 1/3] Rename AGENTS.md to CLAUDE.md

---
 .coderabbit.yaml       | 2 +-
 AGENTS.md => CLAUDE.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename AGENTS.md => CLAUDE.md (93%)

diff --git a/.coderabbit.yaml b/.coderabbit.yaml
index 664e8ac..8ab9b48 100644
--- a/.coderabbit.yaml
+++ b/.coderabbit.yaml
@@ -41,5 +41,5 @@ knowledge_base:
   code_guidelines:
     enabled: true
     filePatterns:
-      - "AGENTS.md"
+      - "CLAUDE.md"
       - "CLAUDE.md"
diff --git a/AGENTS.md b/CLAUDE.md
similarity index 93%
rename from AGENTS.md
rename to CLAUDE.md
index 5c43075..e70d754 100644
--- a/AGENTS.md
+++ b/CLAUDE.md
@@ -1,4 +1,4 @@
-# AGENTS.md — Wayback-Archive
+# CLAUDE.md — Wayback-Archive
 
 ## Project Overview
 

From 038aae610bd54eede81bc382ffd95af66d02f259 Mon Sep 17 00:00:00 2001
From: GeiserX <9169332+GeiserX@users.noreply.github.com>
Date: Mon, 4 May 2026 10:09:18 +0200
Subject: [PATCH 2/3] fix: unify is_html heuristic, fix content sniffing, add tests

Address all review comments from PR #6:
- Extract shared _is_html_url() static method used by both download_file()
  and download(), preventing heuristic drift
- Fix content sniffing to .lstrip() before checking signatures, handling
  BOM/whitespace/Wayback-injected scripts before <!DOCTYPE
- Add tests for the new _is_html_url() helper
---

diff --git a/wayback_archive/downloader.py b/wayback_archive/downloader.py
--- a/wayback_archive/downloader.py
+++ b/wayback_archive/downloader.py
@@ ... @@ ... -> bool:
         url_domain = parsed.netloc.lower().lstrip("www.")
         return any(domain in url_domain for domain in squarespace_domains)
 
+    @staticmethod
+    def _is_html_url(url: str, parsed=None) -> bool:
+        """Determine if a URL likely points to an HTML page based on its path."""
+        if parsed is None:
+            parsed = urlparse(url)
+        path_lower = parsed.path.lower()
+        if not path_lower or path_lower == "/":
+            return True
+        if path_lower.endswith('.html') or path_lower.endswith('.htm'):
+            return True
+        ext = os.path.splitext(path_lower)[1]
+        if ext:
+            return False
+        non_html = {'.css', '.js', '.jpg', '.jpeg', '.png', '.gif', '.svg',
+                    '.woff', '.woff2', '.ttf', '.eot', '.otf', '.ico',
+                    '.json', '.xml', '.txt', '.pdf'}
+        return not any(path_lower.endswith(e) for e in non_html)
+
     def _is_tracker(self, url: str) -> bool:
         """Check if URL is a tracker/analytics script."""
         for pattern in self.TRACKER_PATTERNS:
@@ -519,13 +537,7 @@ def download_file(self, url: str) -> Optional[bytes]:
         # Determine if this is an HTML page (we should NOT fallback to live for HTML)
         parsed = urlparse(url)
         path_lower = parsed.path.lower()
-        is_html_page = (
-            not path_lower or
-            path_lower.endswith('.html') or
-            path_lower.endswith('.htm') or
-            (not os.path.splitext(path_lower)[1] and
-             not any(path_lower.endswith(ext) for ext in ['.css', '.js', '.jpg', '.jpeg', '.png', '.gif', '.svg', '.woff', '.woff2', '.ttf', '.eot', '.otf', '.ico', '.json', '.xml', '.pdf']))
-        )
+        is_html_page = self._is_html_url(url, parsed)
 
         # For HTML pages, try the 'if_' version first to get unwrapped content
         # This avoids the Wayback Machine interface wrapper
@@ -1977,12 +1989,12 @@ def download(self):
             # Try to detect from actual content if still unknown
             if not content_type and len(content) > 0:
-                # Check content signatures
-                if content.startswith(b'
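The heuristic that patch 2 factors out boils down to a short path-based check. The sketch below is illustrative only: it uses a free function named is_html_url rather than the WaybackDownloader static method, and it omits the helper's trailing asset-extension set (which is unreachable once any extension returns False), but it classifies URLs the same way the tests in patch 3 assert:

```python
from urllib.parse import urlparse
import os


def is_html_url(url: str) -> bool:
    """Illustrative restatement of the path-based heuristic (not the project code)."""
    path = urlparse(url).path.lower()
    if not path or path == "/":
        return True   # bare host or site root is a page
    if path.endswith(".html") or path.endswith(".htm"):
        return True   # explicit HTML extension
    if os.path.splitext(path)[1]:
        return False  # any other extension is treated as a non-HTML asset
    return True       # extensionless paths such as /about are pages


assert is_html_url("https://example.com/")
assert is_html_url("https://example.com/about")
assert is_html_url("https://example.com/page.html")
assert not is_html_url("https://example.com/style.css")
assert not is_html_url("https://example.com/data.json")
```

Per the commit message, the point of centralizing this is that download_file() and download() now share one helper instead of each carrying its own copy of the check.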
From: GeiserX <9169332+GeiserX@users.noreply.github.com>
Date: Mon, 4 May 2026 10:31:03 +0200
Subject: [PATCH 3/3] Add missing docstrings to satisfy coverage threshold

---
 tests/test_downloader.py      | 3 +++
 wayback_archive/downloader.py | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/tests/test_downloader.py b/tests/test_downloader.py
index 5eb8289..c8456c5 100644
--- a/tests/test_downloader.py
+++ b/tests/test_downloader.py
@@ -191,14 +191,17 @@ def test_is_html_url_bare_host_and_root(self):
             assert WaybackDownloader._is_html_url(url), f"{url} should be HTML"
 
     def test_is_html_url_html_extensions(self):
+        """HTML extensions are classified as HTML."""
         assert WaybackDownloader._is_html_url("https://example.com/page.html")
         assert WaybackDownloader._is_html_url("https://example.com/page.htm")
 
     def test_is_html_url_extensionless_paths(self):
+        """Extensionless paths are classified as HTML."""
         assert WaybackDownloader._is_html_url("https://example.com/about")
         assert WaybackDownloader._is_html_url("https://example.com/foo/bar")
 
     def test_is_html_url_asset_extensions(self):
+        """Known asset extensions are NOT classified as HTML."""
         for ext in [".css", ".js", ".png", ".jpg", ".gif", ".svg", ".woff2", ".pdf", ".json", ".xml"]:
             url = f"https://example.com/file{ext}"
             assert not WaybackDownloader._is_html_url(url), f"{url} should NOT be HTML"
diff --git a/wayback_archive/downloader.py b/wayback_archive/downloader.py
index c815e14..52654de 100644
--- a/wayback_archive/downloader.py
+++ b/wayback_archive/downloader.py
@@ -927,6 +927,7 @@ def _extract_css_urls(self, css: str, base_url: str) -> List[str]:
     def _rewrite_css_urls(self, css: str, base_url: str) -> str:
         """Rewrite URLs in CSS to relative paths."""
         def replace_css_url(match):
+            """Rewrite a single CSS url() match to a relative local path."""
             full_match = match.group(0)
             url_part = match.group(1)
 
@@ -1732,6 +1733,7 @@ def _process_html(self, html: str, base_url: str) -> tuple[str, List[str]]:
             # Rewrite URLs in inline styles - handle url() functions
             if "web.archive.org" in style or "/web/" in style or "url(" in style:
                 def replace_url_in_style(match):
+                    """Rewrite a single url() inside an inline style attribute."""
                     full_match = match.group(0)
                     url_part = match.group(1) if len(match.groups()) > 0 else full_match
 
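The other half of patch 2 is the content-sniffing fix: per its commit message, the signature check should run only after leading whitespace and BOM-style junk has been stripped from the downloaded bytes. A minimal sketch of that idea follows; the function name sniff_html and the signature list are assumptions for illustration, not the project's actual code, and skipping Wayback-injected scripts is left out here:

```python
def sniff_html(content: bytes) -> bool:
    """Sketch: signature-based HTML detection that tolerates leading junk."""
    body = content.lstrip()                 # drop leading ASCII whitespace first
    if body.startswith(b"\xef\xbb\xbf"):    # then a UTF-8 BOM, then whitespace again
        body = body[3:].lstrip()
    head = body[:256].lower()
    return head.startswith(b"<!doctype") or head.startswith(b"<html")


assert sniff_html(b"\xef\xbb\xbf\n  <!DOCTYPE html><html></html>")
assert sniff_html(b"\n\n<html><head></head></html>")
assert not sniff_html(b'{"content_type": "application/json"}')
```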