From 5e3756ee843871514f3fafd7962f4a62fa3585ae Mon Sep 17 00:00:00 2001
From: claude
Date: Tue, 14 Apr 2026 09:32:15 -0400
Subject: [PATCH] Detect bare-host/root URLs as HTML so link extraction runs

For a target like https://example.com or https://example.com/ the parsed
path is '' or '/', and the existing is_html heuristic fell through to
False. The downloader then saved the page as a binary blob and never
extracted its link/asset references, leaving the archive with only the
index file and no assets. Treat empty and '/' paths as HTML.
---
 wayback_archive/downloader.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/wayback_archive/downloader.py b/wayback_archive/downloader.py
index 5943423..5c415ca 100644
--- a/wayback_archive/downloader.py
+++ b/wayback_archive/downloader.py
@@ -2018,8 +2018,10 @@ def download(self):
         not is_google_fonts_css and (
             content_type == "text/html" or
             (not content_type and (
-                url.endswith(".html") or
+                url.endswith(".html") or url.endswith(".htm") or
+                # Bare-host or root URLs (empty path or "/") are HTML.
+                (not parsed.path or parsed.path == "/") or
                 (parsed.path and not os.path.splitext(parsed.path)[1] and "?" not in url and
                  not any(parsed.path.lower().endswith(ext) for ext in [".css", ".js", ".json", ".xml", ".txt"]))
             ))
         )