From 5e3756ee843871514f3fafd7962f4a62fa3585ae Mon Sep 17 00:00:00 2001
From: claude
Date: Tue, 14 Apr 2026 09:32:15 -0400
Subject: [PATCH] Detect bare-host/root URLs as HTML so link extraction runs

For a target like https://example.com or https://example.com/ the parsed
path is '' or '/', and the existing is_html heuristic fell through to
False. The downloader then saved the page as a binary blob and never
extracted its link/asset references, leaving the archive with only the
index file and no assets. Treat empty and '/' paths as HTML.
---
 wayback_archive/downloader.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/wayback_archive/downloader.py b/wayback_archive/downloader.py
index 5943423..5c415ca 100644
--- a/wayback_archive/downloader.py
+++ b/wayback_archive/downloader.py
@@ -2018,8 +2018,10 @@ def download(self):
         not is_google_fonts_css and (
             content_type == "text/html" or
             (not content_type and (
-                url.endswith(".html") or
+                url.endswith(".html") or url.endswith(".htm") or
+                # Bare-host or root URLs (empty path or "/") are HTML.
+                (not parsed.path or parsed.path == "/") or
                 (parsed.path and not os.path.splitext(parsed.path)[1] and "?" not in url and
                  not any(parsed.path.lower().endswith(ext) for ext in [".css", ".js", ".json", ".xml", ".txt"]))
             ))
         )