Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .coderabbit.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,5 @@ knowledge_base:
code_guidelines:
enabled: true
filePatterns:
- "AGENTS.md"
- "CLAUDE.md"
- "CLAUDE.md"
Comment on lines 43 to 45
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Duplicate "CLAUDE.md" entry in filePatterns.

Lines 44 and 45 are identical. The rename of AGENTS.md → CLAUDE.md landed on top of an already-existing CLAUDE.md entry.

🛠️ Proposed fix
     filePatterns:
-      - "CLAUDE.md"
       - "CLAUDE.md"
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In @.coderabbit.yaml around lines 43 - 45, Remove the duplicate entry in the
filePatterns array so "CLAUDE.md" only appears once; locate the filePatterns
block in .coderabbit.yaml and delete the redundant "CLAUDE.md" line (the
duplicate entries under filePatterns) so the array contains a single "CLAUDE.md"
entry.

2 changes: 1 addition & 1 deletion AGENTS.md → CLAUDE.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# AGENTS.md — Wayback-Archive
# CLAUDE.md — Wayback-Archive

## Project Overview

Expand Down
21 changes: 21 additions & 0 deletions tests/test_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,3 +185,24 @@ def test_process_html_rewrites_legacy_background_attributes(self):
# The background image must be queued for download
assert "http://example.com/background.gif" in links_to_follow

def test_is_html_url_bare_host_and_root(self):
    """Bare-host and root URLs are classified as HTML."""
    bare_host = "https://example.com"
    site_root = "https://example.com/"
    assert WaybackDownloader._is_html_url(bare_host), f"{bare_host} should be HTML"
    assert WaybackDownloader._is_html_url(site_root), f"{site_root} should be HTML"

def test_is_html_url_html_extensions(self):
    """HTML extensions are classified as HTML."""
    for page in ("page.html", "page.htm"):
        assert WaybackDownloader._is_html_url(f"https://example.com/{page}")

def test_is_html_url_extensionless_paths(self):
    """Extensionless paths are classified as HTML."""
    single_segment = "https://example.com/about"
    nested_path = "https://example.com/foo/bar"
    assert WaybackDownloader._is_html_url(single_segment)
    assert WaybackDownloader._is_html_url(nested_path)

def test_is_html_url_asset_extensions(self):
    """Known asset extensions are NOT classified as HTML."""
    asset_extensions = (
        ".css", ".js", ".png", ".jpg", ".gif",
        ".svg", ".woff2", ".pdf", ".json", ".xml",
    )
    for ext in asset_extensions:
        url = f"https://example.com/file{ext}"
        assert not WaybackDownloader._is_html_url(url), f"{url} should NOT be HTML"

2 changes: 1 addition & 1 deletion wayback_archive/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Wayback-Archive - A comprehensive tool for downloading websites from Wayback Machine."""

__version__ = "1.4.0"
__version__ = "1.4.1"


46 changes: 27 additions & 19 deletions wayback_archive/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,24 @@ def _is_squarespace_cdn(self, url: str) -> bool:
url_domain = parsed.netloc.lower().lstrip("www.")
return any(domain in url_domain for domain in squarespace_domains)

@staticmethod
def _is_html_url(url: str, parsed=None) -> bool:
"""Determine if a URL likely points to an HTML page based on its path."""
if parsed is None:
parsed = urlparse(url)
path_lower = parsed.path.lower()
if not path_lower or path_lower == "/":
return True
if path_lower.endswith('.html') or path_lower.endswith('.htm'):
return True
ext = os.path.splitext(path_lower)[1]
if ext:
return False
non_html = {'.css', '.js', '.jpg', '.jpeg', '.png', '.gif', '.svg',
'.woff', '.woff2', '.ttf', '.eot', '.otf', '.ico',
'.json', '.xml', '.txt', '.pdf'}
return not any(path_lower.endswith(e) for e in non_html)

def _is_tracker(self, url: str) -> bool:
"""Check if URL is a tracker/analytics script."""
for pattern in self.TRACKER_PATTERNS:
Expand Down Expand Up @@ -519,13 +537,7 @@ def download_file(self, url: str) -> Optional[bytes]:
# Determine if this is an HTML page (we should NOT fallback to live for HTML)
parsed = urlparse(url)
path_lower = parsed.path.lower()
is_html_page = (
not path_lower or
path_lower.endswith('.html') or
path_lower.endswith('.htm') or
(not os.path.splitext(path_lower)[1] and
not any(path_lower.endswith(ext) for ext in ['.css', '.js', '.jpg', '.jpeg', '.png', '.gif', '.svg', '.woff', '.woff2', '.ttf', '.eot', '.otf', '.ico', '.json', '.xml', '.pdf']))
)
is_html_page = self._is_html_url(url, parsed)

# For HTML pages, try the 'if_' version first to get unwrapped content
# This avoids the Wayback Machine interface wrapper
Expand Down Expand Up @@ -915,6 +927,7 @@ def _extract_css_urls(self, css: str, base_url: str) -> List[str]:
def _rewrite_css_urls(self, css: str, base_url: str) -> str:
"""Rewrite URLs in CSS to relative paths."""
def replace_css_url(match):
"""Rewrite a single CSS url() match to a relative local path."""
full_match = match.group(0)
url_part = match.group(1)

Expand Down Expand Up @@ -1720,6 +1733,7 @@ def _process_html(self, html: str, base_url: str) -> tuple[str, List[str]]:
# Rewrite URLs in inline styles - handle url() functions
if "web.archive.org" in style or "/web/" in style or "url(" in style:
def replace_url_in_style(match):
"""Rewrite a single url() inside an inline style attribute."""
full_match = match.group(0)
url_part = match.group(1) if len(match.groups()) > 0 else full_match

Expand Down Expand Up @@ -1977,12 +1991,12 @@ def download(self):

# Try to detect from actual content if still unknown
if not content_type and len(content) > 0:
# Check content signatures
if content.startswith(b'<!DOCTYPE') or content.startswith(b'<html') or content.startswith(b'<HTML'):
content_stripped = content.lstrip()
if content_stripped.startswith((b'<!DOCTYPE', b'<!doctype', b'<html', b'<HTML')):
content_type = "text/html"
elif content.startswith(b'/*') or content.startswith(b'@charset') or b'@media' in content[:200]:
elif content_stripped.startswith((b'/*', b'@charset')) or b'@media' in content[:200]:
content_type = "text/css"
elif content.startswith(b'<?xml') or b'<svg' in content[:200]:
elif content_stripped.startswith(b'<?xml') or b'<svg' in content[:200]:
content_type = "image/svg+xml"
elif content.startswith(b'\x89PNG'):
content_type = "image/png"
Expand Down Expand Up @@ -2016,14 +2030,8 @@ def download(self):
# Process based on content type - be more conservative about what we treat as HTML
is_html = (
not is_google_fonts_css and (
content_type == "text/html" or
(not content_type and (
url.endswith(".html") or
url.endswith(".htm") or
# Bare-host or root URLs (empty path or "/") are HTML.
(not parsed.path or parsed.path == "/") or
(parsed.path and not os.path.splitext(parsed.path)[1] and "?" not in url and not any(parsed.path.lower().endswith(ext) for ext in [".css", ".js", ".json", ".xml", ".txt"]))
))
content_type == "text/html" or
(not content_type and self._is_html_url(url, parsed))
)
)

Expand Down
Loading