Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions lib/crawlscope/crawl.rb
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ def collect(pages, issues)
issues.add(code: :fetch_failed, severity: :error, category: :crawl, url: page.url, message: page.error, details: {})
elsif !@allowed_statuses.include?(page.status)
issues.add(code: :unexpected_status, severity: :error, category: :crawl, url: page.url, message: "HTTP #{page.status}", details: {status: page.status})
elsif redirected?(page)
issues.add(code: :redirected_page, severity: :warning, category: :crawl, url: page.url, message: "redirects to #{page.final_url}", details: {final_url: page.final_url, status: page.status})
end
end
end
Expand Down Expand Up @@ -128,5 +130,9 @@ def resolution(page, normalized_url, crawled:)
status: page.status
}
end

# True when the page did not land on the URL we requested: the
# normalized request URL and the normalized final URL differ.
# Comparing via #to_s makes nil and URL-object values safe to compare.
def redirected?(page)
  requested = page.normalized_url.to_s
  landed = page.normalized_final_url.to_s
  !requested.eql?(landed)
end
end
end
30 changes: 24 additions & 6 deletions lib/crawlscope/rules/links.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
module Crawlscope
module Rules
class Links
CONTEXTUAL_LINK_SELECTORS = "main a[href], article a[href]"
LINK_SELECTORS = "a[href]"
INTERNAL_PATH_PREFIXES_TO_SKIP = ["/rails/", "/cdn-cgi/"].freeze
LINK_SCHEMES_TO_SKIP = ["mailto:", "tel:", "javascript:", "data:"].freeze
MAX_SOURCES_IN_ERROR = 3
Expand Down Expand Up @@ -33,10 +33,7 @@ def call(urls:, pages:, issues:, context:)
private

def contextual_links(doc)
links = doc.css(CONTEXTUAL_LINK_SELECTORS)
return links unless links.empty?

doc.css("a[href]")
doc.css(LINK_SELECTORS)
end

def extract_links(pages)
Expand All @@ -45,7 +42,7 @@ def extract_links(pages)

def page_links(page)
source_path = Url.path(page.normalized_url)
return [] unless crawlable_path?(source_path)
return [] unless crawlable_source_path?(source_path)

contextual_links(page.doc).filter_map do |node|
link_for(page: page, source_path: source_path, node: node)
Expand Down Expand Up @@ -146,6 +143,7 @@ def resolve_links(links, issues)
next
end

report_redirect_target(target_url, grouped_links, issues, target) if target.redirect?
next unless crawlable_path?(target.final_path)

grouped_links.each do |link|
Expand All @@ -156,6 +154,18 @@ def resolve_links(links, issues)
resolved_links
end

# Records a warning when an internal link points at a URL that redirects.
# At most MAX_SOURCES_IN_ERROR distinct source pages are named in the
# message so it stays readable; the same capped list goes into details.
def report_redirect_target(target_url, grouped_links, issues, target)
  sources = grouped_links
    .map { |entry| entry[:source_url] }
    .uniq
    .take(MAX_SOURCES_IN_ERROR)

  issues.add(
    code: :internal_link_redirects,
    severity: :warning,
    category: :links,
    url: target_url,
    message: "internal link redirects to #{target.final_url} (sources: #{sources.join(", ")})",
    details: {final_url: target.final_url, source_urls: sources, status: target.status}
  )
end

def resolve_target(target_url)
resolution = @resolve_target.call(target_url)
LinkTarget.new(target_url: target_url, resolution: resolution)
Expand Down Expand Up @@ -183,11 +193,19 @@ def status
resolution && resolution[:status]
end

# A target counts as a redirect when its HTTP status is in the 3xx
# range, or when the resolved final URL differs from the URL we linked
# to (covers resolvers that report only the post-redirect state).
def redirect?
  code = status
  return true if code && (300..399).cover?(code.to_i)

  final_url != target_url
end

def unresolved?
resolution.nil? || (status.nil? && !ignored_error?)
end
end

# True when +path+ is present and does not live under one of the
# framework-internal prefixes (e.g. /rails/, /cdn-cgi/) that we never
# treat as crawlable page sources.
def crawlable_source_path?(path)
  return false if path.nil?

  INTERNAL_PATH_PREFIXES_TO_SKIP.none? { |skip| path.start_with?(skip) }
end

def skip_internal_path?(path)
return true if path == "/"

Expand Down
68 changes: 57 additions & 11 deletions lib/crawlscope/rules/metadata.rb
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
# frozen_string_literal: true

require "uri"

module Crawlscope
module Rules
class Metadata
TITLE_MAX_LENGTH = 72
DESCRIPTION_MIN_LENGTH = 110
DESCRIPTION_MAX_LENGTH = 160
REQUIRED_OPEN_GRAPH_PROPERTIES = %w[og:title og:description og:url og:type og:image].freeze

attr_reader :code

Expand All @@ -21,22 +25,35 @@ def call(urls:, pages:, issues:, context: nil)
validate_title(page, issues)
validate_description(page, issues)
validate_canonical(page, issues)
validate_open_graph(page, issues)
end
end

private

def validate_h1(page, issues)
return unless page.doc.at_css("h1").nil?

issues.add(
code: :missing_h1,
severity: :warning,
category: :metadata,
url: page.url,
message: "missing <h1>",
details: {}
)
h1s = page.doc.css("h1")
return if h1s.one?

if h1s.empty?
issues.add(
code: :missing_h1,
severity: :warning,
category: :metadata,
url: page.url,
message: "missing <h1>",
details: {}
)
else
issues.add(
code: :multiple_h1,
severity: :warning,
category: :metadata,
url: page.url,
message: "multiple <h1> tags (#{h1s.size})",
details: {count: h1s.size}
)
end
end

def validate_title(page, issues)
Expand All @@ -56,6 +73,8 @@ def validate_description(page, issues)

if description.empty?
issues.add(code: :missing_meta_description, severity: :warning, category: :metadata, url: page.url, message: "missing meta description", details: {})
elsif description.length < DESCRIPTION_MIN_LENGTH
issues.add(code: :meta_description_too_short, severity: :warning, category: :metadata, url: page.url, message: "meta description too short (#{description.length})", details: {length: description.length, minimum: DESCRIPTION_MIN_LENGTH})
elsif description.length > DESCRIPTION_MAX_LENGTH
issues.add(code: :meta_description_too_long, severity: :warning, category: :metadata, url: page.url, message: "meta description too long (#{description.length})", details: {length: description.length})
end
Expand All @@ -71,7 +90,7 @@ def validate_canonical(page, issues)

normalized_canonical = Url.normalize(canonical, base_url: page.url)
normalized_page_url = Url.normalize(page.url, base_url: page.url)
return if normalized_canonical == normalized_page_url
return if canonical_matches_page?(normalized_canonical, normalized_page_url)

issues.add(
code: :canonical_mismatch,
Expand All @@ -88,6 +107,33 @@ def repeated_site_name?(title)

title.split(/[^[:alnum:]]+/).count { |token| token.casecmp?(@site_name) } > 1
end

# Flags pages whose <head> lacks any of the required Open Graph meta
# tags. A property only counts as present when the tag also carries a
# content attribute (the [content] part of the selector).
def validate_open_graph(page, issues)
  missing = REQUIRED_OPEN_GRAPH_PROPERTIES.select do |property|
    page.doc.at_css(%(meta[property="#{property}"][content])).nil?
  end
  return if missing.empty?

  issues.add(
    code: :incomplete_open_graph_tags,
    severity: :warning,
    category: :metadata,
    url: page.url,
    message: "Open Graph tags incomplete (missing #{missing.join(", ")})",
    details: {missing: missing}
  )
end

# A canonical URL matches the page when the normalized URLs are equal,
# or — for pages served from a local/dev host — when only the paths
# match, so a dev site may legitimately point its canonical at the
# production origin.
def canonical_matches_page?(canonical, page_url)
  return true if canonical == page_url

  local_url?(page_url) && Url.path(canonical) == Url.path(page_url)
end

# True when +url+ points at a loopback/development host (localhost,
# 127.0.0.1, 0.0.0.0, or IPv6 ::1).
#
# Bug fix: uses URI#hostname instead of URI#host. #host keeps the
# square brackets around IPv6 literals ("[::1]"), so the bare "::1"
# entry in the list could never match; #hostname strips them.
#
# Returns false (instead of raising) for unparseable URLs.
def local_url?(url)
  host = URI.parse(url.to_s).hostname.to_s
  ["localhost", "127.0.0.1", "0.0.0.0", "::1"].include?(host)
rescue URI::InvalidURIError
  false
end
end
end
end
16 changes: 13 additions & 3 deletions test/crawlscope/crawl_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,13 @@ def test_returns_ok_when_metadata_is_valid
<html>
<head>
<title>Pricing</title>
<meta name="description" content="Plans for hotels and restaurants">
<meta name="description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
<link rel="canonical" href="https://example.com/pricing">
<meta property="og:title" content="Pricing">
<meta property="og:description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
<meta property="og:url" content="https://example.com/pricing">
<meta property="og:type" content="website">
<meta property="og:image" content="https://example.com/icon.png">
<script type="application/ld+json">
{"@context":"https://schema.org","@type":"WebSite","name":"Example","url":"https://example.com"}
</script>
Expand Down Expand Up @@ -95,7 +100,7 @@ def test_collects_metadata_issues_for_invalid_page
).call

refute result.ok?
assert_equal %i[meta_description_too_long missing_canonical missing_h1 missing_structured_data title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort
assert_equal %i[incomplete_open_graph_tags meta_description_too_long missing_canonical missing_h1 missing_structured_data title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort
end

def test_uses_browser_when_renderer_is_browser
Expand Down Expand Up @@ -128,8 +133,13 @@ def fetch(url)
<html>
<head>
<title>Pricing</title>
<meta name="description" content="Plans for hotels and restaurants">
<meta name="description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
<link rel="canonical" href="https://example.com/pricing">
<meta property="og:title" content="Pricing">
<meta property="og:description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
<meta property="og:url" content="https://example.com/pricing">
<meta property="og:type" content="website">
<meta property="og:image" content="https://example.com/icon.png">
<script type="application/ld+json">
{"@context":"https://schema.org","@type":"WebSite","name":"Example","url":"https://example.com"}
</script>
Expand Down
39 changes: 39 additions & 0 deletions test/crawlscope/links_rule_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,45 @@ def test_reports_low_inbound_anchor_links
assert_equal "https://example.com/guide", issues.to_a.first.url
end

# Links from the root page ("/") must count toward inbound-link totals:
# with the root linking to /about, no low_inbound_anchor_links warning
# should be raised for either URL.
def test_counts_root_page_links_as_inbound_links
  issues = Crawlscope::IssueCollection.new
  resolver = ->(target_url) { {crawled: true, error: nil, final_url: target_url, status: 200} }

  Crawlscope::Rules::Links.new.call(
    urls: ["https://example.com/", "https://example.com/about"],
    pages: [
      page(url: "https://example.com/", body: '<main><a href="/about">About</a></main>'),
      page(url: "https://example.com/about", body: '<main><p>About</p></main>')
    ],
    issues: issues,
    context: context(resolver: resolver)
  )

  refute_includes issues.to_a.map(&:code), :low_inbound_anchor_links
end

# When the resolver reports a final URL different from the linked one,
# the rule should emit an :internal_link_redirects issue whose message
# names the redirect destination.
def test_reports_internal_links_that_redirect
  issues = Crawlscope::IssueCollection.new
  resolver = lambda do |_target_url|
    {crawled: false, error: nil, final_url: "https://example.com/pricing", status: 200}
  end

  Crawlscope::Rules::Links.new.call(
    urls: ["https://example.com/guide"],
    pages: [page(url: "https://example.com/guide", body: '<main><a href="/plans">Plans</a></main>')],
    issues: issues,
    context: context(resolver: resolver)
  )

  reported = issues.to_a.find { |issue| issue.code == :internal_link_redirects }
  assert reported
  assert_includes reported.message, "https://example.com/pricing"
end

def test_ignores_links_that_should_not_be_crawled
issues = Crawlscope::IssueCollection.new

Expand Down
77 changes: 77 additions & 0 deletions test/crawlscope/metadata_rule_test.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# frozen_string_literal: true

require "test_helper"

# Unit tests for Crawlscope::Rules::Metadata, driving the rule directly
# with small HTML fixtures (no crawling involved).
class CrawlscopeMetadataRuleTest < Minitest::Test
  # The default fixture (see #page below) has a too-short description,
  # two <h1> tags, and only og:title of the Open Graph set — each of the
  # three corresponding issue codes must be reported.
  def test_reports_short_meta_description_multiple_h1_and_incomplete_open_graph
    issues = Crawlscope::IssueCollection.new

    Crawlscope::Rules::Metadata.new.call(
      urls: [page.url],
      pages: [page],
      issues: issues
    )

    codes = issues.to_a.map(&:code)
    assert_includes codes, :meta_description_too_short
    assert_includes codes, :multiple_h1
    assert_includes codes, :incomplete_open_graph_tags
  end

  # A page served from localhost whose canonical points at the production
  # host but shares the same path must NOT be flagged as a canonical
  # mismatch (dev sites legitimately canonicalize to production).
  def test_allows_localhost_page_with_matching_production_canonical_path
    issues = Crawlscope::IssueCollection.new
    local_page = page(
      url: "http://localhost:3000/about",
      body: <<~HTML
        <html>
          <head>
            <title>About</title>
            <meta name="description" content="A clear description that is long enough for search snippets, local validation checks, and realistic production metadata audits.">
            <link rel="canonical" href="https://www.example.com/about">
            <meta property="og:title" content="About">
            <meta property="og:description" content="About page">
            <meta property="og:url" content="https://www.example.com/about">
            <meta property="og:type" content="website">
            <meta property="og:image" content="https://www.example.com/icon.png">
          </head>
          <body><main><h1>About</h1></main></body>
        </html>
      HTML
    )

    Crawlscope::Rules::Metadata.new.call(
      urls: [local_page.url],
      pages: [local_page],
      issues: issues
    )

    refute_includes issues.to_a.map(&:code), :canonical_mismatch
  end

  private

  # Builds a Crawlscope::Page from a URL and HTML body. The default body
  # is deliberately flawed (short description, duplicate <h1>, incomplete
  # Open Graph tags) to exercise the rule's failure paths.
  def page(url: "https://example.com/about", body: nil)
    body ||= <<~HTML
      <html>
        <head>
          <title>About</title>
          <meta name="description" content="Too short">
          <link rel="canonical" href="https://example.com/about">
          <meta property="og:title" content="About">
        </head>
        <body><main><h1>About</h1><h1>Team</h1></main></body>
      </html>
    HTML

    Crawlscope::Page.new(
      url: url,
      normalized_url: Crawlscope::Url.normalize(url, base_url: url),
      final_url: url,
      normalized_final_url: Crawlscope::Url.normalize(url, base_url: url),
      status: 200,
      headers: {"content-type" => "text/html"},
      body: body,
      doc: Nokogiri::HTML(body)
    )
  end
end
Loading