diff --git a/lib/crawlscope/crawl.rb b/lib/crawlscope/crawl.rb
index d2156c4..f32fdef 100644
--- a/lib/crawlscope/crawl.rb
+++ b/lib/crawlscope/crawl.rb
@@ -81,6 +81,8 @@ def collect(pages, issues)
issues.add(code: :fetch_failed, severity: :error, category: :crawl, url: page.url, message: page.error, details: {})
elsif !@allowed_statuses.include?(page.status)
issues.add(code: :unexpected_status, severity: :error, category: :crawl, url: page.url, message: "HTTP #{page.status}", details: {status: page.status})
+ elsif redirected?(page)
+ issues.add(code: :redirected_page, severity: :warning, category: :crawl, url: page.url, message: "redirects to #{page.final_url}", details: {final_url: page.final_url, status: page.status})
end
end
end
@@ -128,5 +130,9 @@ def resolution(page, normalized_url, crawled:)
status: page.status
}
end
+
+ def redirected?(page)
+ page.normalized_url.to_s != page.normalized_final_url.to_s
+ end
end
end
diff --git a/lib/crawlscope/rules/links.rb b/lib/crawlscope/rules/links.rb
index 56ad8e8..a17915b 100644
--- a/lib/crawlscope/rules/links.rb
+++ b/lib/crawlscope/rules/links.rb
@@ -5,7 +5,7 @@
module Crawlscope
module Rules
class Links
- CONTEXTUAL_LINK_SELECTORS = "main a[href], article a[href]"
+ LINK_SELECTORS = "a[href]"
INTERNAL_PATH_PREFIXES_TO_SKIP = ["/rails/", "/cdn-cgi/"].freeze
LINK_SCHEMES_TO_SKIP = ["mailto:", "tel:", "javascript:", "data:"].freeze
MAX_SOURCES_IN_ERROR = 3
@@ -33,10 +33,7 @@ def call(urls:, pages:, issues:, context:)
private
def contextual_links(doc)
- links = doc.css(CONTEXTUAL_LINK_SELECTORS)
- return links unless links.empty?
-
- doc.css("a[href]")
+ doc.css(LINK_SELECTORS)
end
def extract_links(pages)
@@ -45,7 +42,7 @@ def extract_links(pages)
def page_links(page)
source_path = Url.path(page.normalized_url)
- return [] unless crawlable_path?(source_path)
+ return [] unless crawlable_source_path?(source_path)
contextual_links(page.doc).filter_map do |node|
link_for(page: page, source_path: source_path, node: node)
@@ -146,6 +143,7 @@ def resolve_links(links, issues)
next
end
+ report_redirect_target(target_url, grouped_links, issues, target) if target.redirect?
next unless crawlable_path?(target.final_path)
grouped_links.each do |link|
@@ -156,6 +154,18 @@ def resolve_links(links, issues)
resolved_links
end
+ def report_redirect_target(target_url, grouped_links, issues, target)
+ source_urls = grouped_links.map { |link| link[:source_url] }.uniq.first(MAX_SOURCES_IN_ERROR)
+ issues.add(
+ code: :internal_link_redirects,
+ severity: :warning,
+ category: :links,
+ url: target_url,
+ message: "internal link redirects to #{target.final_url} (sources: #{source_urls.join(", ")})",
+ details: {final_url: target.final_url, source_urls: source_urls, status: target.status}
+ )
+ end
+
def resolve_target(target_url)
resolution = @resolve_target.call(target_url)
LinkTarget.new(target_url: target_url, resolution: resolution)
@@ -183,11 +193,19 @@ def status
resolution && resolution[:status]
end
+ def redirect?
+ (status && (300..399).cover?(status.to_i)) || final_url != target_url
+ end
+
def unresolved?
resolution.nil? || (status.nil? && !ignored_error?)
end
end
+ def crawlable_source_path?(path)
+ !path.nil? && INTERNAL_PATH_PREFIXES_TO_SKIP.none? { |prefix| path.start_with?(prefix) }
+ end
+
def skip_internal_path?(path)
return true if path == "/"
diff --git a/lib/crawlscope/rules/metadata.rb b/lib/crawlscope/rules/metadata.rb
index ccb6b3e..9f261f2 100644
--- a/lib/crawlscope/rules/metadata.rb
+++ b/lib/crawlscope/rules/metadata.rb
@@ -1,10 +1,14 @@
# frozen_string_literal: true
+require "uri"
+
module Crawlscope
module Rules
class Metadata
TITLE_MAX_LENGTH = 72
+ DESCRIPTION_MIN_LENGTH = 110
DESCRIPTION_MAX_LENGTH = 160
+ REQUIRED_OPEN_GRAPH_PROPERTIES = %w[og:title og:description og:url og:type og:image].freeze
attr_reader :code
@@ -21,22 +25,35 @@ def call(urls:, pages:, issues:, context: nil)
validate_title(page, issues)
validate_description(page, issues)
validate_canonical(page, issues)
+ validate_open_graph(page, issues)
end
end
private
def validate_h1(page, issues)
- return unless page.doc.at_css("h1").nil?
-
- issues.add(
- code: :missing_h1,
- severity: :warning,
- category: :metadata,
- url: page.url,
- message: "missing <h1>",
- details: {}
- )
+ h1s = page.doc.css("h1")
+ return if h1s.one?
+
+ if h1s.empty?
+ issues.add(
+ code: :missing_h1,
+ severity: :warning,
+ category: :metadata,
+ url: page.url,
+ message: "missing <h1>",
+ details: {}
+ )
+ else
+ issues.add(
+ code: :multiple_h1,
+ severity: :warning,
+ category: :metadata,
+ url: page.url,
+ message: "multiple <h1> tags (#{h1s.size})",
+ details: {count: h1s.size}
+ )
+ end
end
def validate_title(page, issues)
@@ -56,6 +73,8 @@ def validate_description(page, issues)
if description.empty?
issues.add(code: :missing_meta_description, severity: :warning, category: :metadata, url: page.url, message: "missing meta description", details: {})
+ elsif description.length < DESCRIPTION_MIN_LENGTH
+ issues.add(code: :meta_description_too_short, severity: :warning, category: :metadata, url: page.url, message: "meta description too short (#{description.length})", details: {length: description.length, minimum: DESCRIPTION_MIN_LENGTH})
elsif description.length > DESCRIPTION_MAX_LENGTH
issues.add(code: :meta_description_too_long, severity: :warning, category: :metadata, url: page.url, message: "meta description too long (#{description.length})", details: {length: description.length})
end
@@ -71,7 +90,7 @@ def validate_canonical(page, issues)
normalized_canonical = Url.normalize(canonical, base_url: page.url)
normalized_page_url = Url.normalize(page.url, base_url: page.url)
- return if normalized_canonical == normalized_page_url
+ return if canonical_matches_page?(normalized_canonical, normalized_page_url)
issues.add(
code: :canonical_mismatch,
@@ -88,6 +107,33 @@ def repeated_site_name?(title)
title.split(/[^[:alnum:]]+/).count { |token| token.casecmp?(@site_name) } > 1
end
+
+ def validate_open_graph(page, issues)
+ missing = REQUIRED_OPEN_GRAPH_PROPERTIES.reject do |property|
+ page.doc.at_css(%(meta[property="#{property}"][content]))
+ end
+ return if missing.empty?
+
+ issues.add(
+ code: :incomplete_open_graph_tags,
+ severity: :warning,
+ category: :metadata,
+ url: page.url,
+ message: "Open Graph tags incomplete (missing #{missing.join(", ")})",
+ details: {missing: missing}
+ )
+ end
+
+ def canonical_matches_page?(canonical, page_url)
+ canonical == page_url || (local_url?(page_url) && Url.path(canonical) == Url.path(page_url))
+ end
+
+ def local_url?(url)
+ host = URI.parse(url.to_s).host.to_s
+ ["localhost", "127.0.0.1", "0.0.0.0", "::1"].include?(host)
+ rescue URI::InvalidURIError
+ false
+ end
end
end
end
diff --git a/test/crawlscope/crawl_test.rb b/test/crawlscope/crawl_test.rb
index ba8ce39..87976cd 100644
--- a/test/crawlscope/crawl_test.rb
+++ b/test/crawlscope/crawl_test.rb
@@ -31,8 +31,13 @@ def test_returns_ok_when_metadata_is_valid
<title>Pricing</title>
- <meta name="description" content="Plans and pricing.">
+ <meta name="description" content="Compare plans and pricing for every team size, with transparent monthly billing and absolutely no hidden fees anywhere.">
+ <meta property="og:title" content="Pricing">
+ <meta property="og:description" content="Plans and pricing.">
+ <meta property="og:url" content="https://example.com/pricing">
+ <meta property="og:type" content="website">
+ <meta property="og:image" content="https://example.com/og.png">
@@ -95,7 +100,7 @@ def test_collects_metadata_issues_for_invalid_page
).call
refute result.ok?
- assert_equal %i[meta_description_too_long missing_canonical missing_h1 missing_structured_data title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort
+ assert_equal %i[incomplete_open_graph_tags meta_description_too_long missing_canonical missing_h1 missing_structured_data title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort
end
def test_uses_browser_when_renderer_is_browser
@@ -128,8 +133,13 @@ def fetch(url)
<title>Pricing</title>
- <meta name="description" content="Plans and pricing.">
+ <meta name="description" content="Compare plans and pricing for every team size, with transparent monthly billing and absolutely no hidden fees anywhere.">
+ <meta property="og:title" content="Pricing">
+ <meta property="og:description" content="Plans and pricing.">
+ <meta property="og:url" content="https://example.com/pricing">
+ <meta property="og:type" content="website">
+ <meta property="og:image" content="https://example.com/og.png">
diff --git a/test/crawlscope/links_rule_test.rb b/test/crawlscope/links_rule_test.rb
index a77b738..8bb8617 100644
--- a/test/crawlscope/links_rule_test.rb
+++ b/test/crawlscope/links_rule_test.rb
@@ -118,6 +118,45 @@ def test_reports_low_inbound_anchor_links
assert_equal "https://example.com/guide", issues.to_a.first.url
end
+ def test_counts_root_page_links_as_inbound_links
+ issues = Crawlscope::IssueCollection.new
+
+ Crawlscope::Rules::Links.new.call(
+ urls: ["https://example.com/", "https://example.com/about"],
+ pages: [
+ page(url: "https://example.com/", body: "<a href=\"/about\">About</a>"),
+ page(url: "https://example.com/about", body: "<a href=\"/\">About</a>")
+ ],
+ issues: issues,
+ context: context(resolver: ->(target_url) { {crawled: true, error: nil, final_url: target_url, status: 200} })
+ )
+
+ refute_includes issues.to_a.map(&:code), :low_inbound_anchor_links
+ end
+
+ def test_reports_internal_links_that_redirect
+ issues = Crawlscope::IssueCollection.new
+ resolver = lambda do |target_url|
+ {
+ crawled: false,
+ error: nil,
+ final_url: "https://example.com/pricing",
+ status: 200
+ }
+ end
+
+ Crawlscope::Rules::Links.new.call(
+ urls: ["https://example.com/guide"],
+ pages: [page(url: "https://example.com/guide", body: "<a href=\"/plans\">Plans</a>")],
+ issues: issues,
+ context: context(resolver: resolver)
+ )
+
+ redirect_issue = issues.to_a.find { |issue| issue.code == :internal_link_redirects }
+ assert redirect_issue
+ assert_includes redirect_issue.message, "https://example.com/pricing"
+ end
+
def test_ignores_links_that_should_not_be_crawled
issues = Crawlscope::IssueCollection.new
diff --git a/test/crawlscope/metadata_rule_test.rb b/test/crawlscope/metadata_rule_test.rb
new file mode 100644
index 0000000..694404f
--- /dev/null
+++ b/test/crawlscope/metadata_rule_test.rb
@@ -0,0 +1,77 @@
+# frozen_string_literal: true
+
+require "test_helper"
+
+class CrawlscopeMetadataRuleTest < Minitest::Test
+ def test_reports_short_meta_description_multiple_h1_and_incomplete_open_graph
+ issues = Crawlscope::IssueCollection.new
+
+ Crawlscope::Rules::Metadata.new.call(
+ urls: [page.url],
+ pages: [page],
+ issues: issues
+ )
+
+ codes = issues.to_a.map(&:code)
+ assert_includes codes, :meta_description_too_short
+ assert_includes codes, :multiple_h1
+ assert_includes codes, :incomplete_open_graph_tags
+ end
+
+ def test_allows_localhost_page_with_matching_production_canonical_path
+ issues = Crawlscope::IssueCollection.new
+ local_page = page(
+ url: "http://localhost:3000/about",
+ body: <<~HTML
+ <html>
+ <head>
+ <title>About</title>
+ <meta name="description" content="Learn about our company history, our mission, and the people who build the tools trusted by thousands of teams.">
+ <link rel="canonical" href="https://example.com/about">
+ <meta property="og:title" content="About">
+ <meta property="og:description" content="About us.">
+ <meta property="og:url" content="https://example.com/about">
+ <meta property="og:type" content="website">
+ <meta property="og:image" content="https://example.com/og.png">
+ </head>
+ <body><h1>About</h1></body>
+ </html>
+ HTML
+ )
+
+ Crawlscope::Rules::Metadata.new.call(
+ urls: [local_page.url],
+ pages: [local_page],
+ issues: issues
+ )
+
+ refute_includes issues.to_a.map(&:code), :canonical_mismatch
+ end
+
+ private
+
+ def page(url: "https://example.com/about", body: nil)
+ body ||= <<~HTML
+ <html>
+ <head>
+ <title>About</title>
+ <meta name="description" content="Too short.">
+ <link rel="canonical" href="https://example.com/about">
+ </head>
+ <body>
+ <h1>About</h1>
+ <h1>Team</h1>
+ </body></html>
+ HTML
+
+ Crawlscope::Page.new(
+ url: url,
+ normalized_url: Crawlscope::Url.normalize(url, base_url: url),
+ final_url: url,
+ normalized_final_url: Crawlscope::Url.normalize(url, base_url: url),
+ status: 200,
+ headers: {"content-type" => "text/html"},
+ body: body,
+ doc: Nokogiri::HTML(body)
+ )
+ end
+end