From b0f41c48b130e43eec867c008185f82cbf8ab639 Mon Sep 17 00:00:00 2001 From: Paulo Fidalgo Date: Tue, 28 Apr 2026 19:50:46 +0300 Subject: [PATCH] fix: tighten SEO audit coverage Detect short meta descriptions, multiple H1s, incomplete Open Graph tags, redirected pages, and internal links that resolve through redirects. Also allow localhost crawls to compare canonicals by path when pages emit production canonical URLs, and count navigation/footer links toward inbound internal link checks. --- lib/crawlscope/crawl.rb | 6 +++ lib/crawlscope/rules/links.rb | 30 ++++++++--- lib/crawlscope/rules/metadata.rb | 68 +++++++++++++++++++---- test/crawlscope/crawl_test.rb | 16 ++++-- test/crawlscope/links_rule_test.rb | 39 ++++++++++++++ test/crawlscope/metadata_rule_test.rb | 77 +++++++++++++++++++++++++++ 6 files changed, 216 insertions(+), 20 deletions(-) create mode 100644 test/crawlscope/metadata_rule_test.rb diff --git a/lib/crawlscope/crawl.rb b/lib/crawlscope/crawl.rb index d2156c4..f32fdef 100644 --- a/lib/crawlscope/crawl.rb +++ b/lib/crawlscope/crawl.rb @@ -81,6 +81,8 @@ def collect(pages, issues) issues.add(code: :fetch_failed, severity: :error, category: :crawl, url: page.url, message: page.error, details: {}) elsif !@allowed_statuses.include?(page.status) issues.add(code: :unexpected_status, severity: :error, category: :crawl, url: page.url, message: "HTTP #{page.status}", details: {status: page.status}) + elsif redirected?(page) + issues.add(code: :redirected_page, severity: :warning, category: :crawl, url: page.url, message: "redirects to #{page.final_url}", details: {final_url: page.final_url, status: page.status}) end end end @@ -128,5 +130,9 @@ def resolution(page, normalized_url, crawled:) status: page.status } end + + def redirected?(page) + page.normalized_url.to_s != page.normalized_final_url.to_s + end end end diff --git a/lib/crawlscope/rules/links.rb b/lib/crawlscope/rules/links.rb index 56ad8e8..a17915b 100644 --- a/lib/crawlscope/rules/links.rb +++ 
b/lib/crawlscope/rules/links.rb @@ -5,7 +5,7 @@ module Crawlscope module Rules class Links - CONTEXTUAL_LINK_SELECTORS = "main a[href], article a[href]" + LINK_SELECTORS = "a[href]" INTERNAL_PATH_PREFIXES_TO_SKIP = ["/rails/", "/cdn-cgi/"].freeze LINK_SCHEMES_TO_SKIP = ["mailto:", "tel:", "javascript:", "data:"].freeze MAX_SOURCES_IN_ERROR = 3 @@ -33,10 +33,7 @@ def call(urls:, pages:, issues:, context:) private def contextual_links(doc) - links = doc.css(CONTEXTUAL_LINK_SELECTORS) - return links unless links.empty? - - doc.css("a[href]") + doc.css(LINK_SELECTORS) end def extract_links(pages) @@ -45,7 +42,7 @@ def extract_links(pages) def page_links(page) source_path = Url.path(page.normalized_url) - return [] unless crawlable_path?(source_path) + return [] unless crawlable_source_path?(source_path) contextual_links(page.doc).filter_map do |node| link_for(page: page, source_path: source_path, node: node) @@ -146,6 +143,7 @@ def resolve_links(links, issues) next end + report_redirect_target(target_url, grouped_links, issues, target) if target.redirect? next unless crawlable_path?(target.final_path) grouped_links.each do |link| @@ -156,6 +154,18 @@ def resolve_links(links, issues) resolved_links end + def report_redirect_target(target_url, grouped_links, issues, target) + source_urls = grouped_links.map { |link| link[:source_url] }.uniq.first(MAX_SOURCES_IN_ERROR) + issues.add( + code: :internal_link_redirects, + severity: :warning, + category: :links, + url: target_url, + message: "internal link redirects to #{target.final_url} (sources: #{source_urls.join(", ")})", + details: {final_url: target.final_url, source_urls: source_urls, status: target.status} + ) + end + def resolve_target(target_url) resolution = @resolve_target.call(target_url) LinkTarget.new(target_url: target_url, resolution: resolution) @@ -183,11 +193,19 @@ def status resolution && resolution[:status] end + def redirect? 
+ (status && (300..399).cover?(status.to_i)) || final_url != target_url + end + def unresolved? resolution.nil? || (status.nil? && !ignored_error?) end end + def crawlable_source_path?(path) + !path.nil? && INTERNAL_PATH_PREFIXES_TO_SKIP.none? { |prefix| path.start_with?(prefix) } + end + def skip_internal_path?(path) return true if path == "/" diff --git a/lib/crawlscope/rules/metadata.rb b/lib/crawlscope/rules/metadata.rb index ccb6b3e..9f261f2 100644 --- a/lib/crawlscope/rules/metadata.rb +++ b/lib/crawlscope/rules/metadata.rb @@ -1,10 +1,14 @@ # frozen_string_literal: true +require "uri" + module Crawlscope module Rules class Metadata TITLE_MAX_LENGTH = 72 + DESCRIPTION_MIN_LENGTH = 110 DESCRIPTION_MAX_LENGTH = 160 + REQUIRED_OPEN_GRAPH_PROPERTIES = %w[og:title og:description og:url og:type og:image].freeze attr_reader :code @@ -21,22 +25,35 @@ def call(urls:, pages:, issues:, context: nil) validate_title(page, issues) validate_description(page, issues) validate_canonical(page, issues) + validate_open_graph(page, issues) end end private def validate_h1(page, issues) - return unless page.doc.at_css("h1").nil? - - issues.add( - code: :missing_h1, - severity: :warning, - category: :metadata, - url: page.url, - message: "missing
<h1>
", - details: {} - ) + h1s = page.doc.css("h1") + return if h1s.one? + + if h1s.empty? + issues.add( + code: :missing_h1, + severity: :warning, + category: :metadata, + url: page.url, + message: "missing
<h1>
", + details: {} + ) + else + issues.add( + code: :multiple_h1, + severity: :warning, + category: :metadata, + url: page.url, + message: "multiple
<h1>
tags (#{h1s.size})", + details: {count: h1s.size} + ) + end end def validate_title(page, issues) @@ -56,6 +73,8 @@ def validate_description(page, issues) if description.empty? issues.add(code: :missing_meta_description, severity: :warning, category: :metadata, url: page.url, message: "missing meta description", details: {}) + elsif description.length < DESCRIPTION_MIN_LENGTH + issues.add(code: :meta_description_too_short, severity: :warning, category: :metadata, url: page.url, message: "meta description too short (#{description.length})", details: {length: description.length, minimum: DESCRIPTION_MIN_LENGTH}) elsif description.length > DESCRIPTION_MAX_LENGTH issues.add(code: :meta_description_too_long, severity: :warning, category: :metadata, url: page.url, message: "meta description too long (#{description.length})", details: {length: description.length}) end @@ -71,7 +90,7 @@ def validate_canonical(page, issues) normalized_canonical = Url.normalize(canonical, base_url: page.url) normalized_page_url = Url.normalize(page.url, base_url: page.url) - return if normalized_canonical == normalized_page_url + return if canonical_matches_page?(normalized_canonical, normalized_page_url) issues.add( code: :canonical_mismatch, @@ -88,6 +107,33 @@ def repeated_site_name?(title) title.split(/[^[:alnum:]]+/).count { |token| token.casecmp?(@site_name) } > 1 end + + def validate_open_graph(page, issues) + missing = REQUIRED_OPEN_GRAPH_PROPERTIES.reject do |property| + page.doc.at_css(%(meta[property="#{property}"][content])) + end + return if missing.empty? 
+ + issues.add( + code: :incomplete_open_graph_tags, + severity: :warning, + category: :metadata, + url: page.url, + message: "Open Graph tags incomplete (missing #{missing.join(", ")})", + details: {missing: missing} + ) + end + + def canonical_matches_page?(canonical, page_url) + canonical == page_url || (local_url?(page_url) && Url.path(canonical) == Url.path(page_url)) + end + + def local_url?(url) + host = URI.parse(url.to_s).host.to_s + ["localhost", "127.0.0.1", "0.0.0.0", "::1"].include?(host) + rescue URI::InvalidURIError + false + end end end end diff --git a/test/crawlscope/crawl_test.rb b/test/crawlscope/crawl_test.rb index ba8ce39..87976cd 100644 --- a/test/crawlscope/crawl_test.rb +++ b/test/crawlscope/crawl_test.rb @@ -31,8 +31,13 @@ def test_returns_ok_when_metadata_is_valid Pricing - + + + + + + @@ -95,7 +100,7 @@ def test_collects_metadata_issues_for_invalid_page ).call refute result.ok? - assert_equal %i[meta_description_too_long missing_canonical missing_h1 missing_structured_data title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort + assert_equal %i[incomplete_open_graph_tags meta_description_too_long missing_canonical missing_h1 missing_structured_data title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort end def test_uses_browser_when_renderer_is_browser @@ -128,8 +133,13 @@ def fetch(url) Pricing - + + + + + + diff --git a/test/crawlscope/links_rule_test.rb b/test/crawlscope/links_rule_test.rb index a77b738..8bb8617 100644 --- a/test/crawlscope/links_rule_test.rb +++ b/test/crawlscope/links_rule_test.rb @@ -118,6 +118,45 @@ def test_reports_low_inbound_anchor_links assert_equal "https://example.com/guide", issues.to_a.first.url end + def test_counts_root_page_links_as_inbound_links + issues = Crawlscope::IssueCollection.new + + Crawlscope::Rules::Links.new.call( + urls: ["https://example.com/", "https://example.com/about"], + pages: [ + page(url: "https://example.com/", body: "
<a href=\"/about\">About</a>
"), + page(url: "https://example.com/about", body: "
<nav>
<a href=\"/about\">About</a>
</nav>
") + ], + issues: issues, + context: context(resolver: ->(target_url) { {crawled: true, error: nil, final_url: target_url, status: 200} }) + ) + + refute_includes issues.to_a.map(&:code), :low_inbound_anchor_links + end + + def test_reports_internal_links_that_redirect + issues = Crawlscope::IssueCollection.new + resolver = lambda do |target_url| + { + crawled: false, + error: nil, + final_url: "https://example.com/pricing", + status: 200 + } + end + + Crawlscope::Rules::Links.new.call( + urls: ["https://example.com/guide"], + pages: [page(url: "https://example.com/guide", body: "
<a href=\"/pricing\">Plans</a>
")], + issues: issues, + context: context(resolver: resolver) + ) + + redirect_issue = issues.to_a.find { |issue| issue.code == :internal_link_redirects } + assert redirect_issue + assert_includes redirect_issue.message, "https://example.com/pricing" + end + def test_ignores_links_that_should_not_be_crawled issues = Crawlscope::IssueCollection.new diff --git a/test/crawlscope/metadata_rule_test.rb b/test/crawlscope/metadata_rule_test.rb new file mode 100644 index 0000000..694404f --- /dev/null +++ b/test/crawlscope/metadata_rule_test.rb @@ -0,0 +1,77 @@ +# frozen_string_literal: true + +require "test_helper" + +class CrawlscopeMetadataRuleTest < Minitest::Test + def test_reports_short_meta_description_multiple_h1_and_incomplete_open_graph + issues = Crawlscope::IssueCollection.new + + Crawlscope::Rules::Metadata.new.call( + urls: [page.url], + pages: [page], + issues: issues + ) + + codes = issues.to_a.map(&:code) + assert_includes codes, :meta_description_too_short + assert_includes codes, :multiple_h1 + assert_includes codes, :incomplete_open_graph_tags + end + + def test_allows_localhost_page_with_matching_production_canonical_path + issues = Crawlscope::IssueCollection.new + local_page = page( + url: "http://localhost:3000/about", + body: <<~HTML + + + About + + + + + + + + +

<h1>About</h1>

+ + HTML + ) + + Crawlscope::Rules::Metadata.new.call( + urls: [local_page.url], + pages: [local_page], + issues: issues + ) + + refute_includes issues.to_a.map(&:code), :canonical_mismatch + end + + private + + def page(url: "https://example.com/about", body: nil) + body ||= <<~HTML + + + About + + + + +

<h1>About</h1>

<h1>Team</h1>

+ + HTML + + Crawlscope::Page.new( + url: url, + normalized_url: Crawlscope::Url.normalize(url, base_url: url), + final_url: url, + normalized_final_url: Crawlscope::Url.normalize(url, base_url: url), + status: 200, + headers: {"content-type" => "text/html"}, + body: body, + doc: Nokogiri::HTML(body) + ) + end +end