diff --git a/lib/crawlscope/crawl.rb b/lib/crawlscope/crawl.rb index d2156c4..f32fdef 100644 --- a/lib/crawlscope/crawl.rb +++ b/lib/crawlscope/crawl.rb @@ -81,6 +81,8 @@ def collect(pages, issues) issues.add(code: :fetch_failed, severity: :error, category: :crawl, url: page.url, message: page.error, details: {}) elsif !@allowed_statuses.include?(page.status) issues.add(code: :unexpected_status, severity: :error, category: :crawl, url: page.url, message: "HTTP #{page.status}", details: {status: page.status}) + elsif redirected?(page) + issues.add(code: :redirected_page, severity: :warning, category: :crawl, url: page.url, message: "redirects to #{page.final_url}", details: {final_url: page.final_url, status: page.status}) end end end @@ -128,5 +130,9 @@ def resolution(page, normalized_url, crawled:) status: page.status } end + + def redirected?(page) + page.normalized_url.to_s != page.normalized_final_url.to_s + end end end diff --git a/lib/crawlscope/rules/links.rb b/lib/crawlscope/rules/links.rb index 56ad8e8..a17915b 100644 --- a/lib/crawlscope/rules/links.rb +++ b/lib/crawlscope/rules/links.rb @@ -5,7 +5,7 @@ module Crawlscope module Rules class Links - CONTEXTUAL_LINK_SELECTORS = "main a[href], article a[href]" + LINK_SELECTORS = "a[href]" INTERNAL_PATH_PREFIXES_TO_SKIP = ["/rails/", "/cdn-cgi/"].freeze LINK_SCHEMES_TO_SKIP = ["mailto:", "tel:", "javascript:", "data:"].freeze MAX_SOURCES_IN_ERROR = 3 @@ -33,10 +33,7 @@ def call(urls:, pages:, issues:, context:) private def contextual_links(doc) - links = doc.css(CONTEXTUAL_LINK_SELECTORS) - return links unless links.empty? - - doc.css("a[href]") + doc.css(LINK_SELECTORS) end def extract_links(pages) @@ -45,7 +42,7 @@ def extract_links(pages) def page_links(page) source_path = Url.path(page.normalized_url) - return [] unless crawlable_path?(source_path) + return [] unless crawlable_source_path?(source_path) contextual_links(page.doc).filter_map do |node| link_for(page: page, source_path: source_path, node: node) @@ -146,6 +143,7 @@ def resolve_links(links, issues) next end + report_redirect_target(target_url, grouped_links, issues, target) if target.redirect? next unless crawlable_path?(target.final_path) grouped_links.each do |link| @@ -156,6 +154,18 @@ def resolve_links(links, issues) resolved_links end + def report_redirect_target(target_url, grouped_links, issues, target) + source_urls = grouped_links.map { |link| link[:source_url] }.uniq.first(MAX_SOURCES_IN_ERROR) + issues.add( + code: :internal_link_redirects, + severity: :warning, + category: :links, + url: target_url, + message: "internal link redirects to #{target.final_url} (sources: #{source_urls.join(", ")})", + details: {final_url: target.final_url, source_urls: source_urls, status: target.status} + ) + end + def resolve_target(target_url) resolution = @resolve_target.call(target_url) LinkTarget.new(target_url: target_url, resolution: resolution) @@ -183,11 +193,19 @@ def status resolution && resolution[:status] end + def redirect? + (status && (300..399).cover?(status.to_i)) || final_url != target_url + end + def unresolved? resolution.nil? || (status.nil? && !ignored_error?) end end + def crawlable_source_path?(path) + !path.nil? && INTERNAL_PATH_PREFIXES_TO_SKIP.none? { |prefix| path.start_with?(prefix) } + end + def skip_internal_path?(path) return true if path == "/" diff --git a/lib/crawlscope/rules/metadata.rb b/lib/crawlscope/rules/metadata.rb index ccb6b3e..9f261f2 100644 --- a/lib/crawlscope/rules/metadata.rb +++ b/lib/crawlscope/rules/metadata.rb @@ -1,10 +1,14 @@ # frozen_string_literal: true +require "uri" + module Crawlscope module Rules class Metadata TITLE_MAX_LENGTH = 72 + DESCRIPTION_MIN_LENGTH = 110 DESCRIPTION_MAX_LENGTH = 160 + REQUIRED_OPEN_GRAPH_PROPERTIES = %w[og:title og:description og:url og:type og:image].freeze attr_reader :code @@ -21,22 +25,35 @@ def call(urls:, pages:, issues:, context: nil) validate_title(page, issues) validate_description(page, issues) validate_canonical(page, issues) + validate_open_graph(page, issues) end end private def validate_h1(page, issues) - return unless page.doc.at_css("h1").nil? - - issues.add( - code: :missing_h1, - severity: :warning, - category: :metadata, - url: page.url, - message: "missing

", - details: {} - ) + h1s = page.doc.css("h1") + return if h1s.one? + + if h1s.empty? + issues.add( + code: :missing_h1, + severity: :warning, + category: :metadata, + url: page.url, + message: "missing

", + details: {} + ) + else + issues.add( + code: :multiple_h1, + severity: :warning, + category: :metadata, + url: page.url, + message: "multiple

tags (#{h1s.size})", + details: {count: h1s.size} + ) + end end def validate_title(page, issues) @@ -56,6 +73,8 @@ def validate_description(page, issues) if description.empty? issues.add(code: :missing_meta_description, severity: :warning, category: :metadata, url: page.url, message: "missing meta description", details: {}) + elsif description.length < DESCRIPTION_MIN_LENGTH + issues.add(code: :meta_description_too_short, severity: :warning, category: :metadata, url: page.url, message: "meta description too short (#{description.length})", details: {length: description.length, minimum: DESCRIPTION_MIN_LENGTH}) elsif description.length > DESCRIPTION_MAX_LENGTH issues.add(code: :meta_description_too_long, severity: :warning, category: :metadata, url: page.url, message: "meta description too long (#{description.length})", details: {length: description.length}) end @@ -71,7 +90,7 @@ def validate_canonical(page, issues) normalized_canonical = Url.normalize(canonical, base_url: page.url) normalized_page_url = Url.normalize(page.url, base_url: page.url) - return if normalized_canonical == normalized_page_url + return if canonical_matches_page?(normalized_canonical, normalized_page_url) issues.add( code: :canonical_mismatch, @@ -88,6 +107,33 @@ def repeated_site_name?(title) title.split(/[^[:alnum:]]+/).count { |token| token.casecmp?(@site_name) } > 1 end + + def validate_open_graph(page, issues) + missing = REQUIRED_OPEN_GRAPH_PROPERTIES.reject do |property| + page.doc.at_css(%(meta[property="#{property}"][content])) + end + return if missing.empty? + + issues.add( + code: :incomplete_open_graph_tags, + severity: :warning, + category: :metadata, + url: page.url, + message: "Open Graph tags incomplete (missing #{missing.join(", ")})", + details: {missing: missing} + ) + end + + def canonical_matches_page?(canonical, page_url) + canonical == page_url || (local_url?(page_url) && Url.path(canonical) == Url.path(page_url)) + end + + def local_url?(url) + host = URI.parse(url.to_s).host.to_s + ["localhost", "127.0.0.1", "0.0.0.0", "::1"].include?(host) + rescue URI::InvalidURIError + false + end end end end diff --git a/test/crawlscope/crawl_test.rb b/test/crawlscope/crawl_test.rb index ba8ce39..87976cd 100644 --- a/test/crawlscope/crawl_test.rb +++ b/test/crawlscope/crawl_test.rb @@ -31,8 +31,13 @@ def test_returns_ok_when_metadata_is_valid Pricing - + + + + + + @@ -95,7 +100,7 @@ def test_collects_metadata_issues_for_invalid_page ).call refute result.ok? - assert_equal %i[meta_description_too_long missing_canonical missing_h1 missing_structured_data title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort + assert_equal %i[incomplete_open_graph_tags meta_description_too_long missing_canonical missing_h1 missing_structured_data title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort end def test_uses_browser_when_renderer_is_browser @@ -128,8 +133,13 @@ def fetch(url) Pricing - + + + + + + diff --git a/test/crawlscope/links_rule_test.rb b/test/crawlscope/links_rule_test.rb index a77b738..8bb8617 100644 --- a/test/crawlscope/links_rule_test.rb +++ b/test/crawlscope/links_rule_test.rb @@ -118,6 +118,45 @@ def test_reports_low_inbound_anchor_links assert_equal "https://example.com/guide", issues.to_a.first.url end + def test_counts_root_page_links_as_inbound_links + issues = Crawlscope::IssueCollection.new + + Crawlscope::Rules::Links.new.call( + urls: ["https://example.com/", "https://example.com/about"], + pages: [ + page(url: "https://example.com/", body: "
About
"), + page(url: "https://example.com/about", body: "

About

") + ], + issues: issues, + context: context(resolver: ->(target_url) { {crawled: true, error: nil, final_url: target_url, status: 200} }) + ) + + refute_includes issues.to_a.map(&:code), :low_inbound_anchor_links + end + + def test_reports_internal_links_that_redirect + issues = Crawlscope::IssueCollection.new + resolver = lambda do |target_url| + { + crawled: false, + error: nil, + final_url: "https://example.com/pricing", + status: 200 + } + end + + Crawlscope::Rules::Links.new.call( + urls: ["https://example.com/guide"], + pages: [page(url: "https://example.com/guide", body: "
Plans
")], + issues: issues, + context: context(resolver: resolver) + ) + + redirect_issue = issues.to_a.find { |issue| issue.code == :internal_link_redirects } + assert redirect_issue + assert_includes redirect_issue.message, "https://example.com/pricing" + end + def test_ignores_links_that_should_not_be_crawled issues = Crawlscope::IssueCollection.new diff --git a/test/crawlscope/metadata_rule_test.rb b/test/crawlscope/metadata_rule_test.rb new file mode 100644 index 0000000..694404f --- /dev/null +++ b/test/crawlscope/metadata_rule_test.rb @@ -0,0 +1,77 @@ +# frozen_string_literal: true + +require "test_helper" + +class CrawlscopeMetadataRuleTest < Minitest::Test + def test_reports_short_meta_description_multiple_h1_and_incomplete_open_graph + issues = Crawlscope::IssueCollection.new + + Crawlscope::Rules::Metadata.new.call( + urls: [page.url], + pages: [page], + issues: issues + ) + + codes = issues.to_a.map(&:code) + assert_includes codes, :meta_description_too_short + assert_includes codes, :multiple_h1 + assert_includes codes, :incomplete_open_graph_tags + end + + def test_allows_localhost_page_with_matching_production_canonical_path + issues = Crawlscope::IssueCollection.new + local_page = page( + url: "http://localhost:3000/about", + body: <<~HTML + + + About + + + + + + + + +

About

+ + HTML + ) + + Crawlscope::Rules::Metadata.new.call( + urls: [local_page.url], + pages: [local_page], + issues: issues + ) + + refute_includes issues.to_a.map(&:code), :canonical_mismatch + end + + private + + def page(url: "https://example.com/about", body: nil) + body ||= <<~HTML + + + About + + + + +

About

Team

+ + HTML + + Crawlscope::Page.new( + url: url, + normalized_url: Crawlscope::Url.normalize(url, base_url: url), + final_url: url, + normalized_final_url: Crawlscope::Url.normalize(url, base_url: url), + status: 200, + headers: {"content-type" => "text/html"}, + body: body, + doc: Nokogiri::HTML(body) + ) + end +end