Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions lib/crawlscope/crawl.rb
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ def collect(pages, issues)
issues.add(code: :fetch_failed, severity: :error, category: :crawl, url: page.url, message: page.error, details: {})
elsif !@allowed_statuses.include?(page.status)
issues.add(code: :unexpected_status, severity: :error, category: :crawl, url: page.url, message: "HTTP #{page.status}", details: {status: page.status})
elsif redirected?(page)
issues.add(code: :redirected_page, severity: :warning, category: :crawl, url: page.url, message: "redirects to #{page.final_url}", details: {final_url: page.final_url, status: page.status})
end
end
end
Expand Down Expand Up @@ -128,5 +130,9 @@ def resolution(page, normalized_url, crawled:)
status: page.status
}
end

# True when the page did not land on the URL we requested: the
# normalized request URL and the normalized final URL differ.
# Comparing via #to_s makes nil and URL-object values safe to compare.
def redirected?(page)
  requested = page.normalized_url.to_s
  landed = page.normalized_final_url.to_s
  !requested.eql?(landed)
end
end
end
30 changes: 24 additions & 6 deletions lib/crawlscope/rules/links.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
module Crawlscope
module Rules
class Links
CONTEXTUAL_LINK_SELECTORS = "main a[href], article a[href]"
LINK_SELECTORS = "a[href]"
INTERNAL_PATH_PREFIXES_TO_SKIP = ["/rails/", "/cdn-cgi/"].freeze
LINK_SCHEMES_TO_SKIP = ["mailto:", "tel:", "javascript:", "data:"].freeze
MAX_SOURCES_IN_ERROR = 3
Expand Down Expand Up @@ -33,10 +33,7 @@ def call(urls:, pages:, issues:, context:)
private

def contextual_links(doc)
links = doc.css(CONTEXTUAL_LINK_SELECTORS)
return links unless links.empty?

doc.css("a[href]")
doc.css(LINK_SELECTORS)
end

def extract_links(pages)
Expand All @@ -45,7 +42,7 @@ def extract_links(pages)

def page_links(page)
source_path = Url.path(page.normalized_url)
return [] unless crawlable_path?(source_path)
return [] unless crawlable_source_path?(source_path)

contextual_links(page.doc).filter_map do |node|
link_for(page: page, source_path: source_path, node: node)
Expand Down Expand Up @@ -146,6 +143,7 @@ def resolve_links(links, issues)
next
end

report_redirect_target(target_url, grouped_links, issues, target) if target.redirect?
next unless crawlable_path?(target.final_path)

grouped_links.each do |link|
Expand All @@ -156,6 +154,18 @@ def resolve_links(links, issues)
resolved_links
end

# Records a warning when an internal link points at a URL that redirects.
# At most MAX_SOURCES_IN_ERROR distinct source pages are named in the
# message so it stays readable; the same capped list goes into details.
def report_redirect_target(target_url, grouped_links, issues, target)
  sources = grouped_links
    .map { |entry| entry[:source_url] }
    .uniq
    .take(MAX_SOURCES_IN_ERROR)

  issues.add(
    code: :internal_link_redirects,
    severity: :warning,
    category: :links,
    url: target_url,
    message: "internal link redirects to #{target.final_url} (sources: #{sources.join(", ")})",
    details: {final_url: target.final_url, source_urls: sources, status: target.status}
  )
end

def resolve_target(target_url)
resolution = @resolve_target.call(target_url)
LinkTarget.new(target_url: target_url, resolution: resolution)
Expand Down Expand Up @@ -183,11 +193,19 @@ def status
resolution && resolution[:status]
end

# A target counts as a redirect when its HTTP status is in the 3xx
# range, or when the resolved final URL differs from the URL we linked
# to (covers resolvers that report only the post-redirect state).
def redirect?
  code = status
  return true if code && (300..399).cover?(code.to_i)

  final_url != target_url
end

def unresolved?
resolution.nil? || (status.nil? && !ignored_error?)
end
end

# True when +path+ is present and does not live under one of the
# framework-internal prefixes (e.g. /rails/, /cdn-cgi/) that we never
# treat as crawlable page sources.
def crawlable_source_path?(path)
  return false if path.nil?

  INTERNAL_PATH_PREFIXES_TO_SKIP.none? { |skip| path.start_with?(skip) }
end

def skip_internal_path?(path)
return true if path == "/"

Expand Down
68 changes: 57 additions & 11 deletions lib/crawlscope/rules/metadata.rb
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
# frozen_string_literal: true

require "uri"

module Crawlscope
module Rules
class Metadata
TITLE_MAX_LENGTH = 72
DESCRIPTION_MIN_LENGTH = 110
DESCRIPTION_MAX_LENGTH = 160
REQUIRED_OPEN_GRAPH_PROPERTIES = %w[og:title og:description og:url og:type og:image].freeze

attr_reader :code

Expand All @@ -21,22 +25,35 @@ def call(urls:, pages:, issues:, context: nil)
validate_title(page, issues)
validate_description(page, issues)
validate_canonical(page, issues)
validate_open_graph(page, issues)
end
end

private

def validate_h1(page, issues)
return unless page.doc.at_css("h1").nil?

issues.add(
code: :missing_h1,
severity: :warning,
category: :metadata,
url: page.url,
message: "missing <h1>",
details: {}
)
h1s = page.doc.css("h1")
return if h1s.one?

if h1s.empty?
issues.add(
code: :missing_h1,
severity: :warning,
category: :metadata,
url: page.url,
message: "missing <h1>",
details: {}
)
else
issues.add(
code: :multiple_h1,
severity: :warning,
category: :metadata,
url: page.url,
message: "multiple <h1> tags (#{h1s.size})",
details: {count: h1s.size}
)
end
end

def validate_title(page, issues)
Expand All @@ -56,6 +73,8 @@ def validate_description(page, issues)

if description.empty?
issues.add(code: :missing_meta_description, severity: :warning, category: :metadata, url: page.url, message: "missing meta description", details: {})
elsif description.length < DESCRIPTION_MIN_LENGTH
issues.add(code: :meta_description_too_short, severity: :warning, category: :metadata, url: page.url, message: "meta description too short (#{description.length})", details: {length: description.length, minimum: DESCRIPTION_MIN_LENGTH})
elsif description.length > DESCRIPTION_MAX_LENGTH
issues.add(code: :meta_description_too_long, severity: :warning, category: :metadata, url: page.url, message: "meta description too long (#{description.length})", details: {length: description.length})
end
Expand All @@ -71,7 +90,7 @@ def validate_canonical(page, issues)

normalized_canonical = Url.normalize(canonical, base_url: page.url)
normalized_page_url = Url.normalize(page.url, base_url: page.url)
return if normalized_canonical == normalized_page_url
return if canonical_matches_page?(normalized_canonical, normalized_page_url)

issues.add(
code: :canonical_mismatch,
Expand All @@ -88,6 +107,33 @@ def repeated_site_name?(title)

title.split(/[^[:alnum:]]+/).count { |token| token.casecmp?(@site_name) } > 1
end

# Flags pages whose <head> lacks any of the required Open Graph meta
# tags. A property only counts as present when the tag also carries a
# content attribute (the [content] part of the selector).
def validate_open_graph(page, issues)
  missing = REQUIRED_OPEN_GRAPH_PROPERTIES.select do |property|
    page.doc.at_css(%(meta[property="#{property}"][content])).nil?
  end
  return if missing.empty?

  issues.add(
    code: :incomplete_open_graph_tags,
    severity: :warning,
    category: :metadata,
    url: page.url,
    message: "Open Graph tags incomplete (missing #{missing.join(", ")})",
    details: {missing: missing}
  )
end

# A canonical URL matches the page when the normalized URLs are equal,
# or — for pages served from a local/dev host — when only the paths
# match, so a dev site may legitimately point its canonical at the
# production origin.
def canonical_matches_page?(canonical, page_url)
  return true if canonical == page_url

  local_url?(page_url) && Url.path(canonical) == Url.path(page_url)
end

# True when +url+ points at a loopback/development host (localhost,
# 127.0.0.1, 0.0.0.0, or IPv6 ::1).
#
# Bug fix: uses URI#hostname instead of URI#host. #host keeps the
# square brackets around IPv6 literals ("[::1]"), so the bare "::1"
# entry in the list could never match; #hostname strips them.
#
# Returns false (instead of raising) for unparseable URLs.
def local_url?(url)
  host = URI.parse(url.to_s).hostname.to_s
  ["localhost", "127.0.0.1", "0.0.0.0", "::1"].include?(host)
rescue URI::InvalidURIError
  false
end
end
end
end
16 changes: 13 additions & 3 deletions test/crawlscope/crawl_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,13 @@ def test_returns_ok_when_metadata_is_valid
<html>
<head>
<title>Pricing</title>
<meta name="description" content="Plans for hotels and restaurants">
<meta name="description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
<link rel="canonical" href="https://example.com/pricing">
<meta property="og:title" content="Pricing">
<meta property="og:description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
<meta property="og:url" content="https://example.com/pricing">
<meta property="og:type" content="website">
<meta property="og:image" content="https://example.com/icon.png">
<script type="application/ld+json">
{"@context":"https://schema.org","@type":"WebSite","name":"Example","url":"https://example.com"}
</script>
Expand Down Expand Up @@ -95,7 +100,7 @@ def test_collects_metadata_issues_for_invalid_page
).call

refute result.ok?
assert_equal %i[meta_description_too_long missing_canonical missing_h1 missing_structured_data title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort
assert_equal %i[incomplete_open_graph_tags meta_description_too_long missing_canonical missing_h1 missing_structured_data title_repeats_site_name].sort, result.issues.to_a.map(&:code).uniq.sort
end

def test_uses_browser_when_renderer_is_browser
Expand Down Expand Up @@ -128,8 +133,13 @@ def fetch(url)
<html>
<head>
<title>Pricing</title>
<meta name="description" content="Plans for hotels and restaurants">
<meta name="description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
<link rel="canonical" href="https://example.com/pricing">
<meta property="og:title" content="Pricing">
<meta property="og:description" content="Plans for hotels and restaurants that need practical software checks, clear metadata, and dependable search previews.">
<meta property="og:url" content="https://example.com/pricing">
<meta property="og:type" content="website">
<meta property="og:image" content="https://example.com/icon.png">
<script type="application/ld+json">
{"@context":"https://schema.org","@type":"WebSite","name":"Example","url":"https://example.com"}
</script>
Expand Down
39 changes: 39 additions & 0 deletions test/crawlscope/links_rule_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,45 @@ def test_reports_low_inbound_anchor_links
assert_equal "https://example.com/guide", issues.to_a.first.url
end

# Links from the root page ("/") must count toward inbound-link totals:
# with the root linking to /about, no low_inbound_anchor_links warning
# should be raised for either URL.
def test_counts_root_page_links_as_inbound_links
  issues = Crawlscope::IssueCollection.new
  resolver = ->(target_url) { {crawled: true, error: nil, final_url: target_url, status: 200} }

  Crawlscope::Rules::Links.new.call(
    urls: ["https://example.com/", "https://example.com/about"],
    pages: [
      page(url: "https://example.com/", body: '<main><a href="/about">About</a></main>'),
      page(url: "https://example.com/about", body: '<main><p>About</p></main>')
    ],
    issues: issues,
    context: context(resolver: resolver)
  )

  refute_includes issues.to_a.map(&:code), :low_inbound_anchor_links
end

# When the resolver reports a final URL different from the linked one,
# the rule should emit an :internal_link_redirects issue whose message
# names the redirect destination.
def test_reports_internal_links_that_redirect
  issues = Crawlscope::IssueCollection.new
  resolver = lambda do |_target_url|
    {crawled: false, error: nil, final_url: "https://example.com/pricing", status: 200}
  end

  Crawlscope::Rules::Links.new.call(
    urls: ["https://example.com/guide"],
    pages: [page(url: "https://example.com/guide", body: '<main><a href="/plans">Plans</a></main>')],
    issues: issues,
    context: context(resolver: resolver)
  )

  reported = issues.to_a.find { |issue| issue.code == :internal_link_redirects }
  assert reported
  assert_includes reported.message, "https://example.com/pricing"
end

def test_ignores_links_that_should_not_be_crawled
issues = Crawlscope::IssueCollection.new

Expand Down
77 changes: 77 additions & 0 deletions test/crawlscope/metadata_rule_test.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# frozen_string_literal: true

require "test_helper"

# Unit tests for Crawlscope::Rules::Metadata, driving the rule directly
# with small HTML fixtures (no crawling involved).
class CrawlscopeMetadataRuleTest < Minitest::Test
  # The default fixture (see #page below) has a too-short description,
  # two <h1> tags, and only og:title of the Open Graph set — each of the
  # three corresponding issue codes must be reported.
  def test_reports_short_meta_description_multiple_h1_and_incomplete_open_graph
    issues = Crawlscope::IssueCollection.new

    Crawlscope::Rules::Metadata.new.call(
      urls: [page.url],
      pages: [page],
      issues: issues
    )

    codes = issues.to_a.map(&:code)
    assert_includes codes, :meta_description_too_short
    assert_includes codes, :multiple_h1
    assert_includes codes, :incomplete_open_graph_tags
  end

  # A page served from localhost whose canonical points at the production
  # host but shares the same path must NOT be flagged as a canonical
  # mismatch (dev sites legitimately canonicalize to production).
  def test_allows_localhost_page_with_matching_production_canonical_path
    issues = Crawlscope::IssueCollection.new
    local_page = page(
      url: "http://localhost:3000/about",
      body: <<~HTML
        <html>
          <head>
            <title>About</title>
            <meta name="description" content="A clear description that is long enough for search snippets, local validation checks, and realistic production metadata audits.">
            <link rel="canonical" href="https://www.example.com/about">
            <meta property="og:title" content="About">
            <meta property="og:description" content="About page">
            <meta property="og:url" content="https://www.example.com/about">
            <meta property="og:type" content="website">
            <meta property="og:image" content="https://www.example.com/icon.png">
          </head>
          <body><main><h1>About</h1></main></body>
        </html>
      HTML
    )

    Crawlscope::Rules::Metadata.new.call(
      urls: [local_page.url],
      pages: [local_page],
      issues: issues
    )

    refute_includes issues.to_a.map(&:code), :canonical_mismatch
  end

  private

  # Builds a Crawlscope::Page from a URL and HTML body. The default body
  # is deliberately flawed (short description, duplicate <h1>, incomplete
  # Open Graph tags) to exercise the rule's failure paths.
  def page(url: "https://example.com/about", body: nil)
    body ||= <<~HTML
      <html>
        <head>
          <title>About</title>
          <meta name="description" content="Too short">
          <link rel="canonical" href="https://example.com/about">
          <meta property="og:title" content="About">
        </head>
        <body><main><h1>About</h1><h1>Team</h1></main></body>
      </html>
    HTML

    Crawlscope::Page.new(
      url: url,
      normalized_url: Crawlscope::Url.normalize(url, base_url: url),
      final_url: url,
      normalized_final_url: Crawlscope::Url.normalize(url, base_url: url),
      status: 200,
      headers: {"content-type" => "text/html"},
      body: body,
      doc: Nokogiri::HTML(body)
    )
  end
end
Loading