From 9d76ff900a54952e1929d94a977f94b163be70a7 Mon Sep 17 00:00:00 2001 From: Paulo Fidalgo Date: Tue, 28 Apr 2026 23:47:22 +0300 Subject: [PATCH] fix: ldjson check now uses the same convention for default URL --- README.md | 4 ++-- Rakefile | 2 +- lib/crawlscope/cli.rb | 5 +++++ lib/tasks/crawlscope_tasks.rake | 2 +- test/crawlscope/cli_test.rb | 24 +++++++++++++++++++----- 5 files changed, 28 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 6f4b138..41eb851 100644 --- a/README.md +++ b/README.md @@ -150,7 +150,7 @@ bin/rails crawlscope:validate:metadata bin/rails crawlscope:validate:structured_data bin/rails crawlscope:validate:uniqueness bin/rails crawlscope:validate:links -bin/rails crawlscope:validate:ldjson URL=https://example.com/article +bin/rails crawlscope:validate:ldjson ``` The same validation surface is also available in the gem repository itself through plain `rake`: @@ -163,7 +163,7 @@ bundle exec rake crawlscope:validate:ldjson URL=https://example.com/article `crawlscope:validate` runs all default sitemap rules: metadata, structured data, uniqueness, and links. `URL` is the site base. Without `SITEMAP`, Crawlscope uses `/sitemap.xml`. With `SITEMAP`, Crawlscope uses `URL` as the site base and validates URLs from that sitemap. `SITEMAP` may be a full URL or a local file path. -`crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap. +`crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap. Without `URL`, it checks the configured base URL, falling back to `http://localhost:3000`. ### Structured Data URL Audit diff --git a/Rakefile b/Rakefile index 9cd1b1c..8c221c1 100644 --- a/Rakefile +++ b/Rakefile @@ -105,7 +105,7 @@ namespace :crawlscope do end namespace :validate do - desc "Directly validate JSON-LD on one or more URLs. ENV: URL (required, semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1" + desc "Directly validate JSON-LD on one or more URLs. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1" task :ldjson do Crawlscope::RakeTasks.ldjson end diff --git a/lib/crawlscope/cli.rb b/lib/crawlscope/cli.rb index 03c4010..41c01bc 100644 --- a/lib/crawlscope/cli.rb +++ b/lib/crawlscope/cli.rb @@ -105,6 +105,7 @@ def run_ldjson parser.parse!(@argv) urls = options[:urls].map(&:strip).reject(&:empty?) + urls = default_urls if urls.empty? raise ConfigurationError, "Crawlscope URL is not configured" if urls.empty? configure_renderer(options[:renderer]) @@ -238,6 +239,10 @@ def resolved_urls_from_env raw_urls.split(";").map(&:strip).reject(&:empty?) end + def default_urls + [normalized_string(@configuration.base_url) || "http://localhost:3000"] + end + def task @task ||= Run.new(configuration: @configuration, reporter: Reporter.new(io: @out)) end diff --git a/lib/tasks/crawlscope_tasks.rake b/lib/tasks/crawlscope_tasks.rake index a06dbe1..53aa3c1 100644 --- a/lib/tasks/crawlscope_tasks.rake +++ b/lib/tasks/crawlscope_tasks.rake @@ -5,7 +5,7 @@ namespace :crawlscope do end namespace :validate do - desc "Directly validate JSON-LD on one or more URLs. ENV: URL (required, semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1" + desc "Directly validate JSON-LD on one or more URLs. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1" task ldjson: :environment do Crawlscope::RakeTasks.ldjson end diff --git a/test/crawlscope/cli_test.rb b/test/crawlscope/cli_test.rb index d6b25d1..6f876d2 100644 --- a/test/crawlscope/cli_test.rb +++ b/test/crawlscope/cli_test.rb @@ -4,9 +4,10 @@ class CrawlscopeCliTest < Minitest::Test class FakeConfiguration - attr_accessor :concurrency, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds + attr_accessor :base_url, :concurrency, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds def initialize + @base_url = nil @concurrency = 10 @network_idle_timeout_seconds = 5 @renderer = :http @@ -145,6 +146,17 @@ def test_ldjson_reads_urls_from_environment assert_empty err.string end + def test_ldjson_defaults_to_configured_base_url + configuration = FakeConfiguration.new + configuration.base_url = "https://example.com" + task = FakeTask.new + + status = Crawlscope::Cli.start(["ldjson"], out: StringIO.new, err: StringIO.new, configuration: configuration, task: task) + + assert_equal 0, status + assert_equal ["https://example.com"], task.json_ld_arguments[:urls] + end + def test_validate_caps_default_browser_concurrency configuration = FakeConfiguration.new task = FakeTask.new @@ -218,14 +230,16 @@ def test_ldjson_accepts_repeated_urls_and_options assert_equal 3, configuration.network_idle_timeout_seconds end - def test_ldjson_requires_urls + def test_ldjson_defaults_to_localhost out = StringIO.new err = StringIO.new + task = FakeTask.new - status = Crawlscope::Cli.start(["ldjson"], out: out, err: err, configuration: FakeConfiguration.new, task: FakeTask.new) + status = Crawlscope::Cli.start(["ldjson"], out: out, err: err, configuration: FakeConfiguration.new, task: task) - assert_equal 1, status - assert_includes err.string, "Crawlscope URL is not configured" + assert_equal 0, status + assert_equal ["http://localhost:3000"], task.json_ld_arguments[:urls] + assert_empty err.string end def test_invalid_integer_option_returns_error