Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ bin/rails crawlscope:validate:metadata
bin/rails crawlscope:validate:structured_data
bin/rails crawlscope:validate:uniqueness
bin/rails crawlscope:validate:links
bin/rails crawlscope:validate:ldjson URL=https://example.com/article
bin/rails crawlscope:validate:ldjson
```

The same validation surface is also available in the gem repository itself through plain `rake`:
Expand All @@ -163,7 +163,7 @@ bundle exec rake crawlscope:validate:ldjson URL=https://example.com/article

`crawlscope:validate` runs all default sitemap rules: metadata, structured data, uniqueness, and links. `URL` is the site base. Without `SITEMAP`, Crawlscope uses `/sitemap.xml`. With `SITEMAP`, Crawlscope uses `URL` as the site base and validates URLs from that sitemap. `SITEMAP` may be a full URL or a local file path.

`crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap.
`crawlscope:validate:ldjson` is separate because it directly checks the URL or semicolon-separated URLs in `URL`; it does not crawl the sitemap. Without `URL`, it checks the configured base URL, falling back to `http://localhost:3000`.

### Structured Data URL Audit

Expand Down
2 changes: 1 addition & 1 deletion Rakefile
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ namespace :crawlscope do
end

namespace :validate do
desc "Directly validate JSON-LD on one or more URLs. ENV: URL (required, semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
desc "Directly validate JSON-LD on one or more URLs. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
task :ldjson do
Crawlscope::RakeTasks.ldjson
end
Expand Down
5 changes: 5 additions & 0 deletions lib/crawlscope/cli.rb
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ def run_ldjson
parser.parse!(@argv)

urls = options[:urls].map(&:strip).reject(&:empty?)
urls = default_urls if urls.empty?
raise ConfigurationError, "Crawlscope URL is not configured" if urls.empty?

configure_renderer(options[:renderer])
Expand Down Expand Up @@ -238,6 +239,10 @@ def resolved_urls_from_env
raw_urls.split(";").map(&:strip).reject(&:empty?)
end

# Builds the URL list to use when the caller supplied none.
#
# Returns a one-element Array holding the configured base URL (as returned by
# `normalized_string`), or "http://localhost:3000" when that helper yields
# nil/false.
def default_urls
  configured_base = normalized_string(@configuration.base_url)
  return [configured_base] if configured_base

  ["http://localhost:3000"]
end

# Returns the memoized runner held in @task.
#
# When @task is already set it is returned untouched; otherwise a Run is
# built from the current configuration and a Reporter writing to @out, then
# cached for subsequent calls.
def task
  return @task if @task

  @task = Run.new(
    configuration: @configuration,
    reporter: Reporter.new(io: @out)
  )
end
Expand Down
2 changes: 1 addition & 1 deletion lib/tasks/crawlscope_tasks.rake
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ namespace :crawlscope do
end

namespace :validate do
desc "Directly validate JSON-LD on one or more URLs. ENV: URL (required, semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
desc "Directly validate JSON-LD on one or more URLs. ENV: URL (semicolon-separated), DEBUG=1, JS=1, TIMEOUT, NETWORK_IDLE_TIMEOUT, REPORT_PATH, SUMMARY=1"
task ldjson: :environment do
Crawlscope::RakeTasks.ldjson
end
Expand Down
24 changes: 19 additions & 5 deletions test/crawlscope/cli_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@

class CrawlscopeCliTest < Minitest::Test
class FakeConfiguration
attr_accessor :concurrency, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds
attr_accessor :base_url, :concurrency, :network_idle_timeout_seconds, :output, :renderer, :timeout_seconds

def initialize
@base_url = nil
@concurrency = 10
@network_idle_timeout_seconds = 5
@renderer = :http
Expand Down Expand Up @@ -145,6 +146,17 @@ def test_ldjson_reads_urls_from_environment
assert_empty err.string
end

# Running `ldjson` with no URL arguments should succeed and hand the
# configuration's base_url to the task as the only URL.
def test_ldjson_defaults_to_configured_base_url
  config = FakeConfiguration.new
  config.base_url = "https://example.com"
  fake_task = FakeTask.new

  exit_status = Crawlscope::Cli.start(
    ["ldjson"],
    out: StringIO.new,
    err: StringIO.new,
    configuration: config,
    task: fake_task
  )

  assert_equal 0, exit_status
  assert_equal ["https://example.com"], fake_task.json_ld_arguments[:urls]
end

def test_validate_caps_default_browser_concurrency
configuration = FakeConfiguration.new
task = FakeTask.new
Expand Down Expand Up @@ -218,14 +230,16 @@ def test_ldjson_accepts_repeated_urls_and_options
assert_equal 3, configuration.network_idle_timeout_seconds
end

def test_ldjson_requires_urls
def test_ldjson_defaults_to_localhost
out = StringIO.new
err = StringIO.new
task = FakeTask.new

status = Crawlscope::Cli.start(["ldjson"], out: out, err: err, configuration: FakeConfiguration.new, task: FakeTask.new)
status = Crawlscope::Cli.start(["ldjson"], out: out, err: err, configuration: FakeConfiguration.new, task: task)

assert_equal 1, status
assert_includes err.string, "Crawlscope URL is not configured"
assert_equal 0, status
assert_equal ["http://localhost:3000"], task.json_ld_arguments[:urls]
assert_empty err.string
end

def test_invalid_integer_option_returns_error
Expand Down
Loading