Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions lib/crawlscope/rules/structured_data.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
module Crawlscope
module Rules
class StructuredData
# Matches a URL path that ends with exactly one non-empty segment after
# "/careers/", with an optional trailing slash (e.g. "/careers/sales-partner").
# The pattern is anchored only at the end (\z), so any prefix before
# "/careers/" also matches.
CAREER_DETAIL_PATH = %r{/careers/[^/]+/?\z}

attr_reader :code

def initialize
Expand Down Expand Up @@ -65,6 +67,51 @@ def validate_page(page, issues, schema_registry)
details: {errors: errors, source: source}
)
end

validate_job_posting_count(page, items, issues)
end

# Checks how many structured data items on the page declare the JobPosting
# type and records a warning when that number is unexpected.
#
# * exactly one JobPosting     -> nothing is reported
# * more than one              -> :multiple_job_postings
# * none, on a career detail
#   page (per career_detail_page?) -> :missing_job_posting
# * none, on any other page    -> nothing is reported
#
# @param page   [#url] the crawled page being validated
# @param items  [Enumerable<#data>] structured data items extracted from the page
# @param issues [#add] collector that receives the warning, if any
def validate_job_posting_count(page, items, issues)
  posting_count = items.count { |item| structured_data_types(item.data).include?("JobPosting") }
  return if posting_count == 1

  # Pick the issue-specific fields; everything else is shared by both warnings.
  code, message, details =
    if posting_count > 1
      [
        :multiple_job_postings,
        "multiple JobPosting structured data blocks found",
        {count: posting_count}
      ]
    elsif career_detail_page?(page.url)
      [
        :missing_job_posting,
        "career detail page missing JobPosting structured data",
        {expected_type: "JobPosting"}
      ]
    end
  return unless code

  issues.add(
    code: code,
    severity: :warning,
    category: :structured_data,
    url: page.url,
    message: message,
    details: details
  )
end

# Collects every "@type" value declared by a structured data node, including
# the types of nodes nested (at any depth) under an "@graph" array.
#
# @param data [Object] a parsed structured data node; anything that is not a
#   Hash contributes no types
# @return [Array<String>] type names as strings, the node's own types first,
#   followed by those found in its "@graph" entries in order
def structured_data_types(data)
  return [] unless data.is_a?(Hash)

  # "@type" may be a single value or a list; Array() normalises both.
  own_types = Array(data["@type"]).map(&:to_s)

  graph = data["@graph"]
  return own_types unless graph.is_a?(Array)

  own_types + graph.flat_map { |node| structured_data_types(node) }
end

# Reports whether +url+ points at a career detail page, i.e. whether its
# path component matches CAREER_DETAIL_PATH.
#
# @param url [String, URI::Generic] the page URL to classify
# @return [Boolean] true when the path matches; false when it does not, when
#   the URL has no path, or when the URL cannot be parsed
def career_detail_page?(url)
  # URI#path is nil for opaque URIs such as "mailto:jobs@example.com";
  # to_s keeps those from raising NoMethodError on nil.
  URI(url).path.to_s.match?(CAREER_DETAIL_PATH)
rescue URI::InvalidURIError, ArgumentError
  # Kernel#URI raises ArgumentError (not InvalidURIError) for input that is
  # neither a String nor a URI, e.g. nil. Such a value is not a career page.
  false
end
end
end
Expand Down
53 changes: 52 additions & 1 deletion lib/crawlscope/schemas.rb
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,56 @@ class Schemas
}
}.freeze

# Validation schema for a schema.org JobPosting node, written with JSON
# Schema-style keywords (type / required / properties / anyOf / const / enum).
#
# A node must carry "@type", "title", "description", "datePosted" and
# "hiringOrganization", and must additionally describe where the job is done
# (see the trailing anyOf). Properties not listed here are allowed.
#
# NOTE(review): property keys mix strings ("@type") and symbols (:title);
# presumably the schema consumer normalises keys -- confirm against the
# other schema constants in this class.
JOB_POSTING = {
type: "object",
additionalProperties: true,
required: ["@type", "title", "description", "datePosted", "hiringOrganization"],
properties: {
# Only the https form of the schema.org context is accepted, with or
# without a trailing slash.
"@context" => {enum: ["https://schema.org", "https://schema.org/"]},
"@type" => {const: "JobPosting"},
:title => {type: "string"},
:description => {type: "string"},
:identifier => {type: "object"},
:datePosted => {type: "string"},
:validThrough => {type: "string"},
# A single employment type or a non-empty list of them.
:employmentType => {
anyOf: [
{type: "string"},
{type: "array", minItems: 1, items: {type: "string"}}
]
},
:directApply => {type: "boolean"},
# The hiring organization must be typed "Organization" and named;
# sameAs and logo are optional URI strings.
:hiringOrganization => {
type: "object",
required: ["@type", "name"],
properties: {
"@type" => {const: "Organization"},
:name => {type: "string"},
:sameAs => {type: "string", format: "uri"},
:logo => {type: "string", format: "uri"}
}
},
# One object or a non-empty list of objects.
:applicantLocationRequirements => {
anyOf: [
{type: "object"},
{type: "array", minItems: 1, items: {type: "object"}}
]
},
:jobLocationType => {type: "string"},
# One object or a non-empty list of objects.
:jobLocation => {
anyOf: [
{type: "object"},
{type: "array", minItems: 1, items: {type: "object"}}
]
},
:baseSalary => {type: "object"}
},
# Location rule: either a jobLocation is given, or the posting supplies both
# jobLocationType and applicantLocationRequirements.
anyOf: [
{required: ["jobLocation"]},
{required: ["jobLocationType", "applicantLocationRequirements"]}
]
}.freeze

def self.schemas
{
"FAQPage" => FAQ_PAGE,
Expand All @@ -348,7 +398,8 @@ def self.schemas
"Recipe" => RECIPE,
"Event" => EVENT,
"VideoObject" => VIDEO_OBJECT,
"WebPage" => WEB_PAGE
"WebPage" => WEB_PAGE,
"JobPosting" => JOB_POSTING
}
end
end
Expand Down
91 changes: 91 additions & 0 deletions test/crawlscope/structured_data_rule_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,97 @@ def test_reports_missing_structured_data_for_html_pages
assert_equal ["json-ld", "microdata"], issues.to_a.first.details[:expected_sources]
end

# A career detail page carrying one complete JobPosting block (remote role
# described via jobLocationType + applicantLocationRequirements) should
# produce no issues at all.
def test_validates_job_posting_markup
  collected = Crawlscope::IssueCollection.new
  structured_data_rule = Crawlscope::Rules::StructuredData.new

  html = <<~HTML
    <html>
    <head>
    <script type="application/ld+json">
    {
    "@context":"https://schema.org/",
    "@type":"JobPosting",
    "title":"Sales Partner",
    "description":"A real role description.",
    "datePosted":"2026-04-28",
    "hiringOrganization":{"@type":"Organization","name":"Example","sameAs":"https://example.com/","logo":"https://example.com/icon.png"},
    "jobLocationType":"TELECOMMUTE",
    "applicantLocationRequirements":[{"@type":"Country","name":"South Africa"}]
    }
    </script>
    </head>
    <body><h1>Sales Partner</h1></body>
    </html>
  HTML
  career_page = page(url: "https://example.com/careers/sales-partner", body: html)

  structured_data_rule.call(
    urls: [career_page.url],
    pages: [career_page],
    issues: collected,
    context: {schema_registry: Crawlscope::SchemaRegistry.default}
  )

  assert_empty collected.to_a
end

# A JobPosting block that only has a title should be reported as exactly one
# schema error whose message mentions the missing "description".
def test_reports_schema_errors_for_invalid_job_posting_markup
  collected = Crawlscope::IssueCollection.new
  structured_data_rule = Crawlscope::Rules::StructuredData.new

  html = <<~HTML
    <html>
    <head>
    <script type="application/ld+json">
    {"@context":"https://schema.org","@type":"JobPosting","title":"Sales Partner"}
    </script>
    </head>
    <body><h1>Sales Partner</h1></body>
    </html>
  HTML
  career_page = page(url: "https://example.com/careers/sales-partner", body: html)

  structured_data_rule.call(
    urls: [career_page.url],
    pages: [career_page],
    issues: collected,
    context: {schema_registry: Crawlscope::SchemaRegistry.default}
  )

  reported_codes = collected.to_a.map(&:code)
  assert_equal [:structured_data_schema_error], reported_codes

  first_message = collected.to_a.first.message
  assert_includes first_message, "description"
end

# A career detail URL whose only structured data is a WebPage block should be
# flagged with a single :missing_job_posting issue.
def test_reports_missing_job_posting_for_career_detail_pages
  collected = Crawlscope::IssueCollection.new
  structured_data_rule = Crawlscope::Rules::StructuredData.new

  html = <<~HTML
    <html>
    <head>
    <script type="application/ld+json">
    {"@context":"https://schema.org","@type":"WebPage","name":"Sales Partner"}
    </script>
    </head>
    <body><h1>Sales Partner</h1></body>
    </html>
  HTML
  career_page = page(url: "https://example.com/careers/sales-partner", body: html)

  structured_data_rule.call(
    urls: [career_page.url],
    pages: [career_page],
    issues: collected,
    context: {schema_registry: Crawlscope::SchemaRegistry.default}
  )

  reported_codes = collected.to_a.map(&:code)
  assert_equal [:missing_job_posting], reported_codes
end

private

def page(url:, body:)
Expand Down
Loading