diff --git a/lib/crawlscope/rules/structured_data.rb b/lib/crawlscope/rules/structured_data.rb index ce063ed..1d19719 100644 --- a/lib/crawlscope/rules/structured_data.rb +++ b/lib/crawlscope/rules/structured_data.rb @@ -3,6 +3,8 @@ module Crawlscope module Rules class StructuredData + CAREER_DETAIL_PATH = %r{/careers/[^/]+/?\z} + attr_reader :code def initialize @@ -65,6 +67,51 @@ def validate_page(page, issues, schema_registry) details: {errors: errors, source: source} ) end + + validate_job_posting_count(page, items, issues) + end + + def validate_job_posting_count(page, items, issues) + job_postings = items.select { |item| structured_data_types(item.data).include?("JobPosting") } + return if job_postings.size == 1 + + if job_postings.size > 1 + issues.add( + code: :multiple_job_postings, + severity: :warning, + category: :structured_data, + url: page.url, + message: "multiple JobPosting structured data blocks found", + details: {count: job_postings.size} + ) + elsif career_detail_page?(page.url) + issues.add( + code: :missing_job_posting, + severity: :warning, + category: :structured_data, + url: page.url, + message: "career detail page missing JobPosting structured data", + details: {expected_type: "JobPosting"} + ) + end + end + + def structured_data_types(data) + return [] unless data.is_a?(Hash) + + types = Array(data["@type"]).map(&:to_s) + + if data["@graph"].is_a?(Array) + types.concat(data["@graph"].flat_map { |entry| structured_data_types(entry) }) + end + + types + end + + def career_detail_page?(url) + URI(url).path.match?(CAREER_DETAIL_PATH) + rescue URI::InvalidURIError + false end end end diff --git a/lib/crawlscope/schemas.rb b/lib/crawlscope/schemas.rb index 2cee548..dd8ddc5 100644 --- a/lib/crawlscope/schemas.rb +++ b/lib/crawlscope/schemas.rb @@ -330,6 +330,56 @@ class Schemas } }.freeze + JOB_POSTING = { + type: "object", + additionalProperties: true, + required: ["@type", "title", "description", "datePosted", "hiringOrganization"], + properties: { + "@context" => {enum: ["https://schema.org", "https://schema.org/"]}, + "@type" => {const: "JobPosting"}, + :title => {type: "string"}, + :description => {type: "string"}, + :identifier => {type: "object"}, + :datePosted => {type: "string"}, + :validThrough => {type: "string"}, + :employmentType => { + anyOf: [ + {type: "string"}, + {type: "array", minItems: 1, items: {type: "string"}} + ] + }, + :directApply => {type: "boolean"}, + :hiringOrganization => { + type: "object", + required: ["@type", "name"], + properties: { + "@type" => {const: "Organization"}, + :name => {type: "string"}, + :sameAs => {type: "string", format: "uri"}, + :logo => {type: "string", format: "uri"} + } + }, + :applicantLocationRequirements => { + anyOf: [ + {type: "object"}, + {type: "array", minItems: 1, items: {type: "object"}} + ] + }, + :jobLocationType => {type: "string"}, + :jobLocation => { + anyOf: [ + {type: "object"}, + {type: "array", minItems: 1, items: {type: "object"}} + ] + }, + :baseSalary => {type: "object"} + }, + anyOf: [ + {required: ["jobLocation"]}, + {required: ["jobLocationType", "applicantLocationRequirements"]} + ] + }.freeze + def self.schemas { "FAQPage" => FAQ_PAGE, @@ -348,7 +398,8 @@ def self.schemas "Recipe" => RECIPE, "Event" => EVENT, "VideoObject" => VIDEO_OBJECT, - "WebPage" => WEB_PAGE + "WebPage" => WEB_PAGE, + "JobPosting" => JOB_POSTING } end end diff --git a/test/crawlscope/structured_data_rule_test.rb b/test/crawlscope/structured_data_rule_test.rb index 0ec3d29..dd9e10c 100644 --- a/test/crawlscope/structured_data_rule_test.rb +++ b/test/crawlscope/structured_data_rule_test.rb @@ -79,6 +79,97 @@ def test_reports_missing_structured_data_for_html_pages assert_equal ["json-ld", "microdata"], issues.to_a.first.details[:expected_sources] end + def test_validates_job_posting_markup + issues = Crawlscope::IssueCollection.new + rule = Crawlscope::Rules::StructuredData.new + page = page( + url: "https://example.com/careers/sales-partner", + body: <<~HTML + + + + +

Sales Partner

+ + HTML + ) + + rule.call( + urls: [page.url], + pages: [page], + issues: issues, + context: {schema_registry: Crawlscope::SchemaRegistry.default} + ) + + assert_empty issues.to_a + end + + def test_reports_schema_errors_for_invalid_job_posting_markup + issues = Crawlscope::IssueCollection.new + rule = Crawlscope::Rules::StructuredData.new + page = page( + url: "https://example.com/careers/sales-partner", + body: <<~HTML + + + + +

Sales Partner

+ + HTML + ) + + rule.call( + urls: [page.url], + pages: [page], + issues: issues, + context: {schema_registry: Crawlscope::SchemaRegistry.default} + ) + + assert_equal [:structured_data_schema_error], issues.to_a.map(&:code) + assert_includes issues.to_a.first.message, "description" + end + + def test_reports_missing_job_posting_for_career_detail_pages + issues = Crawlscope::IssueCollection.new + rule = Crawlscope::Rules::StructuredData.new + page = page( + url: "https://example.com/careers/sales-partner", + body: <<~HTML + + + + +

Sales Partner

+ + HTML + ) + + rule.call( + urls: [page.url], + pages: [page], + issues: issues, + context: {schema_registry: Crawlscope::SchemaRegistry.default} + ) + + assert_equal [:missing_job_posting], issues.to_a.map(&:code) + end + private def page(url:, body:)