From 56da9c6b27b7b64f55e8384f66a0b2ae55ee10cb Mon Sep 17 00:00:00 2001 From: jdsika Date: Thu, 2 Apr 2026 17:22:31 +0200 Subject: [PATCH 1/5] fix(owlgen): warn on covering axiom edge cases for abstract classes Emit warnings for abstract class covering axiom edge cases: - Zero children: warn that no covering axiom will be generated - One child: warn that the covering axiom degenerates to an equivalence (Parent = Child), recommending --skip-abstract-class-as-unionof-subclasses Both axioms are still emitted when applicable (semantically correct per OWL 2), but warnings alert users who extend the ontology downstream. Tests verify warnings are logged, flag suppression works, the single-child covering axiom triple is correctly asserted, plus negative tests for multi-child and concrete class cases, and the mixin-only children edge case. Refs: linkml/linkml#3309, linkml/linkml#3219 Signed-off-by: jdsika --- .../linkml/src/linkml/generators/owlgen.py | 29 ++- tests/linkml/test_generators/test_owlgen.py | 167 ++++++++++++++++++ 2 files changed, 194 insertions(+), 2 deletions(-) diff --git a/packages/linkml/src/linkml/generators/owlgen.py b/packages/linkml/src/linkml/generators/owlgen.py index 33c58b0ec..3f4eb4c18 100644 --- a/packages/linkml/src/linkml/generators/owlgen.py +++ b/packages/linkml/src/linkml/generators/owlgen.py @@ -201,7 +201,11 @@ class OwlSchemaGenerator(Generator): one direct ``is_a`` child, the generator adds ``AbstractClass rdfs:subClassOf (Child1 or Child2 or …)``, expressing the open-world covering constraint that every instance of the abstract class must also be an instance of one of its - direct subclasses.""" + direct subclasses. + + .. note:: A warning is emitted when an abstract class has no children (no axiom generated) + or only one child (covering axiom degenerates to equivalence Parent ≡ Child). 
+ Use this flag to suppress covering axioms entirely if equivalence is undesired.""" def as_graph(self) -> Graph: """ @@ -471,6 +475,26 @@ def condition_to_bnode(expr: AnonymousClassExpression) -> BNode | None: # must be an instance of at least one of its direct subclasses. if cls.abstract and not self.skip_abstract_class_as_unionof_subclasses: children = sorted(sv.class_children(cls.name, imports=self.mergeimports, mixins=False, is_a=True)) + if not children: + logger.warning( + "Abstract class '%s' has no children. No covering axiom will be generated.", + cls.name, + ) + elif len(children) == 1: + # Warn: with one child C, the covering axiom degenerates to + # Parent ⊑ C which, combined with C ⊑ Parent (from is_a), + # creates Parent ≡ C (equivalence). This is semantically + # correct per OWL 2 but may be surprising for extensible + # ontologies where more children are added later. + logger.warning( + "Abstract class '%s' has only 1 direct child ('%s'). " + "The covering axiom makes them equivalent (%s ≡ %s). " + "Use --skip-abstract-class-as-unionof-subclasses to suppress.", + cls.name, + children[0], + cls.name, + children[0], + ) if children: child_uris = [self._class_uri(child) for child in children] union_node = self._union_of(child_uris) @@ -1569,7 +1593,8 @@ def slot_owl_type(self, slot: SlotDefinition) -> URIRef: show_default=True, help=( "If true, suppress rdfs:subClassOf owl:unionOf(subclasses) covering axioms for abstract classes. " - "By default such axioms are emitted for every abstract class that has direct is_a children." + "By default such axioms are emitted for every abstract class that has direct is_a children. " + "Note: warnings are emitted for abstract classes with zero children (no axiom) or one child (equivalence)." 
), ) @click.version_option(__version__, "-V", "--version") diff --git a/tests/linkml/test_generators/test_owlgen.py b/tests/linkml/test_generators/test_owlgen.py index 062864721..74c96c3e6 100644 --- a/tests/linkml/test_generators/test_owlgen.py +++ b/tests/linkml/test_generators/test_owlgen.py @@ -1,3 +1,4 @@ +import logging from enum import Enum import pytest @@ -460,6 +461,172 @@ def test_abstract_class_without_subclasses_gets_no_union_of_axiom(): assert _union_members(g, EX.Orphan) is None +def test_abstract_class_with_no_children_emits_warning(caplog): + """An abstract class with no children emits a warning about missing coverage. + + When an abstract class has zero subclasses, no covering axiom can be + generated. The warning alerts users that the class hierarchy is incomplete. + + See: mgskjaeveland's review on linkml/linkml#3309. + """ + sb = SchemaBuilder() + sb.add_class("Orphan", abstract=True) + sb.add_defaults() + + with caplog.at_level(logging.WARNING, logger="linkml.generators.owlgen"): + g = _owl_graph(sb) + + # No covering axiom emitted + assert _union_members(g, EX.Orphan) is None + + # But a warning must be logged + assert any("has no children" in msg for msg in caplog.messages), ( + "Expected a warning about abstract class with no children" + ) + assert any("No covering axiom" in msg for msg in caplog.messages), ( + "Warning should mention that no covering axiom will be generated" + ) + + +def test_no_children_warning_suppressed_by_skip_flag(caplog): + """When --skip-abstract-class-as-unionof-subclasses is set, no warning for zero children.""" + sb = SchemaBuilder() + sb.add_class("Orphan", abstract=True) + sb.add_defaults() + + with caplog.at_level(logging.WARNING, logger="linkml.generators.owlgen"): + _owl_graph(sb, skip_abstract_class_as_unionof_subclasses=True) + + assert not any("has no children" in msg for msg in caplog.messages) + + +def test_abstract_class_with_single_child_emits_warning(caplog): + """An abstract class with one child 
still gets a covering axiom but emits a warning. + + Per OWL 2 semantics, the covering axiom with a single child creates an + equivalence (Parent ≡ Child). This is logically correct but may surprise + users who plan to extend the ontology later. The generator should warn + and recommend ``--skip-abstract-class-as-unionof-subclasses``. + + See: W3C OWL 2 Primer §4.2 — bidirectional rdfs:subClassOf = equivalence. + See: mgskjaeveland's review on linkml/linkml#3309. + """ + sb = SchemaBuilder() + sb.add_class("GrandParent") + sb.add_class("Parent", is_a="GrandParent", abstract=True) + sb.add_class("Child", is_a="Parent") + sb.add_defaults() + + with caplog.at_level(logging.WARNING, logger="linkml.generators.owlgen"): + g = _owl_graph(sb) + + # Covering axiom IS still emitted (single child → equivalence is OWL-correct). + # With one child, _union_of returns the child URI directly (no owl:unionOf wrapper), + # so the covering axiom materialises as Parent rdfs:subClassOf Child. + # Combined with Child rdfs:subClassOf Parent (from is_a), this is the equivalence. + assert (EX.Parent, RDFS.subClassOf, EX.Child) in g, ( + "Covering axiom should produce Parent rdfs:subClassOf Child for single-child case" + ) + assert (EX.Child, RDFS.subClassOf, EX.Parent) in g + assert (EX.Parent, RDFS.subClassOf, EX.GrandParent) in g + + # But a warning must be logged + assert any("only 1 direct child" in msg for msg in caplog.messages), ( + "Expected a warning about single-child covering axiom creating equivalence" + ) + assert any("--skip-abstract-class-as-unionof-subclasses" in msg for msg in caplog.messages), ( + "Warning should recommend the skip flag" + ) + + +def test_single_child_warning_suppressed_by_skip_flag(caplog): + """When --skip-abstract-class-as-unionof-subclasses is set, no warning is emitted. + + The skip flag suppresses covering axioms entirely, so the single-child + equivalence case never arises. 
+ """ + sb = SchemaBuilder() + sb.add_class("Parent", abstract=True) + sb.add_class("Child", is_a="Parent") + sb.add_defaults() + + with caplog.at_level(logging.WARNING, logger="linkml.generators.owlgen"): + g = _owl_graph(sb, skip_abstract_class_as_unionof_subclasses=True) + + # No covering axiom emitted + assert (EX.Parent, RDFS.subClassOf, EX.Child) not in g + # No warning either + assert not any("only 1 direct child" in msg for msg in caplog.messages) + + +def test_multiple_children_no_warning(caplog): + """An abstract class with 2+ children must NOT emit a warning. + + The covering axiom is a proper union (not a degenerate equivalence), + so no warning is needed. + """ + sb = SchemaBuilder() + sb.add_class("Animal", abstract=True) + sb.add_class("Dog", is_a="Animal") + sb.add_class("Cat", is_a="Animal") + sb.add_defaults() + + with caplog.at_level(logging.WARNING, logger="linkml.generators.owlgen"): + g = _owl_graph(sb) + + # Covering axiom emitted (proper union) + members = _union_members(g, EX.Animal) + assert members == {EX.Dog, EX.Cat} + + # No warning about children count + assert not any("has no children" in msg for msg in caplog.messages) + assert not any("only 1 direct child" in msg for msg in caplog.messages) + + +def test_non_abstract_class_no_warning(caplog): + """A non-abstract class must NOT emit covering axiom warnings. + + Covering axioms only apply to abstract classes. Concrete classes + should be silently skipped regardless of child count. 
+ """ + sb = SchemaBuilder() + sb.add_class("Parent") # not abstract + sb.add_class("Child", is_a="Parent") + sb.add_defaults() + + with caplog.at_level(logging.WARNING, logger="linkml.generators.owlgen"): + g = _owl_graph(sb) + + # No covering axiom for non-abstract class + assert _union_members(g, EX.Parent) is None + assert (EX.Parent, RDFS.subClassOf, EX.Child) not in g + + # No warning either + assert not any("has no children" in msg for msg in caplog.messages) + assert not any("only 1 direct child" in msg for msg in caplog.messages) + + +def test_abstract_class_with_only_mixin_children_emits_warning(caplog): + """An abstract class whose only children are via mixins (not is_a) gets the no-children warning. + + The covering axiom only considers direct is_a children (not mixins). + If an abstract class has mixin children but no is_a children, it should + warn about having no children for covering axiom purposes. + """ + sb = SchemaBuilder() + sb.add_class("Base", abstract=True) + sb.add_class("MixinChild", mixins=["Base"]) + sb.add_defaults() + + with caplog.at_level(logging.WARNING, logger="linkml.generators.owlgen"): + g = _owl_graph(sb) + + assert _union_members(g, EX.Base) is None + assert any("has no children" in msg for msg in caplog.messages), ( + "Abstract class with only mixin children should warn about no is_a children" + ) + + @pytest.mark.parametrize("skip", [False, True]) def test_union_of_axiom_only_covers_direct_children(skip: bool): """Union-of axiom lists only direct is_a children, not grandchildren. From e66f9631dcb1d58e28d6a312b1ec72623d327719 Mon Sep 17 00:00:00 2001 From: jdsika Date: Thu, 2 Apr 2026 17:20:24 +0200 Subject: [PATCH 2/5] feat(generators): add --exclude-external-imports flag Add a dedicated --exclude-external-imports / --no-exclude-external-imports CLI flag to control whether external vocabulary terms are included in generated artifacts when --no-mergeimports is set. 
Previously external terms leaked into JSON-LD contexts even with
--no-mergeimports. The new flag explicitly suppresses terms whose
class_uri or slot_uri belong to an imported (external) schema.

Tests cover linkml:types built-in import preservation, local file
import preservation, and interaction with mergeimports=False.

Signed-off-by: jdsika
---
 .../src/linkml/generators/jsonldcontextgen.py |  54 +++-
 .../test_generators/test_jsonldcontextgen.py  | 291 ++++++++++++++++++
 2 files changed, 342 insertions(+), 3 deletions(-)

diff --git a/packages/linkml/src/linkml/generators/jsonldcontextgen.py b/packages/linkml/src/linkml/generators/jsonldcontextgen.py
index 60eaa9ffd..306783dd8 100644
--- a/packages/linkml/src/linkml/generators/jsonldcontextgen.py
+++ b/packages/linkml/src/linkml/generators/jsonldcontextgen.py
@@ -56,8 +56,22 @@ class ContextGenerator(Generator):
     fix_multivalue_containers: bool | None = False
     exclude_imports: bool = False
     """If True, elements from imported schemas won't be included in the generated context"""
+    exclude_external_imports: bool = False
+    """If True, elements from URL-based external vocabulary imports are excluded.
+
+    Local file imports and linkml standard imports are kept. This is useful
+    when extending an external ontology (e.g. W3C Verifiable Credentials)
+    whose terms are ``@protected`` in their own JSON-LD context — redefining
+    them locally would violate JSON-LD 1.1 §4.1.11.
+
+    Note: this flag is also effective when ``mergeimports=False`` —
+    external vocabulary elements could otherwise still leak into the
+    generated context via the schema_map.
+ """ _local_classes: set | None = field(default=None, repr=False) _local_slots: set | None = field(default=None, repr=False) + _external_classes: set | None = field(default=None, repr=False) + _external_slots: set | None = field(default=None, repr=False) # Framing (opt-in via CLI flag) emit_frame: bool = False @@ -69,7 +83,7 @@ def __post_init__(self) -> None: super().__post_init__() if self.namespaces is None: raise TypeError("Schema text must be supplied to context generator. Preparsed schema will not work") - if self.exclude_imports: + if self.exclude_imports or self.exclude_external_imports: if self.schemaview: sv = self.schemaview else: @@ -77,8 +91,31 @@ def __post_init__(self) -> None: if isinstance(source, str) and self.base_dir and not Path(source).is_absolute(): source = str(Path(self.base_dir) / source) sv = SchemaView(source, importmap=self.importmap, base_dir=self.base_dir) - self._local_classes = set(sv.all_classes(imports=False).keys()) - self._local_slots = set(sv.all_slots(imports=False).keys()) + if self.exclude_imports: + self._local_classes = set(sv.all_classes(imports=False).keys()) + self._local_slots = set(sv.all_slots(imports=False).keys()) + if self.exclude_external_imports: + self._external_classes, self._external_slots = self._collect_external_elements(sv) + + @staticmethod + def _collect_external_elements(sv: SchemaView) -> tuple[set[str], set[str]]: + """Identify classes and slots from URL-based external vocabulary imports. + + Walks the SchemaView ``schema_map`` (populated by ``imports_closure``) + and collects element names from schemas whose import key starts with + ``http://`` or ``https://``. Local file imports and ``linkml:`` + standard imports are left untouched. 
+ """ + sv.imports_closure() + external_classes: set[str] = set() + external_slots: set[str] = set() + for schema_key, schema_def in sv.schema_map.items(): + if schema_key == sv.schema.name: + continue + if schema_key.startswith("http://") or schema_key.startswith("https://"): + external_classes.update(schema_def.classes.keys()) + external_slots.update(schema_def.slots.keys()) + return external_classes, external_slots def visit_schema(self, base: str | Namespace | None = None, output: str | None = None, **_): # Add any explicitly declared prefixes @@ -194,6 +231,8 @@ def end_schema( def visit_class(self, cls: ClassDefinition) -> bool: if self.exclude_imports and cls.name not in self._local_classes: return False + if self.exclude_external_imports and cls.name in self._external_classes: + return False class_def = {} cn = camelcase(cls.name) @@ -246,6 +285,8 @@ def _literal_coercion_for_ranges(self, ranges: list[str]) -> tuple[bool, str | N def visit_slot(self, aliased_slot_name: str, slot: SlotDefinition) -> None: if self.exclude_imports and slot.name not in self._local_slots: return + if self.exclude_external_imports and slot.name in self._external_slots: + return if slot.identifier: slot_def = "@id" @@ -390,6 +431,13 @@ def serialize( help="Use --exclude-imports to exclude imported elements from the generated JSON-LD context. This is useful when " "extending an ontology whose terms already have context definitions in their own JSON-LD context file.", ) +@click.option( + "--exclude-external-imports/--no-exclude-external-imports", + default=False, + show_default=True, + help="Exclude elements from URL-based external vocabulary imports while keeping local file imports. " + "Useful when extending ontologies (e.g. 
W3C VC v2) whose terms are @protected in their own JSON-LD context.", +) @click.version_option(__version__, "-V", "--version") def cli(yamlfile, emit_frame, embed_context_in_frame, output, **args): """Generate jsonld @context definition from LinkML model""" diff --git a/tests/linkml/test_generators/test_jsonldcontextgen.py b/tests/linkml/test_generators/test_jsonldcontextgen.py index 6de23347a..ff5b75e66 100644 --- a/tests/linkml/test_generators/test_jsonldcontextgen.py +++ b/tests/linkml/test_generators/test_jsonldcontextgen.py @@ -1,4 +1,5 @@ import json +import textwrap import pytest from click.testing import CliRunner @@ -571,3 +572,293 @@ def test_exclude_imports(input_path): # Imported class and slot must NOT be present assert "BaseClass" not in ctx, "Imported class 'BaseClass' must not appear in exclude-imports context" assert "baseProperty" not in ctx, "Imported slot 'baseProperty' must not appear in exclude-imports context" + + +@pytest.mark.parametrize("mergeimports", [True, False], ids=["merge", "no-merge"]) +def test_exclude_external_imports(tmp_path, mergeimports): + """With --exclude-external-imports, elements from URL-based external + vocabulary imports must not appear in the generated JSON-LD context, + while local file imports and linkml standard imports are kept. + + When a schema imports terms from an external vocabulary (e.g. W3C VC + v2), those terms already have context definitions in their own JSON-LD + context file. Re-defining them in the local context can conflict with + @protected term definitions from the external context (JSON-LD 1.1 + section 4.1.11). 
+ """ + ext_dir = tmp_path / "ext" + ext_dir.mkdir() + (ext_dir / "external_vocab.yaml").write_text( + textwrap.dedent("""\ + id: https://example.org/external-vocab + name: external_vocab + default_prefix: ext + prefixes: + linkml: https://w3id.org/linkml/ + ext: https://example.org/external-vocab/ + imports: + - linkml:types + slots: + issuer: + slot_uri: ext:issuer + range: string + validFrom: + slot_uri: ext:validFrom + range: date + classes: + ExternalCredential: + class_uri: ext:ExternalCredential + slots: + - issuer + - validFrom + """), + encoding="utf-8", + ) + + (tmp_path / "main.yaml").write_text( + textwrap.dedent("""\ + id: https://example.org/main + name: main + default_prefix: main + prefixes: + linkml: https://w3id.org/linkml/ + main: https://example.org/main/ + ext: https://example.org/external-vocab/ + imports: + - linkml:types + - https://example.org/external-vocab + slots: + localName: + slot_uri: main:localName + range: string + classes: + LocalThing: + class_uri: main:LocalThing + slots: + - localName + """), + encoding="utf-8", + ) + + importmap = {"https://example.org/external-vocab": str(ext_dir / "external_vocab")} + + context_text = ContextGenerator( + str(tmp_path / "main.yaml"), + exclude_external_imports=True, + mergeimports=mergeimports, + importmap=importmap, + base_dir=str(tmp_path), + ).serialize() + context = json.loads(context_text) + ctx = context["@context"] + + # Local terms must be present + assert "localName" in ctx or "local_name" in ctx, ( + f"Local slot missing with mergeimports={mergeimports}, got: {list(ctx.keys())}" + ) + assert "LocalThing" in ctx, f"Local class missing with mergeimports={mergeimports}, got: {list(ctx.keys())}" + + # External vocabulary terms must NOT be present + assert "issuer" not in ctx, f"External slot 'issuer' present with mergeimports={mergeimports}" + assert "validFrom" not in ctx and "valid_from" not in ctx, ( + f"External slot 'validFrom' present with mergeimports={mergeimports}" + ) + assert 
"ExternalCredential" not in ctx, ( + f"External class 'ExternalCredential' present with mergeimports={mergeimports}" + ) + + +def test_exclude_external_imports_preserves_linkml_types(tmp_path): + """linkml:types (standard library import) must NOT be treated as external. + + The ``linkml:types`` import resolves to a URL internally + (``https://w3id.org/linkml/types``), but it is a standard LinkML import, + not a user-declared external vocabulary. The ``_collect_external_elements`` + method filters by ``schema_key.startswith("http")`` — this test verifies + that linkml built-in types (string, integer, date, etc.) survive the filter. + """ + (tmp_path / "schema.yaml").write_text( + textwrap.dedent("""\ + id: https://example.org/test + name: test_linkml_types + default_prefix: ex + prefixes: + linkml: https://w3id.org/linkml/ + ex: https://example.org/ + imports: + - linkml:types + slots: + name: + slot_uri: ex:name + range: string + age: + slot_uri: ex:age + range: integer + classes: + Person: + class_uri: ex:Person + slots: + - name + - age + """), + encoding="utf-8", + ) + + context_text = ContextGenerator( + str(tmp_path / "schema.yaml"), + exclude_external_imports=True, + ).serialize() + ctx = json.loads(context_text)["@context"] + + # Local classes and slots must be present + assert "Person" in ctx, f"Local class 'Person' missing, got: {list(ctx.keys())}" + assert "name" in ctx, f"Local slot 'name' missing, got: {list(ctx.keys())}" + assert "age" in ctx, f"Local slot 'age' missing, got: {list(ctx.keys())}" + + +def test_exclude_external_imports_preserves_local_file_imports(tmp_path): + """Local file imports (non-URL) must be preserved when exclude_external_imports is set. + + Only URL-based imports (http:// or https://) are considered external. + File-path imports between local schemas must remain in the context. 
+ """ + local_dir = tmp_path / "local" + local_dir.mkdir() + (local_dir / "base.yaml").write_text( + textwrap.dedent("""\ + id: https://example.org/base + name: base + default_prefix: base + prefixes: + linkml: https://w3id.org/linkml/ + base: https://example.org/base/ + imports: + - linkml:types + slots: + baseField: + slot_uri: base:baseField + range: string + classes: + BaseRecord: + class_uri: base:BaseRecord + slots: + - baseField + """), + encoding="utf-8", + ) + + (tmp_path / "main.yaml").write_text( + textwrap.dedent("""\ + id: https://example.org/main + name: main + default_prefix: main + prefixes: + linkml: https://w3id.org/linkml/ + main: https://example.org/main/ + base: https://example.org/base/ + imports: + - linkml:types + - local/base + slots: + localField: + slot_uri: main:localField + range: string + classes: + MainRecord: + class_uri: main:MainRecord + slots: + - localField + """), + encoding="utf-8", + ) + + context_text = ContextGenerator( + str(tmp_path / "main.yaml"), + exclude_external_imports=True, + mergeimports=True, + base_dir=str(tmp_path), + ).serialize() + ctx = json.loads(context_text)["@context"] + + # Local file import terms must be present + assert "MainRecord" in ctx, f"Local class 'MainRecord' missing, got: {list(ctx.keys())}" + assert "BaseRecord" in ctx, f"Local-file-imported class 'BaseRecord' missing, got: {list(ctx.keys())}" + assert "baseField" in ctx or "base_field" in ctx, ( + f"Local-file-imported slot 'baseField' missing, got: {list(ctx.keys())}" + ) + + +def test_exclude_external_imports_works_with_mergeimports_false(tmp_path): + """exclude_external_imports is effective even when mergeimports=False. + + Although mergeimports=False prevents most imported elements from appearing, + external vocabulary elements can still leak into the context via the + schema_map. The exclude_external_imports flag catches these. 
+ """ + ext_dir = tmp_path / "ext" + ext_dir.mkdir() + (ext_dir / "external_vocab.yaml").write_text( + textwrap.dedent("""\ + id: https://example.org/external-vocab + name: external_vocab + default_prefix: ext + prefixes: + linkml: https://w3id.org/linkml/ + ext: https://example.org/external-vocab/ + imports: + - linkml:types + slots: + issuer: + slot_uri: ext:issuer + range: string + classes: + ExternalCredential: + class_uri: ext:ExternalCredential + slots: + - issuer + """), + encoding="utf-8", + ) + + (tmp_path / "main.yaml").write_text( + textwrap.dedent("""\ + id: https://example.org/main + name: main + default_prefix: main + prefixes: + linkml: https://w3id.org/linkml/ + main: https://example.org/main/ + ext: https://example.org/external-vocab/ + imports: + - linkml:types + - https://example.org/external-vocab + slots: + localName: + slot_uri: main:localName + range: string + classes: + LocalThing: + class_uri: main:LocalThing + slots: + - localName + """), + encoding="utf-8", + ) + + importmap = {"https://example.org/external-vocab": str(ext_dir / "external_vocab")} + + ctx_text = ContextGenerator( + str(tmp_path / "main.yaml"), + exclude_external_imports=True, + mergeimports=False, + importmap=importmap, + base_dir=str(tmp_path), + ).serialize() + ctx = json.loads(ctx_text)["@context"] + + # Local terms must still be present + assert "LocalThing" in ctx, f"Local class missing, got: {list(ctx.keys())}" + + # External vocabulary terms must be excluded + assert "issuer" not in ctx, "External slot 'issuer' should be excluded with mergeimports=False" + assert "ExternalCredential" not in ctx, "External class should be excluded with mergeimports=False" From 76c0082eae34d9033b781841fd470525bd5bb1b6 Mon Sep 17 00:00:00 2001 From: jdsika Date: Thu, 2 Apr 2026 17:03:54 +0200 Subject: [PATCH 3/5] fix(generators): add --xsd-anyuri-as-iri flag for cross-generator IRI consistency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
JSON-LD processors treat xsd:anyURI as an opaque string literal, so range:uri/uriorcurie slots get xsd:anyURI coercion instead of proper IRI node semantics (@type:@id, owl:ObjectProperty, sh:IRI). Add an opt-in --xsd-anyuri-as-iri flag that promotes xsd:anyURI ranges to IRI semantics across all three generators: - JSON-LD context: @type: xsd:anyURI → @type: @id - OWL: DatatypeProperty → ObjectProperty (no rdfs:range restriction) - SHACL: sh:datatype xsd:anyURI → sh:nodeKind sh:IRI The flag only affects types whose XSD mapping is xsd:anyURI (uri and uriorcurie). The curie type (xsd:string) is correctly excluded via is_xsd_anyuri_range() to maintain cross-generator consistency. Standards basis: - OWL 2 §5.3-5.4 (ObjectProperty vs DatatypeProperty) - SHACL §4.8.1 (sh:nodeKind sh:IRI) - JSON-LD 1.1 §4.2.2 (type coercion with @id) - RDF 1.1 §3.2-3.3 (IRIs as first-class nodes, not string literals) Signed-off-by: jdsika --- .../linkml/generators/common/subproperty.py | 33 ++ .../src/linkml/generators/jsonldcontextgen.py | 22 +- .../linkml/src/linkml/generators/owlgen.py | 56 ++- .../test_generators/test_jsonldcontextgen.py | 373 +++++++++++++++++- 4 files changed, 472 insertions(+), 12 deletions(-) diff --git a/packages/linkml/src/linkml/generators/common/subproperty.py b/packages/linkml/src/linkml/generators/common/subproperty.py index 4687c3821..9b136e242 100644 --- a/packages/linkml/src/linkml/generators/common/subproperty.py +++ b/packages/linkml/src/linkml/generators/common/subproperty.py @@ -15,6 +15,10 @@ CURIE_TYPES: frozenset[str] = frozenset({"uriorcurie", "curie"}) URI_TYPES: frozenset[str] = frozenset({"uri"}) +# Types whose XSD mapping is xsd:anyURI (not xsd:string). +# ``curie`` maps to xsd:string and is deliberately excluded. 
+_ANYURI_TYPES: frozenset[str] = frozenset({"uri", "uriorcurie"}) + def is_uri_range(sv: SchemaView, range_type: str | None) -> bool: """ @@ -63,6 +67,35 @@ def is_curie_range(sv: SchemaView, range_type: str | None) -> bool: return False +def is_xsd_anyuri_range(sv: SchemaView, range_type: str | None) -> bool: + """Check if range type resolves to ``xsd:anyURI``. + + Returns True for ``uri``, ``uriorcurie``, and types that inherit from them. + Returns False for ``curie`` (which maps to ``xsd:string``). + + This is the correct predicate for the ``--xsd-anyuri-as-iri`` flag: only + types whose XSD representation is ``xsd:anyURI`` should be promoted from + literal to IRI semantics. ``curie`` is a compact string representation + that resolves to ``xsd:string`` and must not be affected. + + :param sv: SchemaView for type ancestry lookup + :param range_type: The range type to check + :return: True if range type maps to xsd:anyURI + """ + if range_type is None: + return False + + if range_type in _ANYURI_TYPES: + return True + + if range_type in sv.all_types(): + type_ancestors = set(sv.type_ancestors(range_type)) + if type_ancestors & _ANYURI_TYPES: + return True + + return False + + def format_slot_value_for_range(sv: SchemaView, slot_name: str, range_type: str | None) -> str: """ Format slot value according to the declared range type. diff --git a/packages/linkml/src/linkml/generators/jsonldcontextgen.py b/packages/linkml/src/linkml/generators/jsonldcontextgen.py index 306783dd8..d18c88fff 100644 --- a/packages/linkml/src/linkml/generators/jsonldcontextgen.py +++ b/packages/linkml/src/linkml/generators/jsonldcontextgen.py @@ -23,6 +23,10 @@ URI_RANGES = (SHEX.nonliteral, SHEX.bnode, SHEX.iri) +# Extended URI_RANGES that also treats xsd:anyURI as an IRI reference (@id) +# rather than a typed literal. Opt-in via --xsd-anyuri-as-iri flag. 
+URI_RANGES_WITH_XSD = (*URI_RANGES, XSD.anyURI) + ENUM_CONTEXT = { "text": "skos:notation", "description": "skos:prefLabel", @@ -72,6 +76,12 @@ class ContextGenerator(Generator): _local_slots: set | None = field(default=None, repr=False) _external_classes: set | None = field(default=None, repr=False) _external_slots: set | None = field(default=None, repr=False) + xsd_anyuri_as_iri: bool = False + """Map xsd:anyURI-typed ranges (uri, uriorcurie) to ``@type: @id`` instead of ``@type: xsd:anyURI``. + + This aligns the JSON-LD context with the SHACL generator, which emits + ``sh:nodeKind sh:IRI`` for the same types. + """ # Framing (opt-in via CLI flag) emit_frame: bool = False @@ -263,6 +273,7 @@ def _literal_coercion_for_ranges(self, ranges: list[str]) -> tuple[bool, str | N and "could not resolve safely because the branches disagree". """ coercions: set[str | None] = set() + uri_ranges = URI_RANGES_WITH_XSD if self.xsd_anyuri_as_iri else URI_RANGES for range_name in ranges: if range_name not in self.schema.types: continue @@ -271,7 +282,7 @@ def _literal_coercion_for_ranges(self, ranges: list[str]) -> tuple[bool, str | N range_uri = self.namespaces.uri_for(range_type.uri) if range_uri == XSD.string: coercions.add(None) - elif range_uri in URI_RANGES: + elif range_uri in uri_ranges: coercions.add("@id") else: coercions.add(range_type.uri) @@ -316,9 +327,10 @@ def visit_slot(self, aliased_slot_name: str, slot: SlotDefinition) -> None: self.emit_prefixes.add(skos) else: range_type = self.schema.types[slot.range] + uri_ranges = URI_RANGES_WITH_XSD if self.xsd_anyuri_as_iri else URI_RANGES if self.namespaces.uri_for(range_type.uri) == XSD.string: pass - elif self.namespaces.uri_for(range_type.uri) in URI_RANGES: + elif self.namespaces.uri_for(range_type.uri) in uri_ranges: slot_def["@type"] = "@id" else: slot_def["@type"] = range_type.uri @@ -438,6 +450,12 @@ def serialize( help="Exclude elements from URL-based external vocabulary imports while keeping local file 
imports. " "Useful when extending ontologies (e.g. W3C VC v2) whose terms are @protected in their own JSON-LD context.", ) +@click.option( + "--xsd-anyuri-as-iri/--no-xsd-anyuri-as-iri", + default=False, + show_default=True, + help="Map xsd:anyURI-typed ranges (uri, uriorcurie) to @type: @id instead of @type: xsd:anyURI.", +) @click.version_option(__version__, "-V", "--version") def cli(yamlfile, emit_frame, embed_context_in_frame, output, **args): """Generate jsonld @context definition from LinkML model""" diff --git a/packages/linkml/src/linkml/generators/owlgen.py b/packages/linkml/src/linkml/generators/owlgen.py index 3f4eb4c18..8ae2da270 100644 --- a/packages/linkml/src/linkml/generators/owlgen.py +++ b/packages/linkml/src/linkml/generators/owlgen.py @@ -19,6 +19,7 @@ from linkml import METAMODEL_NAMESPACE_NAME from linkml._version import __version__ +from linkml.generators.common.subproperty import is_xsd_anyuri_range from linkml.utils.deprecation import deprecation_warning from linkml.utils.generator import Generator, shared_arguments from linkml_runtime import SchemaView @@ -207,6 +208,24 @@ class OwlSchemaGenerator(Generator): or only one child (covering axiom degenerates to equivalence Parent ≡ Child). Use this flag to suppress covering axioms entirely if equivalence is undesired.""" + xsd_anyuri_as_iri: bool = False + """Treat ``range: uri`` / ``range: uriorcurie`` slots as ``owl:ObjectProperty`` + instead of ``owl:DatatypeProperty`` with ``rdfs:range xsd:anyURI``. + + This aligns the OWL output with the SHACL generator (which emits + ``sh:nodeKind sh:IRI``) and the JSON-LD context generator (which emits + ``@type: @id`` when its own ``--xsd-anyuri-as-iri`` flag is set). + + Without this flag, ``range: uri`` produces a semantic inconsistency: + OWL says the value is a literal (``DatatypeProperty``), while SHACL and + JSON-LD say it is an IRI node. Enabling the flag makes all three + generators consistent. 
+ + When enabled, URI-range slots: + - become ``owl:ObjectProperty`` (not ``owl:DatatypeProperty``) + - have no ``rdfs:range`` restriction (any IRI is valid) + """ + def as_graph(self) -> Graph: """ Generate an rdflib Graph from the LinkML schema. @@ -770,14 +789,19 @@ def transform_class_slot_expression( this_owl_types = set() if range: if range in sv.all_types(imports=True): - self.slot_is_literal_map[main_slot.name].add(True) - this_owl_types.add(RDFS.Literal) - typ = sv.get_type(range) - if self.type_objects: - # TODO - owl_exprs.append(self._type_uri(typ.name)) + if self.xsd_anyuri_as_iri and is_xsd_anyuri_range(sv, range): + # xsd:anyURI ranges become ObjectProperty with no rdfs:range + self.slot_is_literal_map[main_slot.name].add(False) + this_owl_types.add(OWL.Thing) else: - owl_exprs.append(self._type_uri(typ.name)) + self.slot_is_literal_map[main_slot.name].add(True) + this_owl_types.add(RDFS.Literal) + typ = sv.get_type(range) + if self.type_objects: + # TODO + owl_exprs.append(self._type_uri(typ.name)) + else: + owl_exprs.append(self._type_uri(typ.name)) elif range in sv.all_enums(imports=True): # TODO: enums fill this in owl_exprs.append(self._enum_uri(EnumDefinitionName(range))) @@ -1354,8 +1378,9 @@ def _boolean_expression( def _range_is_datatype(self, slot: SlotDefinition) -> bool: if self.type_objects: return False - else: - return slot.range in self.schema.types + if self.xsd_anyuri_as_iri and is_xsd_anyuri_range(self.schemaview, slot.range): + return False + return slot.range in self.schema.types def _range_uri(self, slot: SlotDefinition) -> URIRef: if slot.range in self.schema.types: @@ -1474,6 +1499,8 @@ def slot_owl_type(self, slot: SlotDefinition) -> URIRef: elif range in sv.all_enums(): return OWL.ObjectProperty elif range in sv.all_types(): + if self.xsd_anyuri_as_iri and is_xsd_anyuri_range(sv, range): + return OWL.ObjectProperty return OWL.DatatypeProperty else: raise Exception(f"Unknown range: {slot.range}") @@ -1597,6 +1624,17 @@ def 
slot_owl_type(self, slot: SlotDefinition) -> URIRef: "Note: warnings are emitted for abstract classes with zero children (no axiom) or one child (equivalence)." ), ) +@click.option( + "--xsd-anyuri-as-iri/--no-xsd-anyuri-as-iri", + default=False, + show_default=True, + help=( + "Treat range: uri / range: uriorcurie slots as owl:ObjectProperty (IRI node) " + "instead of owl:DatatypeProperty with rdfs:range xsd:anyURI (literal). " + "Aligns OWL output with the SHACL generator (sh:nodeKind sh:IRI) and " + "the JSON-LD context generator (--xsd-anyuri-as-iri → @type: @id)." + ), +) @click.version_option(__version__, "-V", "--version") def cli(yamlfile, metadata_profile: str, **kwargs): """Generate an OWL representation of a LinkML model diff --git a/tests/linkml/test_generators/test_jsonldcontextgen.py b/tests/linkml/test_generators/test_jsonldcontextgen.py index ff5b75e66..7bae2eaa5 100644 --- a/tests/linkml/test_generators/test_jsonldcontextgen.py +++ b/tests/linkml/test_generators/test_jsonldcontextgen.py @@ -573,7 +573,6 @@ def test_exclude_imports(input_path): assert "BaseClass" not in ctx, "Imported class 'BaseClass' must not appear in exclude-imports context" assert "baseProperty" not in ctx, "Imported slot 'baseProperty' must not appear in exclude-imports context" - @pytest.mark.parametrize("mergeimports", [True, False], ids=["merge", "no-merge"]) def test_exclude_external_imports(tmp_path, mergeimports): """With --exclude-external-imports, elements from URL-based external @@ -862,3 +861,375 @@ def test_exclude_external_imports_works_with_mergeimports_false(tmp_path): # External vocabulary terms must be excluded assert "issuer" not in ctx, "External slot 'issuer' should be excluded with mergeimports=False" assert "ExternalCredential" not in ctx, "External class should be excluded with mergeimports=False" + + +def test_xsd_anyuri_as_iri_flag(): + """Test that --xsd-anyuri-as-iri maps uri ranges to @type: @id. 
+ + By default, ``range: uri`` (type_uri ``xsd:anyURI``) produces + ``@type: xsd:anyURI`` (typed literal). With ``xsd_anyuri_as_iri=True``, + it produces ``@type: @id`` (IRI node reference), aligning the JSON-LD + context with the SHACL generator which already emits ``sh:nodeKind sh:IRI`` + for the same type. + + See: + - W3C SHACL §4.8.1 sh:nodeKind (https://www.w3.org/TR/shacl/#NodeKindConstraintComponent) + - JSON-LD 1.1 §4.2.2 Type Coercion (https://www.w3.org/TR/json-ld11/#type-coercion) + - RDF 1.1 §3.3 Literals vs §3.2 IRIs (https://www.w3.org/TR/rdf11-concepts/) + """ + schema_yaml = """ +id: https://example.org/test-uri-context +name: test_uri_context + +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + +imports: + - linkml:types + +default_prefix: ex +default_range: string + +slots: + homepage: + range: uri + slot_uri: ex:homepage + node_ref: + range: nodeidentifier + slot_uri: ex:nodeRef + name: + range: string + slot_uri: ex:name + +classes: + Thing: + slots: + - homepage + - node_ref + - name +""" + # Default behaviour: uri → xsd:anyURI (backward compatible) + ctx_default = json.loads(ContextGenerator(schema_yaml).serialize())["@context"] + assert ctx_default["homepage"]["@type"] == "xsd:anyURI" + + # Opt-in: uri → @id (aligned with SHACL sh:nodeKind sh:IRI) + ctx_iri = json.loads(ContextGenerator(schema_yaml, xsd_anyuri_as_iri=True).serialize())["@context"] + assert ctx_iri["homepage"]["@type"] == "@id", ( + f"Expected @type: @id for uri range with xsd_anyuri_as_iri=True, got {ctx_iri['homepage'].get('@type')}" + ) + + # nodeidentifier is unaffected by the flag (not xsd:anyURI-typed) + # Its default @type depends on URI_RANGES matching shex:nonLiteral; + # we only verify the flag doesn't change its behaviour. 
+ assert ctx_default["node_ref"]["@type"] == ctx_iri["node_ref"]["@type"] + + # string → no @type regardless of flag + assert "@type" not in ctx_default.get("name", {}) + assert "@type" not in ctx_iri.get("name", {}) + + +def test_xsd_anyuri_as_iri_with_any_of(): + """The --xsd-anyuri-as-iri flag must also apply to ``any_of`` slots + whose type branches include ``uri`` mixed with class ranges. + + ``_literal_coercion_for_ranges`` resolves mixed any_of type branches + and must use the extended URI_RANGES when the flag is active. + """ + schema_yaml = """ +id: https://example.org/test-anyof-uri +name: test_anyof_uri + +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + +imports: + - linkml:types + +default_prefix: ex +default_range: string + +classes: + Container: + slots: + - mixed_slot + Target: + class_uri: ex:Target + +slots: + mixed_slot: + slot_uri: ex:mixed + any_of: + - range: Target + - range: uri +""" + # Default: mixed class+uri any_of — uri resolves to xsd:anyURI literal, + # which disagrees with @id from the class branch → no coercion emitted + ctx_default = json.loads(ContextGenerator(schema_yaml).serialize())["@context"] + default_type = ctx_default.get("mixed_slot", {}).get("@type") + assert default_type != "@id", f"Without flag, mixed any_of should not resolve to @id, got {default_type}" + + # With flag: uri branch now also resolves to @id, matching the class branch + # → all branches agree → @id is emitted + ctx_iri = json.loads(ContextGenerator(schema_yaml, xsd_anyuri_as_iri=True).serialize())["@context"] + assert ctx_iri["mixed_slot"]["@type"] == "@id", ( + f"Expected @id for mixed any_of with flag, got {ctx_iri.get('mixed_slot', {}).get('@type')}" + ) + + +def test_xsd_anyuri_as_iri_owl(): + """OWL generator must produce owl:ObjectProperty for uri ranges when flag is set. + + Without the flag, ``range: uri`` produces ``owl:DatatypeProperty`` with + ``rdfs:range xsd:anyURI``. 
With ``xsd_anyuri_as_iri=True``, it should + produce ``owl:ObjectProperty`` (no rdfs:range restriction), aligning + with the SHACL generator's ``sh:nodeKind sh:IRI``. + """ + from rdflib import OWL, RDF, URIRef + + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_yaml = """ +id: https://example.org/test-owl-uri +name: test_owl_uri +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ +imports: + - linkml:types +default_prefix: ex +default_range: string +slots: + homepage: + range: uri + slot_uri: ex:homepage + name: + range: string + slot_uri: ex:name +classes: + Thing: + slots: + - homepage + - name +""" + # Default: uri → DatatypeProperty (must disable type_objects which + # unconditionally returns ObjectProperty for all type-ranged slots) + gen_default = OwlSchemaGenerator(schema_yaml, type_objects=False) + g_default = gen_default.as_graph() + homepage_uri = URIRef("https://example.org/homepage") + default_rdf_type = set(g_default.objects(homepage_uri, RDF.type)) + assert OWL.DatatypeProperty in default_rdf_type, ( + f"Without flag, homepage should be DatatypeProperty, got {default_rdf_type}" + ) + + # With flag: uri → ObjectProperty + gen_iri = OwlSchemaGenerator(schema_yaml, xsd_anyuri_as_iri=True, type_objects=False) + g_iri = gen_iri.as_graph() + iri_rdf_type = set(g_iri.objects(homepage_uri, RDF.type)) + assert OWL.ObjectProperty in iri_rdf_type, f"With flag, homepage should be ObjectProperty, got {iri_rdf_type}" + assert OWL.DatatypeProperty not in iri_rdf_type, ( + f"With flag, homepage should NOT be DatatypeProperty, got {iri_rdf_type}" + ) + + # String slot must remain DatatypeProperty regardless of flag + name_uri = URIRef("https://example.org/name") + name_rdf_type = set(g_iri.objects(name_uri, RDF.type)) + assert OWL.DatatypeProperty in name_rdf_type, f"String slot should remain DatatypeProperty, got {name_rdf_type}" + + +def test_xsd_anyuri_as_iri_uriorcurie_range(): + """``uriorcurie`` also maps to 
``xsd:anyURI`` and must behave identically + to ``uri`` when the ``--xsd-anyuri-as-iri`` flag is active. + + This is a high-priority coverage gap: ``uriorcurie`` is distinct from + ``uri`` at the LinkML level but shares the same XSD type. + """ + schema_yaml = """ +id: https://example.org/test-uriorcurie +name: test_uriorcurie + +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + +imports: + - linkml:types + +default_prefix: ex +default_range: string + +slots: + reference: + range: uriorcurie + slot_uri: ex:reference + homepage: + range: uri + slot_uri: ex:homepage + +classes: + Thing: + slots: + - reference + - homepage +""" + ctx_default = json.loads(ContextGenerator(schema_yaml).serialize())["@context"] + assert ctx_default["reference"]["@type"] == "xsd:anyURI" + assert ctx_default["homepage"]["@type"] == "xsd:anyURI" + + ctx_iri = json.loads(ContextGenerator(schema_yaml, xsd_anyuri_as_iri=True).serialize())["@context"] + assert ctx_iri["reference"]["@type"] == "@id", "uriorcurie should map to @id with xsd_anyuri_as_iri=True" + assert ctx_iri["homepage"]["@type"] == "@id", "uri should map to @id with xsd_anyuri_as_iri=True" + + +def test_xsd_anyuri_as_iri_curie_range_unchanged(): + """``curie`` maps to ``xsd:string`` (not ``xsd:anyURI``), so the flag + must NOT affect its coercion. + + This documents the cross-type boundary: ``uri`` and ``uriorcurie`` + share ``xsd:anyURI``, but ``curie`` uses ``xsd:string``. 
+ """ + schema_yaml = """ +id: https://example.org/test-curie +name: test_curie + +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + +imports: + - linkml:types + +default_prefix: ex +default_range: string + +slots: + curie_slot: + range: curie + slot_uri: ex:curieSlot + uri_slot: + range: uri + slot_uri: ex:uriSlot + +classes: + Thing: + slots: + - curie_slot + - uri_slot +""" + ctx_default = json.loads(ContextGenerator(schema_yaml).serialize())["@context"] + ctx_iri = json.loads(ContextGenerator(schema_yaml, xsd_anyuri_as_iri=True).serialize())["@context"] + + # curie (xsd:string) must be unaffected by the flag + curie_default = ctx_default.get("curie_slot", {}).get("@type") + curie_iri = ctx_iri.get("curie_slot", {}).get("@type") + assert curie_default == curie_iri, f"curie coercion should not change with flag: {curie_default} vs {curie_iri}" + + # uri (xsd:anyURI) must change — sanity check + assert ctx_iri["uri_slot"]["@type"] == "@id" + + +def test_xsd_anyuri_as_iri_owl_curie_unchanged(): + """OWL generator must keep ``range: curie`` as DatatypeProperty even with flag. + + ``curie`` maps to ``xsd:string`` (not ``xsd:anyURI``), so the + ``--xsd-anyuri-as-iri`` flag must not promote it to ObjectProperty. + This verifies cross-generator consistency: the JSON-LD context generator + already correctly excludes ``curie`` via ``URI_RANGES_WITH_XSD``; the + OWL generator must match via ``is_xsd_anyuri_range()``. 
+ """ + from rdflib import OWL, RDF, URIRef + + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_yaml = """ +id: https://example.org/test-owl-curie +name: test_owl_curie +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ +imports: + - linkml:types +default_prefix: ex +default_range: string +slots: + compact_id: + range: curie + slot_uri: ex:compactId + homepage: + range: uri + slot_uri: ex:homepage +classes: + Thing: + slots: + - compact_id + - homepage +""" + compact_id_uri = URIRef("https://example.org/compact_id") + homepage_uri = URIRef("https://example.org/homepage") + + # With flag: curie must stay DatatypeProperty, uri must become ObjectProperty + gen = OwlSchemaGenerator(schema_yaml, xsd_anyuri_as_iri=True, type_objects=False) + g = gen.as_graph() + + curie_types = set(g.objects(compact_id_uri, RDF.type)) + assert OWL.DatatypeProperty in curie_types, f"curie slot must remain DatatypeProperty with flag, got {curie_types}" + assert OWL.ObjectProperty not in curie_types, ( + f"curie slot must NOT become ObjectProperty with flag, got {curie_types}" + ) + + # Sanity: uri must become ObjectProperty + uri_types = set(g.objects(homepage_uri, RDF.type)) + assert OWL.ObjectProperty in uri_types, f"uri slot should be ObjectProperty with flag, got {uri_types}" + + +def test_xsd_anyuri_as_iri_cli_flag(): + """Verify the ``--xsd-anyuri-as-iri`` flag is wired through Click.""" + import tempfile + from pathlib import Path + + from click.testing import CliRunner + + from linkml.generators.jsonldcontextgen import cli + + schema_yaml = """ +id: https://example.org/test-cli +name: test_cli + +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + +imports: + - linkml:types + +default_prefix: ex +default_range: string + +slots: + homepage: + range: uri + slot_uri: ex:homepage + +classes: + Thing: + slots: + - homepage +""" + with tempfile.TemporaryDirectory() as tmpdir: + schema_path = Path(tmpdir) / "test.yaml" + 
schema_path.write_text(schema_yaml) + + runner = CliRunner() + + # Without flag + result_default = runner.invoke(cli, [str(schema_path)]) + assert result_default.exit_code == 0, result_default.output + ctx_default = json.loads(result_default.output)["@context"] + assert ctx_default["homepage"]["@type"] == "xsd:anyURI" + + # With flag + result_iri = runner.invoke(cli, [str(schema_path), "--xsd-anyuri-as-iri"]) + assert result_iri.exit_code == 0, result_iri.output + ctx_iri = json.loads(result_iri.output)["@context"] + assert ctx_iri["homepage"]["@type"] == "@id" From c6f7c773eec4a6fa799ea04fed2c2255f343fd96 Mon Sep 17 00:00:00 2001 From: jdsika Date: Thu, 2 Apr 2026 16:54:38 +0200 Subject: [PATCH 4/5] feat(generators): add --deterministic flag with hybrid RDFC-1.0 + rdflib serialization Add a --deterministic / --no-deterministic CLI flag (default off) to OWL, SHACL, JSON-LD Context, and JSON-LD generators that produces byte-identical output across invocations. Three-phase hybrid pipeline for Turtle generators: 1. RDFC-1.0 canonicalization (W3C Recommendation) via pyoxigraph 2. Weisfeiler-Lehman structural hashing for diff-stable blank node IDs 3. Hybrid rdflib re-serialization for idiomatic Turtle (inline blank nodes, collection syntax, prefix filtering) JSON generators use deterministic_json() with recursive deep-sort and JSON-LD-aware key ordering that preserves conventional @context structure. Collection items (owl:oneOf, sh:in, sh:ignoredProperties) are sorted when --deterministic is set to ensure reproducible RDF list order. pyoxigraph >= 0.4.0 is imported lazily only when --deterministic is used. Tests skip gracefully when pyoxigraph is unavailable. 
Refs: linkml#1847 Signed-off-by: Carlo van Driesten Signed-off-by: jdsika --- .../src/linkml/generators/jsonldcontextgen.py | 54 ++ .../linkml/src/linkml/generators/jsonldgen.py | 5 + .../linkml/src/linkml/generators/owlgen.py | 78 ++- .../linkml/src/linkml/generators/shaclgen.py | 21 +- packages/linkml/src/linkml/utils/generator.py | 319 +++++++++++- .../test_deterministic_benchmark.py | 356 +++++++++++++ .../test_deterministic_output.py | 481 ++++++++++++++++++ 7 files changed, 1292 insertions(+), 22 deletions(-) create mode 100644 tests/linkml/test_generators/test_deterministic_benchmark.py create mode 100644 tests/linkml/test_generators/test_deterministic_output.py diff --git a/packages/linkml/src/linkml/generators/jsonldcontextgen.py b/packages/linkml/src/linkml/generators/jsonldcontextgen.py index d18c88fff..d19498195 100644 --- a/packages/linkml/src/linkml/generators/jsonldcontextgen.py +++ b/packages/linkml/src/linkml/generators/jsonldcontextgen.py @@ -236,8 +236,62 @@ def end_schema( with open(frame_path, "w", encoding="UTF-8") as f: json.dump(frame, f, indent=2, ensure_ascii=False) + if self.deterministic: + return self._deterministic_context_json(json.loads(str(as_json(context))), indent=3) + "\n" return str(as_json(context)) + "\n" + @staticmethod + def _deterministic_context_json(data: dict, indent: int = 3) -> str: + """Serialize a JSON-LD context with deterministic key ordering. + + Preserves the conventional JSON-LD context structure: + 1. ``comments`` block first (metadata) + 2. ``@context`` block second, with: + a. ``@``-prefixed directives (``@vocab``, ``@base``) first + b. Prefix declarations (string values) second + c. Class/property term entries (object values) last + 3. Each group sorted alphabetically within itself + + Unlike :func:`deterministic_json`, this understands JSON-LD + conventions so that the output remains human-readable while + still being byte-identical across invocations. 
+ """ + from linkml.utils.generator import deterministic_json + + ordered = {} + + # 1. "comments" first (if present) + if "comments" in data: + ordered["comments"] = data["comments"] + + # 2. "@context" with structured internal ordering + if "@context" in data: + ctx = data["@context"] + ordered_ctx = {} + + # 2a. @-prefixed directives (@vocab, @base, etc.) + for k in sorted(k for k in ctx if k.startswith("@")): + ordered_ctx[k] = ctx[k] + + # 2b. Prefix declarations (string values — short namespace URIs) + for k in sorted(k for k in ctx if not k.startswith("@") and isinstance(ctx[k], str)): + ordered_ctx[k] = ctx[k] + + # 2c. Term definitions (object values) — deep-sorted for determinism + term_entries = {k: v for k, v in ctx.items() if not k.startswith("@") and not isinstance(v, str)} + sorted_terms = json.loads(deterministic_json(term_entries)) + for k in sorted(sorted_terms): + ordered_ctx[k] = sorted_terms[k] + + ordered["@context"] = ordered_ctx + + # 3. Any remaining top-level keys + for k in sorted(data): + if k not in ordered: + ordered[k] = data[k] + + return json.dumps(ordered, indent=indent, ensure_ascii=False) + def visit_class(self, cls: ClassDefinition) -> bool: if self.exclude_imports and cls.name not in self._local_classes: return False diff --git a/packages/linkml/src/linkml/generators/jsonldgen.py b/packages/linkml/src/linkml/generators/jsonldgen.py index c974e762d..0c9c87cbb 100644 --- a/packages/linkml/src/linkml/generators/jsonldgen.py +++ b/packages/linkml/src/linkml/generators/jsonldgen.py @@ -1,5 +1,6 @@ """Generate JSONld from a LinkML schema.""" +import json import os from collections.abc import Sequence from copy import deepcopy @@ -202,6 +203,10 @@ def end_schema(self, context: str | Sequence[str] | None = None, context_kwargs: self.schema["@context"].append({"@base": base_prefix}) # json_obj["@id"] = self.schema.id out = str(as_json(self.schema, indent=" ")) + "\n" + if self.deterministic: + from linkml.utils.generator import 
deterministic_json + + out = deterministic_json(json.loads(out), indent=2) + "\n" self.schema = self.original_schema return out diff --git a/packages/linkml/src/linkml/generators/owlgen.py b/packages/linkml/src/linkml/generators/owlgen.py index 8ae2da270..418871fa3 100644 --- a/packages/linkml/src/linkml/generators/owlgen.py +++ b/packages/linkml/src/linkml/generators/owlgen.py @@ -42,6 +42,7 @@ ) from linkml_runtime.utils.formatutils import camelcase, underscore from linkml_runtime.utils.introspection import package_schemaview +from linkml_runtime.utils.yamlutils import YAMLRoot logger = logging.getLogger(__name__) @@ -51,6 +52,21 @@ SWRLB = rdflib.Namespace("http://www.w3.org/2003/11/swrlb#") +def _expression_sort_key(expr: YAMLRoot) -> str: + """Return a stable sort key for LinkML anonymous expressions. + + Used by ``--deterministic`` to order ``any_of``, ``all_of``, + ``none_of``, and ``exactly_one_of`` members reproducibly. + + This relies on ``YAMLRoot.__repr__()`` which formats objects using + their **field values** (not memory addresses). All anonymous + expression dataclasses in ``linkml_runtime.linkml_model.meta`` + use ``@dataclass(repr=False)`` and inherit this field-based repr, + so the output is deterministic across runs. 
+ """ + return repr(expr) + + @unique class MetadataProfile(Enum): """ @@ -290,7 +306,14 @@ def serialize(self, **kwargs) -> str: :return: """ self.as_graph() - data = self.graph.serialize(format="turtle" if self.format in ["owl", "ttl"] else self.format) + fmt = "turtle" if self.format in ["owl", "ttl"] else self.format + if self.deterministic and fmt == "turtle": + # Deferred to avoid circular import (generator.py imports from this package) + from linkml.utils.generator import deterministic_turtle + + data = deterministic_turtle(self.graph) + else: + data = self.graph.serialize(format=fmt) return data def add_metadata(self, e: Definition | PermissibleValue, uri: URIRef) -> None: @@ -579,9 +602,15 @@ def transform_class_expression( own_slots = self.get_own_slots(cls) owl_exprs = [] if cls.any_of: - owl_exprs.append(self._union_of([self.transform_class_expression(x) for x in cls.any_of])) + members = list(cls.any_of) + if self.deterministic: + members = sorted(members, key=_expression_sort_key) + owl_exprs.append(self._union_of([self.transform_class_expression(x) for x in members])) if cls.exactly_one_of: - sub_exprs = [self.transform_class_expression(x) for x in cls.exactly_one_of] + members = list(cls.exactly_one_of) + if self.deterministic: + members = sorted(members, key=_expression_sort_key) + sub_exprs = [self.transform_class_expression(x) for x in members] if isinstance(cls, ClassDefinition): cls_uri = self._class_uri(cls.name) listnode = BNode() @@ -589,17 +618,23 @@ def transform_class_expression( graph.add((cls_uri, OWL.disjointUnionOf, listnode)) else: sub_sub_exprs = [] - for i, x in enumerate(cls.exactly_one_of): - rest = cls.exactly_one_of[0:i] + cls.exactly_one_of[i + 1 :] + for i, x in enumerate(members): + rest = members[0:i] + members[i + 1 :] neg_expr = self._complement_of_union_of([self.transform_class_expression(nx) for nx in rest]) pos_expr = self._intersection_of([self.transform_class_expression(x), neg_expr]) sub_sub_exprs.append(pos_expr) 
owl_exprs.append(self._union_of(sub_sub_exprs)) # owl_exprs.extend(sub_exprs) if cls.all_of: - owl_exprs.append(self._intersection_of([self.transform_class_expression(x) for x in cls.all_of])) + members = list(cls.all_of) + if self.deterministic: + members = sorted(members, key=_expression_sort_key) + owl_exprs.append(self._intersection_of([self.transform_class_expression(x) for x in members])) if cls.none_of: - owl_exprs.append(self._complement_of_union_of([self.transform_class_expression(x) for x in cls.none_of])) + members = list(cls.none_of) + if self.deterministic: + members = sorted(members, key=_expression_sort_key) + owl_exprs.append(self._complement_of_union_of([self.transform_class_expression(x) for x in members])) for slot in own_slots: if slot.name: owltypes = self.slot_node_owltypes(sv.get_slot(slot.name), owning_class=cls) @@ -752,27 +787,37 @@ def transform_class_slot_expression( owl_exprs.append(self.transform_class_slot_expression(cls, slot.all_members, main_slot, owl_types)) if slot.any_of: + members = list(slot.any_of) + if self.deterministic: + members = sorted(members, key=_expression_sort_key) owl_exprs.append( - self._union_of( - [self.transform_class_slot_expression(cls, x, main_slot, owl_types) for x in slot.any_of] - ) + self._union_of([self.transform_class_slot_expression(cls, x, main_slot, owl_types) for x in members]) ) if slot.all_of: + members = list(slot.all_of) + if self.deterministic: + members = sorted(members, key=_expression_sort_key) owl_exprs.append( self._intersection_of( - [self.transform_class_slot_expression(cls, x, main_slot, owl_types) for x in slot.all_of] + [self.transform_class_slot_expression(cls, x, main_slot, owl_types) for x in members] ) ) if slot.none_of: + members = list(slot.none_of) + if self.deterministic: + members = sorted(members, key=_expression_sort_key) owl_exprs.append( self._complement_of_union_of( - [self.transform_class_slot_expression(cls, x, main_slot, owl_types) for x in slot.none_of] + 
[self.transform_class_slot_expression(cls, x, main_slot, owl_types) for x in members] ) ) if slot.exactly_one_of: + members = list(slot.exactly_one_of) + if self.deterministic: + members = sorted(members, key=_expression_sort_key) disj_exprs = [] - for i, operand in enumerate(slot.exactly_one_of): - rest = slot.exactly_one_of[0:i] + slot.exactly_one_of[i + 1 :] + for i, operand in enumerate(members): + rest = members[0:i] + members[i + 1 :] neg_expr = self._complement_of_union_of( [self.transform_class_slot_expression(cls, x, main_slot, owl_types) for x in rest], owl_types=owl_types, @@ -1046,7 +1091,10 @@ def add_enum(self, e: EnumDefinition) -> None: owl_types = [] enum_owl_type = self._get_metatype(e, self.default_permissible_value_type) - for pv in e.permissible_values.values(): + pvs = e.permissible_values.values() + if self.deterministic: + pvs = sorted(pvs, key=lambda x: x.text) + for pv in pvs: pv_owl_type = self._get_metatype(pv, enum_owl_type) owl_types.append(pv_owl_type) if pv_owl_type == RDFS.Literal: diff --git a/packages/linkml/src/linkml/generators/shaclgen.py b/packages/linkml/src/linkml/generators/shaclgen.py index 5425051e3..ac6afa4cd 100644 --- a/packages/linkml/src/linkml/generators/shaclgen.py +++ b/packages/linkml/src/linkml/generators/shaclgen.py @@ -93,7 +93,13 @@ def generate_header(self) -> str: def serialize(self, **args) -> str: g = self.as_graph() - data = g.serialize(format="turtle" if self.format in ["owl", "ttl"] else self.format) + fmt = "turtle" if self.format in ["owl", "ttl"] else self.format + if self.deterministic and fmt == "turtle": + from linkml.utils.generator import deterministic_turtle + + data = deterministic_turtle(g) + else: + data = g.serialize(format=fmt) return data def as_graph(self) -> Graph: @@ -309,13 +315,13 @@ def _add_enum(self, g: Graph, func: Callable, r: ElementName) -> None: sv = self.schemaview enum = sv.get_enum(r) pv_node = BNode() + pv_items = list(enum.permissible_values.items()) + if 
self.deterministic: + pv_items = sorted(pv_items, key=lambda x: x[0]) Collection( g, pv_node, - [ - URIRef(sv.expand_curie(pv.meaning)) if pv.meaning else Literal(pv_name) - for pv_name, pv in enum.permissible_values.items() - ], + [URIRef(sv.expand_curie(pv.meaning)) if pv.meaning else Literal(pv_name) for pv_name, pv in pv_items], ) func(SH["in"], pv_node) @@ -469,7 +475,10 @@ def collect_child_properties(class_name: str, output: set) -> None: list_node = BNode() ignored_properties.add(RDF.type) - Collection(g, list_node, list(ignored_properties)) + props = list(ignored_properties) + if self.deterministic: + props = sorted(props, key=str) + Collection(g, list_node, props) return list_node diff --git a/packages/linkml/src/linkml/utils/generator.py b/packages/linkml/src/linkml/utils/generator.py index 88fc48585..605d9cec4 100644 --- a/packages/linkml/src/linkml/utils/generator.py +++ b/packages/linkml/src/linkml/utils/generator.py @@ -24,7 +24,7 @@ from dataclasses import dataclass, field from functools import lru_cache from pathlib import Path -from typing import ClassVar, TextIO, Union, cast +from typing import TYPE_CHECKING, ClassVar, TextIO, Union, cast import click from click import Argument, Command, Option @@ -37,6 +37,10 @@ from linkml.utils.schemaloader import SchemaLoader from linkml.utils.typereferences import References from linkml_runtime import SchemaView + +if TYPE_CHECKING: + from rdflib import Graph as RdfGraph + from linkml_runtime.linkml_model.meta import ( ClassDefinition, ClassDefinitionName, @@ -78,6 +82,292 @@ def _resolved_metamodel(mergeimports): return metamodel +def _wl_signatures( + quads: list, + iterations: int = 4, +) -> dict[str, str]: + """Compute Weisfeiler-Lehman structural signatures for blank nodes. + + Uses 1-dimensional WL colour refinement [1]_ to assign each blank + node a deterministic signature derived from its multi-hop + neighbourhood structure. 
The signature depends only on predicate + IRIs, literal values, and named-node IRIs — **not** on blank-node + identifiers — so it remains stable when unrelated triples are added + or removed. + + Parameters + ---------- + quads : list + Canonical quads from pyoxigraph (after RDFC-1.0). + iterations : int + Number of WL refinement rounds (default 4). + + Returns + ------- + dict[str, str] + Mapping from canonical blank-node ID (e.g. ``c14n42``) to a + truncated SHA-256 hash suitable for use as a stable blank-node + label. + + References + ---------- + .. [1] Weisfeiler, B. & Leman, A. (1968). "The reduction of a graph + to canonical form and the algebra which appears therein." + """ + import hashlib + + import pyoxigraph # guaranteed available — caller (deterministic_turtle) checks + + # Collect all blank node IDs and build adjacency index. + bnode_ids: set[str] = set() + # outgoing[b] = list of (predicate_str, object_str_or_bnode_id, is_bnode) + outgoing: dict[str, list[tuple[str, str, bool]]] = {} + # incoming[b] = list of (subject_str_or_bnode_id, predicate_str, is_bnode) + incoming: dict[str, list[tuple[str, str, bool]]] = {} + + for q in quads: + s, p, o = q.subject, q.predicate, q.object + s_is_bn = isinstance(s, pyoxigraph.BlankNode) + o_is_bn = isinstance(o, pyoxigraph.BlankNode) + p_str = str(p) + + if s_is_bn: + bnode_ids.add(s.value) + outgoing.setdefault(s.value, []).append((p_str, o.value if o_is_bn else str(o), o_is_bn)) + if o_is_bn: + bnode_ids.add(o.value) + incoming.setdefault(o.value, []).append((s.value if s_is_bn else str(s), p_str, s_is_bn)) + + # Initialise signatures: named-node edges only (no bnode IDs). 
+ sig: dict[str, str] = {} + for bid in bnode_ids: + parts = [] + for p_str, o_str, o_is_bn in outgoing.get(bid, []): + if not o_is_bn: + parts.append(f"+{p_str}={o_str}") + for s_str, p_str, s_is_bn in incoming.get(bid, []): + if not s_is_bn: + parts.append(f"-{s_str}={p_str}") + sig[bid] = "|".join(sorted(parts)) + + # Iterative refinement: incorporate neighbour signatures. + for _ in range(iterations): + new_sig: dict[str, str] = {} + for bid in bnode_ids: + parts = [sig[bid]] + for p_str, o_str, o_is_bn in outgoing.get(bid, []): + if o_is_bn: + parts.append(f"+{p_str}={sig.get(o_str, '')}") + for s_str, p_str, s_is_bn in incoming.get(bid, []): + if s_is_bn: + parts.append(f"-{sig.get(s_str, '')}={p_str}") + new_sig[bid] = "|".join(sorted(parts)) + sig = new_sig + + # Convert signatures to truncated SHA-256 hashes. + # Use 12 hex chars (48 bits) — birthday-bound collision probability + # is ~n²/2^49: ~0.002% at 100k nodes. Collisions are handled by + # appending a counter (see below), so correctness is preserved. + hash_map: dict[str, str] = {} + seen_hashes: dict[str, int] = {} + for bid in sorted(bnode_ids): + digest = hashlib.sha256(sig[bid].encode("utf-8")).hexdigest()[:12] + # Handle collisions by appending a counter. + count = seen_hashes.get(digest, 0) + seen_hashes[digest] = count + 1 + label = f"b{digest}" if count == 0 else f"b{digest}_{count}" + hash_map[bid] = label + + return hash_map + + +def deterministic_turtle(graph: "RdfGraph") -> str: + """Serialize an RDF graph to Turtle with deterministic output ordering. + + Uses a three-phase hybrid pipeline for **correctness**, **diff + stability**, and **readability**: + + 1. **RDFC-1.0** [1]_ (via ``pyoxigraph``) canonicalizes the graph, + ensuring isomorphic inputs produce identical triple sets. + 2. **Weisfeiler-Lehman structural hashing** replaces the sequential + ``_:c14nN`` identifiers with content-based hashes derived from + each blank node's multi-hop neighbourhood. 
These hashes depend + only on predicate IRIs, literal values, and named-node IRIs — + not on blank-node numbering — so adding or removing a triple + only affects the identifiers of directly involved blank nodes. + 3. **Hybrid rdflib re-serialization** parses the canonicalized, + WL-hashed triples back into an rdflib ``Graph`` and serializes + with rdflib's native Turtle writer. This recovers idiomatic + Turtle features that pyoxigraph cannot emit: + + - **Inline blank nodes** (``[ … ]``) for singly-referenced + blank nodes (Turtle §2.7 [2]_), instead of verbose named + ``_:bHASH`` syntax. + - **Collection syntax** (``( … )``) for ``rdf:List`` chains + (Turtle §2.8 [2]_). + - **Prefix filtering**: only prefixes actually used in the + graph's IRIs are declared, following the practice of Apache + Jena, Eclipse RDF4J, and Raptor. + + All triples from the source graph are preserved — the hybrid step + only changes syntactic form, never semantic content. + + Parameters + ---------- + graph : rdflib.Graph + An rdflib Graph to serialize. + + Returns + ------- + str + Deterministic Turtle string with ``@prefix`` declarations. + + References + ---------- + .. [1] W3C (2024). "RDF Dataset Canonicalization (RDFC-1.0)." + W3C Recommendation. https://www.w3.org/TR/rdf-canon/ + .. [2] W3C (2014). "RDF 1.1 Turtle — Terse RDF Triple Language." + W3C Recommendation. https://www.w3.org/TR/turtle/ + """ + try: + import pyoxigraph + except ImportError as exc: + raise ImportError( + "pyoxigraph >= 0.4.0 is required for --deterministic output. 
" + "Install it with: pip install 'pyoxigraph>=0.4.0'" + ) from exc + + from rdflib import BNode, Graph, Literal, URIRef + + # ── Phase 1: RDFC-1.0 canonicalization ────────────────────────── + nt_data = graph.serialize(format="nt") + + dataset = pyoxigraph.Dataset(pyoxigraph.parse(nt_data, format=pyoxigraph.RdfFormat.N_TRIPLES)) + dataset.canonicalize(pyoxigraph.CanonicalizationAlgorithm.RDFC_1_0) + + canonical_quads = list(dataset) + + # ── Phase 2: WL structural hashing for diff-stable blank node IDs + wl_map = _wl_signatures(canonical_quads) + + def _remap(term): + if isinstance(term, pyoxigraph.BlankNode) and term.value in wl_map: + return pyoxigraph.BlankNode(wl_map[term.value]) + return term + + remapped = [pyoxigraph.Triple(_remap(q.subject), q.predicate, _remap(q.object)) for q in canonical_quads] + + # ── Phase 3: Hybrid rdflib re-serialization ───────────────────── + # Convert pyoxigraph terms to rdflib terms and populate a clean + # Graph that only carries explicitly-bound prefixes. + def _to_rdflib(term): + """Convert a pyoxigraph term to the equivalent rdflib term.""" + if isinstance(term, pyoxigraph.NamedNode): + return URIRef(term.value) + if isinstance(term, pyoxigraph.BlankNode): + return BNode(term.value) + if isinstance(term, pyoxigraph.Literal): + if term.language: + return Literal(term.value, lang=term.language) + if term.datatype: + dt_iri = term.datatype.value + # In RDF 1.1, simple literals are syntactic sugar for + # xsd:string (Turtle §2.5.1). Preserve the shorter form + # to match the original owlgen output and avoid spurious + # diffs on every string literal. 
+ if dt_iri == "http://www.w3.org/2001/XMLSchema#string": + return Literal(term.value) + return Literal(term.value, datatype=URIRef(dt_iri)) + return Literal(term.value) + raise TypeError(f"Unexpected pyoxigraph term type: {type(term).__name__}: {term}") + + result_graph = Graph(bind_namespaces="none") + for triple in remapped: + result_graph.add( + ( + _to_rdflib(triple.subject), + _to_rdflib(triple.predicate), + _to_rdflib(triple.object), + ) + ) + + # Bind only prefixes whose namespace IRI is actually referenced + # by at least one subject, predicate, or object in the graph. + # This filters out rdflib's ~27 built-in default bindings + # (brick, csvw, doap, …) that leak through Graph() even when + # the schema never declared them. + used_iris: set[str] = set() + for s, p, o in result_graph: + for term in (s, p, o): + if isinstance(term, URIRef): + used_iris.add(str(term)) + + for pfx, ns in sorted(graph.namespaces()): + pfx_s, ns_s = str(pfx), str(ns) + if pfx_s and any(iri.startswith(ns_s) for iri in used_iris): + result_graph.bind(pfx_s, ns_s) + + return result_graph.serialize(format="turtle") + + +def deterministic_json(obj: object, indent: int = 3, preserve_list_order_keys: frozenset[str] | None = None) -> str: + """Serialize a JSON-compatible object with deterministic ordering. + + Recursively sorts all dict keys *and* list elements to produce + stable output across Python versions and process invocations. + + List elements are sorted by their canonical JSON representation + (``json.dumps(item, sort_keys=True)``), which handles lists of + dicts, strings, and mixed types. + + :param obj: A JSON-serializable object (typically parsed from ``as_json``). + :param indent: Number of spaces for indentation. + :param preserve_list_order_keys: Dict keys whose list values must NOT be + sorted (e.g. ``@context``, ``@list`` in JSON-LD where array order is + semantic). Defaults to ``_JSONLD_ORDERED_KEYS``. + :returns: Deterministic JSON string. 
+ """ + import json + + skip = preserve_list_order_keys if preserve_list_order_keys is not None else _JSONLD_ORDERED_KEYS + + def _deep_sort(value: object, parent_key: str = "") -> object: + if isinstance(value, dict): + return {k: _deep_sort(v, parent_key=k) for k, v in sorted(value.items())} + if isinstance(value, list): + sorted_items = [_deep_sort(item) for item in value] + if parent_key in skip: + return sorted_items + try: + return sorted(sorted_items, key=lambda x: json.dumps(x, sort_keys=True, ensure_ascii=False)) + except TypeError: + return sorted_items + return value + + return json.dumps(_deep_sort(obj), indent=indent, ensure_ascii=False) + + +# JSON-LD keys whose array values carry ordering semantics and must not +# be sorted. @context arrays define an override cascade (JSON-LD 1.1 +# §4.1); @list containers are explicitly ordered; @graph and @set are +# included defensively. +_JSONLD_ORDERED_KEYS: frozenset[str] = frozenset({"@context", "@list", "@graph", "@set", "imports"}) + + +def well_known_prefix_map() -> dict[str, str]: + """Return a mapping from namespace URI to standard prefix name. + + Uses rdflib's curated default namespace bindings as the source of truth. + For example, ``https://schema.org/`` maps to ``schema``. + + This allows generators to normalise non-standard prefix aliases + (e.g. ``sdo`` for ``https://schema.org/``) to their conventional names. + """ + from rdflib import Graph as RdfGraph + + return {str(ns): str(pfx) for pfx, ns in RdfGraph().namespaces() if str(pfx)} + + @dataclass class Generator(metaclass=abc.ABCMeta): """ @@ -139,6 +429,9 @@ class Generator(metaclass=abc.ABCMeta): mergeimports: bool | None = True """True means merge non-linkml sources into importing package. 
False means separate packages""" + deterministic: bool = False + """True means produce stable, reproducible output with sorted keys and canonical blank-node ordering""" + source_file_date: str | None = None """Modification date of input source file""" @@ -180,6 +473,10 @@ class Generator(metaclass=abc.ABCMeta): stacktrace: bool = False """True means print stack trace, false just error message""" + normalize_prefixes: bool = False + """True means normalise non-standard prefix aliases to rdflib's curated default names + (e.g. ``sdo`` → ``schema`` for ``https://schema.org/``).""" + include: str | Path | SchemaDefinition | None = None """If set, include extra schema outside of the imports mechanism""" @@ -986,6 +1283,26 @@ def decorator(f: Command) -> Command: callback=stacktrace_callback, ) ) + f.params.append( + Option( + ("--deterministic/--no-deterministic",), + default=False, + show_default=True, + help="Generate stable, reproducible output with sorted keys and canonical blank-node ordering. " + "Supported by OWL, SHACL, JSON-LD, and JSON-LD Context generators. " + "Useful when generated artifacts are stored in version control.", + ) + ) + f.params.append( + Option( + ("--normalize-prefixes/--no-normalize-prefixes",), + default=False, + show_default=True, + help="Normalise non-standard prefix aliases to rdflib's curated default names " + "(e.g. sdo → schema for https://schema.org/). " + "Supported by OWL, SHACL, and JSON-LD Context generators.", + ) + ) return f diff --git a/tests/linkml/test_generators/test_deterministic_benchmark.py b/tests/linkml/test_generators/test_deterministic_benchmark.py new file mode 100644 index 000000000..b7488a8dd --- /dev/null +++ b/tests/linkml/test_generators/test_deterministic_benchmark.py @@ -0,0 +1,356 @@ +"""Benchmark: deterministic Turtle serializer on real-world ontologies. 
"""Benchmark: deterministic Turtle serializer on real-world ontologies.

Evaluates the ``--deterministic`` flag against schema.org (~16 000 triples,
~800 classes, ~1 400 properties) and the kitchen_sink LinkML schema to
demonstrate four properties:

1. **Semantic equivalence** — ``rdflib.compare.isomorphic()`` confirms that
   deterministic and non-deterministic outputs encode the same RDF graph.
2. **Byte-level stability** — SHA-256 identity across repeated runs proves
   that deterministic output is truly reproducible.
3. **Diff quality** — controlled mutations show that small schema changes
   produce small, focused diffs (high signal-to-noise ratio).
4. **Performance** — generation time stays within acceptable bounds even
   on large real-world graphs.

Schema.org tests exercise ``deterministic_turtle()`` directly on a
pre-existing OWL ontology. Kitchen_sink tests exercise the full
``OwlSchemaGenerator`` / ``ShaclGenerator`` pipeline with LinkML schemas.

References
----------
- W3C RDFC-1.0: https://www.w3.org/TR/rdf-canon/
- W3C Turtle 1.1: https://www.w3.org/TR/turtle/
- schema.org: https://schema.org/docs/developers.html
"""

import difflib
import hashlib
import time
from pathlib import Path

import pytest
import yaml
from rdflib import Graph
from rdflib.compare import isomorphic

from linkml.generators.owlgen import OwlSchemaGenerator
from linkml.generators.shaclgen import ShaclGenerator
from linkml.utils.generator import deterministic_turtle

# Deterministic Turtle requires pyoxigraph >= 0.4.0 (Dataset/canonicalize).
_has_pyoxigraph = False
try:
    import pyoxigraph

    _has_pyoxigraph = hasattr(pyoxigraph, "Dataset")
except ImportError:
    pass

pytestmark = pytest.mark.skipif(
    not _has_pyoxigraph,
    reason="pyoxigraph >= 0.4.0 required for deterministic benchmarks",
)

KITCHEN_SINK = str(Path(__file__).parent / "input" / "kitchen_sink.yaml")
SCHEMA_ORG_URL = "https://schema.org/version/latest/schemaorg-current-https.ttl"


def _sha256(text: str) -> str:
    """Hex SHA-256 of *text* — used to compare outputs byte-for-byte."""
    return hashlib.sha256(text.encode()).hexdigest()


def _diff_line_count(a: str, b: str) -> int:
    """Count lines present in *b* but not in *a* (unified-diff additions)."""
    al = a.strip().splitlines()
    bl = b.strip().splitlines()
    return sum(
        1 for line in difflib.unified_diff(al, bl, lineterm="") if line.startswith("+") and not line.startswith("+++")
    )


# ── Schema.org: direct serializer benchmark ────────────────────────


@pytest.fixture(scope="module")
def schema_org_graph():
    """Download and parse schema.org as an rdflib Graph.

    Cached for the module so the network fetch only happens once.
    Skips all dependent tests if the download fails.
    """
    try:
        import urllib.request

        with urllib.request.urlopen(SCHEMA_ORG_URL, timeout=60) as resp:
            data = resp.read().decode("utf-8")
    except Exception as exc:
        pytest.skip(f"Could not fetch schema.org: {exc}")

    g = Graph()
    g.parse(data=data, format="turtle")
    return g


@pytest.mark.network
class TestSchemaOrgDeterministicSerializer:
    """Benchmark ``deterministic_turtle()`` on schema.org OWL ontology."""

    def test_semantic_equivalence(self, schema_org_graph):
        """Deterministic serialization must be isomorphic to the original graph."""
        det_ttl = deterministic_turtle(schema_org_graph)

        g_det = Graph()
        g_det.parse(data=det_ttl, format="turtle")

        assert len(g_det) == len(schema_org_graph), (
            f"Triple count mismatch: original={len(schema_org_graph)}, deterministic={len(g_det)}"
        )
        assert isomorphic(g_det, schema_org_graph), (
            "Deterministic output is NOT isomorphic to original schema.org graph"
        )

    def test_byte_stability(self, schema_org_graph):
        """Two deterministic runs must produce byte-identical output."""
        run1 = deterministic_turtle(schema_org_graph)
        run2 = deterministic_turtle(schema_org_graph)
        assert _sha256(run1) == _sha256(run2), "Deterministic serializer produced different output across runs"

    def test_prefix_filtering(self, schema_org_graph):
        """Only prefixes actually used in the graph should be declared."""
        det_ttl = deterministic_turtle(schema_org_graph)

        # Extract declared prefixes
        declared = {}
        for line in det_ttl.splitlines():
            if line.startswith("@prefix"):
                parts = line.split()
                pfx = parts[1].rstrip(":")
                ns = parts[2].strip("<>")
                declared[pfx] = ns

        # Collect all IRIs in the graph
        from rdflib import URIRef

        used_iris = set()
        for s, p, o in schema_org_graph:
            for term in (s, p, o):
                if isinstance(term, URIRef):
                    used_iris.add(str(term))

        # Every declared prefix must have at least one IRI using it
        for pfx, ns in declared.items():
            assert any(iri.startswith(ns) for iri in used_iris), f"Prefix '{pfx}:' <{ns}> declared but no IRI uses it"

    def test_performance(self, schema_org_graph):
        """Serialization must complete within 60 seconds for ~16K triples."""
        start = time.time()
        det_ttl = deterministic_turtle(schema_org_graph)
        elapsed = time.time() - start
        triple_count = len(schema_org_graph)
        throughput = triple_count / elapsed if elapsed > 0 else float("inf")

        # Log for benchmark visibility (shows with pytest -v)
        print(f"\n schema.org: {triple_count} triples in {elapsed:.1f}s ({throughput:.0f} triples/s)")

        assert elapsed < 60.0, f"Serialization took {elapsed:.1f}s (limit: 60s) for {triple_count} triples"
        assert len(det_ttl) > 1000, "Output suspiciously short"


# ── Kitchen_sink: full pipeline benchmark ───────────────────────────


def _mutate_kitchen_sink(description_suffix: str = "", add_slot: bool = False) -> str:
    """Create a mutated copy of kitchen_sink.yaml **in the same directory** and return its path.

    The copy must live alongside the original so that LinkML relative imports
    (``linkml:types``, ``core``, etc.) resolve correctly.

    Uses a unique filename (via ``os.getpid()``) to avoid race conditions
    when tests run in parallel under pytest-xdist.

    Parameters
    ----------
    description_suffix
        Text appended to the first class description found.
    add_slot
        If True, adds a synthetic ``benchmark_notes`` slot to the first class.
    """
    import os

    ks_path = Path(KITCHEN_SINK)
    schema = yaml.safe_load(ks_path.read_text())

    if description_suffix or add_slot:
        # Find the first class with a description
        for cls_name, cls_def in schema.get("classes", {}).items():
            if isinstance(cls_def, dict) and cls_def.get("description"):
                if description_suffix:
                    cls_def["description"] += description_suffix
                if add_slot:
                    slots = cls_def.get("slots", [])
                    slots.append("benchmark_notes")
                    cls_def["slots"] = slots
                break

    # Define the synthetic slot if adding one
    if add_slot:
        slots_dict = schema.setdefault("slots", {})
        slots_dict["benchmark_notes"] = {
            "description": "Synthetic benchmark slot for diff quality testing.",
            "range": "string",
        }

    # Write in the same directory so relative imports resolve.
    # Use PID to avoid race conditions with pytest-xdist workers.
    out_path = ks_path.parent / f"_benchmark_mutated_{os.getpid()}_kitchen_sink.yaml"
    out_path.write_text(
        yaml.dump(schema, default_flow_style=False, allow_unicode=True),
        encoding="utf-8",
    )
    return str(out_path)


@pytest.mark.parametrize(
    "generator_cls",
    [OwlSchemaGenerator, ShaclGenerator],
    ids=["owl", "shacl"],
)
class TestKitchenSinkDiffQuality:
    """Measure diff quality on the kitchen_sink schema with controlled mutations."""

    def test_mutation_description_change(self, generator_cls):
        """A single description change must produce a small, focused diff.

        Deterministic mode should change only the affected line(s) and their
        immediate context (e.g. SHACL may repeat descriptions in sh:description).
        Non-deterministic mode produces a much larger diff due to blank-node
        and property-ordering instability.
        """
        base = generator_cls(KITCHEN_SINK, deterministic=True).serialize()
        mutated_path = _mutate_kitchen_sink(description_suffix=" (benchmark edit)")
        try:
            mutated = generator_cls(mutated_path, deterministic=True).serialize()
        finally:
            Path(mutated_path).unlink(missing_ok=True)

        det_diff = _diff_line_count(base, mutated)

        # Non-deterministic baseline for comparison
        non_base = generator_cls(KITCHEN_SINK, deterministic=False).serialize()
        non_mutated_path = _mutate_kitchen_sink(description_suffix=" (benchmark edit)")
        try:
            non_mutated = generator_cls(non_mutated_path, deterministic=False).serialize()
        finally:
            Path(non_mutated_path).unlink(missing_ok=True)

        non_diff = _diff_line_count(non_base, non_mutated)

        # The deterministic diff must be small (description + any SHACL mirrors)
        assert det_diff <= 20, (
            f"Deterministic diff too large for a 1-description change: {det_diff} lines (expected ≤20)"
        )
        # Signal-to-noise: deterministic must be at least 5× smaller
        if non_diff > 0:
            ratio = non_diff / max(det_diff, 1)
            assert ratio >= 5, (
                f"Insufficient noise reduction: det={det_diff}, non-det={non_diff}, ratio={ratio:.1f}× (expected ≥5×)"
            )

        print(
            f"\n {generator_cls.__name__} description mutation: "
            f"det={det_diff} lines, non-det={non_diff} lines, "
            f"noise reduction={non_diff / max(det_diff, 1):.0f}×"
        )

    def test_mutation_add_slot(self, generator_cls):
        """Adding a new slot must produce a proportionally small diff.

        A new slot adds ~10-20 triples (label, range, domain, restrictions).
        The diff should be roughly proportional to the new content, not a
        full-file rewrite.
        """
        base = generator_cls(KITCHEN_SINK, deterministic=True).serialize()
        mutated_path = _mutate_kitchen_sink(add_slot=True)
        try:
            mutated = generator_cls(mutated_path, deterministic=True).serialize()
        finally:
            Path(mutated_path).unlink(missing_ok=True)

        det_diff = _diff_line_count(base, mutated)

        # Non-deterministic baseline for comparison
        non_base = generator_cls(KITCHEN_SINK, deterministic=False).serialize()
        non_mutated_path = _mutate_kitchen_sink(add_slot=True)
        try:
            non_mutated = generator_cls(non_mutated_path, deterministic=False).serialize()
        finally:
            Path(non_mutated_path).unlink(missing_ok=True)

        non_diff = _diff_line_count(non_base, non_mutated)

        g_base = Graph()
        g_base.parse(data=base, format="turtle")
        g_mut = Graph()
        g_mut.parse(data=mutated, format="turtle")
        new_triples = len(g_mut) - len(g_base)

        # Diff should be proportional to new triples (allow 5× margin)
        assert det_diff <= max(new_triples * 5, 40), (
            f"Deterministic diff ({det_diff} lines) disproportionate to new triples ({new_triples})"
        )
        # Signal-to-noise: deterministic must be at least 5× smaller
        if non_diff > 0:
            ratio = non_diff / max(det_diff, 1)
            assert ratio >= 5, (
                f"Insufficient noise reduction: det={det_diff}, non-det={non_diff}, ratio={ratio:.1f}× (expected ≥5×)"
            )

        print(
            f"\n {generator_cls.__name__} add-slot mutation: "
            f"det_diff={det_diff} lines, non-det={non_diff} lines, "
            f"new_triples={new_triples}, noise reduction={non_diff / max(det_diff, 1):.0f}×"
        )


@pytest.mark.parametrize(
    "generator_cls",
    [OwlSchemaGenerator, ShaclGenerator],
    ids=["owl", "shacl"],
)
class TestKitchenSinkEquivalence:
    """Verify semantic equivalence between deterministic and non-deterministic modes."""

    def test_triple_count_matches(self, generator_cls):
        """Both modes must produce the same number of triples."""
        det = generator_cls(KITCHEN_SINK, deterministic=True).serialize()
        nondet = generator_cls(KITCHEN_SINK, deterministic=False).serialize()

        g_det = Graph()
        g_det.parse(data=det, format="turtle")
        g_nondet = Graph()
        g_nondet.parse(data=nondet, format="turtle")

        assert len(g_det) == len(g_nondet), (
            f"Triple count mismatch: deterministic={len(g_det)}, non-deterministic={len(g_nondet)}"
        )

    def test_byte_stability_across_runs(self, generator_cls):
        """Three deterministic runs must produce identical output."""
        runs = [generator_cls(KITCHEN_SINK, deterministic=True).serialize() for _ in range(3)]
        hashes = [_sha256(r) for r in runs]
        assert hashes[0] == hashes[1] == hashes[2], f"Deterministic output varies across runs: {hashes}"

    def test_non_deterministic_instability(self, generator_cls):
        """Non-deterministic output should vary across runs (documents the problem).

        This test is advisory — it passes regardless but logs the instability.
        """
        runs = [generator_cls(KITCHEN_SINK, deterministic=False).serialize() for _ in range(3)]
        hashes = [_sha256(r) for r in runs]
        identical = hashes[0] == hashes[1] == hashes[2]
        print(
            f"\n {generator_cls.__name__} non-det stable: {identical} "
            f"(expected: False for Turtle due to bnode/ordering instability)"
        )
"""Tests for deterministic generator output.

When ``deterministic=True``, generators must produce byte-identical output
across multiple invocations. This ensures version-controlled artifacts don't
show spurious diffs from blank-node relabeling or dict-ordering instability.

Generators must also produce **isomorphic** output — the deterministic
serialization must encode the same RDF graph as non-deterministic mode.
"""

import json
import time
from pathlib import Path

import pytest
from rdflib import Graph
from rdflib.compare import isomorphic

from linkml.generators.jsonldcontextgen import ContextGenerator
from linkml.generators.jsonldgen import JSONLDGenerator
from linkml.generators.owlgen import OwlSchemaGenerator
from linkml.generators.shaclgen import ShaclGenerator

# Deterministic Turtle requires pyoxigraph >= 0.4.0 (for Dataset/canonicalize).
# When an older version is present (e.g. pulled in by morph-kgc), skip these tests.
_has_pyoxigraph = False
try:
    import pyoxigraph

    _has_pyoxigraph = hasattr(pyoxigraph, "Dataset")
except ImportError:
    pass

pytestmark = pytest.mark.skipif(not _has_pyoxigraph, reason="pyoxigraph >= 0.4.0 required for deterministic tests")

SCHEMA = str(Path(__file__).parent / "input" / "personinfo.yaml")


@pytest.mark.parametrize(
    "generator_cls,kwargs",
    [
        (OwlSchemaGenerator, {}),
        (ShaclGenerator, {}),
        (ContextGenerator, {}),
        (JSONLDGenerator, {}),
    ],
    ids=["owl", "shacl", "context", "jsonld"],
)
def test_deterministic_output_is_identical_across_runs(generator_cls, kwargs):
    """Generate output twice with deterministic=True and verify identity."""
    first = generator_cls(SCHEMA, deterministic=True, **kwargs).serialize()
    second = generator_cls(SCHEMA, deterministic=True, **kwargs).serialize()
    # JSONLDGenerator embeds a generation_date timestamp — normalize it
    if generator_cls is JSONLDGenerator:
        import re

        ts_re = re.compile(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}")
        first = ts_re.sub("TIMESTAMP", first)
        second = ts_re.sub("TIMESTAMP", second)
    assert first == second, f"{generator_cls.__name__} produced different output across runs"
    assert len(first) > 100, "Output suspiciously short — generator may have failed silently"


@pytest.mark.parametrize(
    "generator_cls",
    [ContextGenerator, JSONLDGenerator],
    ids=["context", "jsonld"],
)
def test_deterministic_json_has_sorted_keys(generator_cls):
    """When deterministic=True, JSON dict keys should be sorted at all levels.

    For the ContextGenerator, @context keys use grouped ordering (prefixes
    before term entries) — each group is sorted, but not globally.
    """
    serialized = generator_cls(SCHEMA, deterministic=True).serialize()
    parsed = json.loads(serialized)

    is_context_gen = generator_cls is ContextGenerator

    def _check_sorted_keys(node, path="root"):
        if isinstance(node, dict):
            keys = list(node.keys())
            # Context generator groups @context keys: @-directives, prefixes, terms
            if is_context_gen and path == "root.@context":
                at_keys = [k for k in keys if k.startswith("@")]
                prefix_keys = [k for k in keys if not k.startswith("@") and isinstance(node[k], str)]
                term_keys = [k for k in keys if not k.startswith("@") and not isinstance(node[k], str)]
                assert at_keys == sorted(at_keys), f"@-keys not sorted: {at_keys}"
                assert prefix_keys == sorted(prefix_keys), f"Prefix keys not sorted: {prefix_keys}"
                assert term_keys == sorted(term_keys), f"Term keys not sorted: {term_keys}"
            else:
                assert keys == sorted(keys), f"Keys not sorted at {path}: {keys}"
            for key, child in node.items():
                _check_sorted_keys(child, f"{path}.{key}")
        elif isinstance(node, list):
            for idx, child in enumerate(node):
                _check_sorted_keys(child, f"{path}[{idx}]")

    _check_sorted_keys(parsed)


@pytest.mark.parametrize(
    "generator_cls",
    [ContextGenerator, JSONLDGenerator],
    ids=["context", "jsonld"],
)
def test_deterministic_json_lists_are_sorted(generator_cls):
    """When deterministic=True, JSON list elements should be sorted.

    Lists under JSON-LD structural keys (``@context``, ``@list``, ``imports``,
    etc.) are exempt because their ordering carries semantic meaning.
    """
    serialized = generator_cls(SCHEMA, deterministic=True).serialize()
    parsed = json.loads(serialized)

    # JSON-LD keys whose array values carry ordering semantics.
    _ORDERED_KEYS = {"@context", "@list", "@graph", "@set", "imports"}

    def _check_sorted_lists(node, path="root", parent_key=""):
        if isinstance(node, dict):
            for key, child in node.items():
                _check_sorted_lists(child, f"{path}.{key}", parent_key=key)
        elif isinstance(node, list):
            if parent_key not in _ORDERED_KEYS:
                str_items = [json.dumps(item, sort_keys=True, ensure_ascii=False) for item in node]
                assert str_items == sorted(str_items), f"List not sorted at {path}"
            for idx, child in enumerate(node):
                _check_sorted_lists(child, f"{path}[{idx}]")

    _check_sorted_lists(parsed)


@pytest.mark.parametrize(
    "generator_cls",
    [OwlSchemaGenerator, ShaclGenerator],
    ids=["owl", "shacl"],
)
def test_deterministic_turtle_preserves_at_prefix(generator_cls):
    """deterministic_turtle must produce standard @prefix, not SPARQL PREFIX."""
    serialized = generator_cls(SCHEMA, deterministic=True).serialize()
    assert "@prefix" in serialized, "Output uses non-standard prefix syntax"
    assert "PREFIX " not in serialized, "Output uses SPARQL PREFIX instead of Turtle @prefix"


def test_deterministic_turtle_performance():
    """Deterministic OWL generation must complete within 10 seconds for personinfo.

    The Weisfeiler-Lehman approach is O(n log n), so this should easily pass.
    The previous canon=True approach was exponential and failed this test
    for graphs above ~250 triples.
    """
    started = time.time()
    serialized = OwlSchemaGenerator(SCHEMA, deterministic=True).serialize()
    duration = time.time() - started
    assert duration < 10.0, f"Deterministic generation took {duration:.1f}s (limit: 10s)"
    assert len(serialized) > 100, "Output suspiciously short"


def test_shacl_closed_ignored_properties_deterministic():
    """sh:ignoredProperties in closed shapes must be deterministic.

    ``_build_ignored_properties`` collects inherited slots into a set; without
    explicit sorting this produces different ``rdf:first``/``rdf:rest`` chains
    on each run. With ``deterministic=True`` (and sorted Collection inputs)
    the output must be byte-identical.
    """
    outputs = [ShaclGenerator(SCHEMA, deterministic=True, closed=True).serialize() for _ in range(3)]
    assert outputs[0] == outputs[1] == outputs[2], "sh:ignoredProperties ordering differs across runs"
    assert "sh:ignoredProperties" in outputs[0], "Expected closed shapes with sh:ignoredProperties"


def test_shacl_enum_in_deterministic():
    """sh:in RDF lists for enums must be deterministic.

    ``_build_enum_constraint`` iterates ``enum.permissible_values.items()``
    (dict iteration order) into a ``Collection``. Without sorting, the
    ``rdf:first``/``rdf:rest`` chain varies across runs.
    """
    outputs = [ShaclGenerator(SCHEMA, deterministic=True).serialize() for _ in range(3)]
    assert outputs[0] == outputs[1] == outputs[2], "sh:in enum list ordering differs across runs"
    assert "sh:in" in outputs[0], "Expected sh:in constraints for enums"


def test_owl_enum_one_of_deterministic():
    """owl:oneOf RDF lists for enums must be deterministic.

    ``_boolean_expression`` feeds ``pv_uris`` (from ``permissible_values``)
    into a ``Collection``. Without sorting, ``owl:oneOf`` list ordering varies.
    """
    outputs = [OwlSchemaGenerator(SCHEMA, deterministic=True).serialize() for _ in range(3)]
    assert outputs[0] == outputs[1] == outputs[2], "owl:oneOf enum list ordering differs across runs"


KITCHEN_SINK = str(Path(__file__).parent / "input" / "kitchen_sink.yaml")


def test_deterministic_large_schema():
    """End-to-end idempotency on a complex schema (kitchen_sink).

    Exercises many code paths simultaneously: closed shapes, enums, imports,
    class hierarchies, and mixed ranges.
    """
    owl1 = OwlSchemaGenerator(KITCHEN_SINK, deterministic=True).serialize()
    owl2 = OwlSchemaGenerator(KITCHEN_SINK, deterministic=True).serialize()
    assert owl1 == owl2, "OWL output differs across runs for kitchen_sink"
    assert len(owl1) > 500, "kitchen_sink output suspiciously short"

    shacl1 = ShaclGenerator(KITCHEN_SINK, deterministic=True).serialize()
    shacl2 = ShaclGenerator(KITCHEN_SINK, deterministic=True).serialize()
    assert shacl1 == shacl2, "SHACL output differs across runs for kitchen_sink"
    assert len(shacl1) > 500, "kitchen_sink output suspiciously short"


def test_deterministic_context_preserves_jsonld_structure():
    """Deterministic JSON-LD context must preserve conventional structure.

    JSON-LD contexts have a conventional layout:
    1. ``comments`` block first (metadata)
    2. ``@context`` block second, with prefixes grouped before term entries

    ``deterministic_json()`` would scramble this by sorting all keys
    uniformly. The context generator must use JSON-LD-aware ordering.
    """
    serialized = ContextGenerator(SCHEMA, deterministic=True, metadata=True).serialize()
    parsed = json.loads(serialized)

    # Top-level key order: "comments" before "@context"
    top_keys = list(parsed.keys())
    assert "comments" in top_keys, "Expected 'comments' block with metadata=True"
    assert top_keys.index("comments") < top_keys.index("@context"), (
        f"'comments' should precede '@context', got: {top_keys}"
    )

    # Inside @context: @-directives, then prefixes (str values), then terms (dict values)
    ctx = parsed["@context"]
    ctx_keys = list(ctx.keys())

    at_keys = [k for k in ctx_keys if k.startswith("@")]
    prefix_keys = [k for k in ctx_keys if not k.startswith("@") and isinstance(ctx[k], str)]
    term_keys = [k for k in ctx_keys if not k.startswith("@") and not isinstance(ctx[k], str)]

    # Verify grouping: all @-keys before all prefix keys before all term keys
    last_at = max(ctx_keys.index(k) for k in at_keys) if at_keys else -1
    first_prefix = min(ctx_keys.index(k) for k in prefix_keys) if prefix_keys else len(ctx_keys)
    last_prefix = max(ctx_keys.index(k) for k in prefix_keys) if prefix_keys else -1
    first_term = min(ctx_keys.index(k) for k in term_keys) if term_keys else len(ctx_keys)

    assert last_at < first_prefix, "@-directives must come before prefixes"
    assert last_prefix < first_term, "Prefixes must come before term entries"

    # Verify each group is sorted internally
    assert at_keys == sorted(at_keys), f"@-directives not sorted: {at_keys}"
    assert prefix_keys == sorted(prefix_keys), f"Prefixes not sorted: {prefix_keys}"
    assert term_keys == sorted(term_keys), f"Term entries not sorted: {term_keys}"


def test_non_deterministic_is_default():
    """Verify that ``deterministic`` defaults to False."""
    generator = OwlSchemaGenerator(SCHEMA)
    assert generator.deterministic is False


def test_wl_handles_structurally_similar_bnodes():
    """Blank nodes with identical local structure but different named neighbours
    must receive different WL signatures and thus different stable labels.

    This tests the core WL property: two BNodes that differ only in their
    connected named nodes (URIs/literals) must be distinguishable.
    """
    from rdflib import BNode, Graph, Namespace, URIRef

    from linkml.utils.generator import deterministic_turtle

    RDF_TYPE = URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")
    OWL_RESTRICTION = URIRef("http://www.w3.org/2002/07/owl#Restriction")
    OWL_ON_PROP = URIRef("http://www.w3.org/2002/07/owl#onProperty")
    OWL_ALL_VALUES = URIRef("http://www.w3.org/2002/07/owl#allValuesFrom")

    EX = Namespace("http://example.org/")
    g = Graph()

    # Two restrictions with same structure but different property URIs
    r1 = BNode()
    g.add((r1, RDF_TYPE, OWL_RESTRICTION))
    g.add((r1, OWL_ON_PROP, EX.alpha))
    g.add((r1, OWL_ALL_VALUES, EX.Target1))

    r2 = BNode()
    g.add((r2, RDF_TYPE, OWL_RESTRICTION))
    g.add((r2, OWL_ON_PROP, EX.beta))
    g.add((r2, OWL_ALL_VALUES, EX.Target2))

    RDFS_SUBCLASS = URIRef("http://www.w3.org/2000/01/rdf-schema#subClassOf")
    g.add((EX.MyClass, RDFS_SUBCLASS, r1))
    g.add((EX.MyClass, RDFS_SUBCLASS, r2))

    # Must be deterministic across runs
    out1 = deterministic_turtle(g)
    out2 = deterministic_turtle(g)
    assert out1 == out2, "WL-based serializer is not deterministic for similar BNodes"

    # Both restrictions must appear (not collapsed)
    assert "alpha" in out1
    assert "beta" in out1


def test_deterministic_turtle_no_bnodes():
    """Graphs with no blank nodes should still produce sorted, deterministic output."""
    from rdflib import Graph, Literal, Namespace
    from rdflib.namespace import RDFS

    from linkml.utils.generator import deterministic_turtle

    EX = Namespace("http://example.org/")
    g = Graph()
    g.add((EX.B, RDFS.label, Literal("B")))
    g.add((EX.A, RDFS.label, Literal("A")))

    out1 = deterministic_turtle(g)
    out2 = deterministic_turtle(g)
    assert out1 == out2

    # A should appear before B (sorted)
    a_pos = out1.find("example.org/A")
    b_pos = out1.find("example.org/B")
    assert a_pos < b_pos, "Triples should be sorted: A before B"
+ + The test is marked xfail(strict=True) so that it: + - Documents the known, intentional non-isomorphism + - Alerts maintainers if the behaviour changes (strict xfail fails on pass) + """ + out_det = generator_cls(SCHEMA, deterministic=True).serialize() + out_nondet = generator_cls(SCHEMA, deterministic=False).serialize() + + g_det = Graph() + g_det.parse(data=out_det, format="turtle") + + g_nondet = Graph() + g_nondet.parse(data=out_nondet, format="turtle") + + assert len(g_det) == len(g_nondet), ( + f"Triple count mismatch: deterministic={len(g_det)}, non-deterministic={len(g_nondet)}" + ) + assert isomorphic(g_det, g_nondet), ( + f"{generator_cls.__name__}: deterministic output is NOT isomorphic " + "to non-deterministic output — the serialization changed the graph" + ) + + +@pytest.mark.parametrize( + "generator_cls", + [OwlSchemaGenerator, ShaclGenerator], + ids=["owl", "shacl"], +) +def test_non_deterministic_output_unchanged(generator_cls): + """Non-deterministic output must still produce valid RDF. + + Ensures that changes for deterministic mode don't break default behavior. 
+ """ + out = generator_cls(SCHEMA, deterministic=False).serialize() + assert len(out) > 100, "Output suspiciously short" + g = Graph() + g.parse(data=out, format="turtle") + assert len(g) > 50, f"Graph has too few triples ({len(g)})" + + +@pytest.mark.parametrize( + "generator_cls,kwargs", + [ + (OwlSchemaGenerator, {}), + (ShaclGenerator, {}), + (ContextGenerator, {}), + (JSONLDGenerator, {}), + ], + ids=["owl", "shacl", "context", "jsonld"], +) +def test_non_deterministic_produces_valid_output(generator_cls, kwargs): + """All generators must produce valid output in non-deterministic mode.""" + out = generator_cls(SCHEMA, deterministic=False, **kwargs).serialize() + assert len(out) > 100, f"{generator_cls.__name__} output suspiciously short" + + +@pytest.mark.xfail( + reason=( + "Collection sorting in deterministic mode produces non-isomorphic RDF " + "(different rdf:first/rdf:rest triples). See test_deterministic_turtle_is_isomorphic." + ), + strict=True, +) +@pytest.mark.parametrize( + "generator_cls", + [OwlSchemaGenerator, ShaclGenerator], + ids=["owl", "shacl"], +) +def test_deterministic_kitchen_sink_isomorphic(generator_cls): + """Isomorphism check on the complex kitchen_sink schema. + + Expected to fail for the same reason as test_deterministic_turtle_is_isomorphic: + Collection sorting changes the RDF structure while preserving OWL/SHACL semantics. 
+ """ + out_det = generator_cls(KITCHEN_SINK, deterministic=True).serialize() + out_nondet = generator_cls(KITCHEN_SINK, deterministic=False).serialize() + + g_det = Graph() + g_det.parse(data=out_det, format="turtle") + + g_nondet = Graph() + g_nondet.parse(data=out_nondet, format="turtle") + + assert isomorphic(g_det, g_nondet), ( + f"{generator_cls.__name__}: kitchen_sink deterministic output is NOT isomorphic to non-deterministic output" + ) + + +@pytest.mark.skipif(False, reason="does not require pyoxigraph") +def test_expression_sort_key_is_stable(): + """``_expression_sort_key`` must produce stable, content-based keys. + + LinkML anonymous expressions inherit ``YAMLRoot.__repr__()``, which + formats objects using **field values** (not memory addresses). + The ``_expression_sort_key`` helper relies on this for deterministic + ordering of ``any_of`` / ``all_of`` / ``none_of`` members. + + This test verifies that: + 1. Two distinct objects with identical fields produce the same key. + 2. Objects with different fields produce different keys. + 3. Sorting is stable across repeated calls. 
+ """ + from linkml.generators.owlgen import _expression_sort_key + from linkml_runtime.linkml_model.meta import AnonymousClassExpression, AnonymousSlotExpression + + # Two distinct objects with identical content → same key + a1 = AnonymousClassExpression(is_a="Parent") + a2 = AnonymousClassExpression(is_a="Parent") + assert a1 is not a2 + assert _expression_sort_key(a1) == _expression_sort_key(a2) + + # Different content → different keys + b = AnonymousClassExpression(is_a="Child") + assert _expression_sort_key(a1) != _expression_sort_key(b) + + # Sorting stability: same order every time + items = [b, a1, a2] + for _ in range(5): + result = sorted(items, key=_expression_sort_key) + # "Child" < "Parent" alphabetically, so b comes first + assert _expression_sort_key(result[0]) == _expression_sort_key(b) + assert _expression_sort_key(result[1]) == _expression_sort_key(result[2]) # a1, a2 together + + # Slot expressions work too + s1 = AnonymousSlotExpression(range="string") + s2 = AnonymousSlotExpression(range="integer") + assert _expression_sort_key(s1) != _expression_sort_key(s2) + order1 = sorted([s2, s1], key=_expression_sort_key) + order2 = sorted([s1, s2], key=_expression_sort_key) + assert [_expression_sort_key(x) for x in order1] == [_expression_sort_key(x) for x in order2] From a3127868210383fa86c65a9c4a128bc154cce496 Mon Sep 17 00:00:00 2001 From: jdsika Date: Thu, 2 Apr 2026 17:21:36 +0200 Subject: [PATCH 5/5] feat(generators): add --normalize-prefixes flag for well-known prefix names Add an opt-in --normalize-prefixes flag to OWL, SHACL, and JSON-LD Context generators that normalises non-standard prefix aliases to well-known names from a static prefix map (derived from rdflib 7.x defaults, cross-checked against prefix.cc consensus). 
Key design decisions: - Static frozen map (MappingProxyType) instead of runtime Graph().namespaces() lookup eliminates rdflib version dependency - Both http://schema.org/ and https://schema.org/ map to 'schema' - Shared normalize_graph_prefixes() helper used by OWL and SHACL - Two-phase graph normalisation: Phase 1 normalises schema-declared prefixes, Phase 2 cleans up runtime-injected bindings - Collision detection: skip with warning when standard prefix name is already user-declared for a different namespace - Phase 2 guard prevents overwriting HTTPS bindings with HTTP variants The flag defaults to off, preserving existing behaviour. Tests cover OWL, SHACL, and context generators with sdo->schema, dce->dc, http/https edge case, custom prefix preservation, flag-off backward compatibility, cross-generator consistency, prefix collision detection, schema1 regression prevention, Phase 2 HTTPS guard, empty schema edge case, and static map integrity. Signed-off-by: jdsika --- .../src/linkml/generators/jsonldcontextgen.py | 82 ++- .../linkml/src/linkml/generators/jsonldgen.py | 5 + .../linkml/src/linkml/generators/owlgen.py | 6 +- .../linkml/src/linkml/generators/shaclgen.py | 6 +- packages/linkml/src/linkml/utils/generator.py | 136 ++++- .../test_generators/test_jsonldcontextgen.py | 116 ++++ .../test_normalize_prefixes.py | 555 ++++++++++++++++++ 7 files changed, 896 insertions(+), 10 deletions(-) create mode 100644 tests/linkml/test_generators/test_normalize_prefixes.py diff --git a/packages/linkml/src/linkml/generators/jsonldcontextgen.py b/packages/linkml/src/linkml/generators/jsonldcontextgen.py index d19498195..101e773ff 100644 --- a/packages/linkml/src/linkml/generators/jsonldcontextgen.py +++ b/packages/linkml/src/linkml/generators/jsonldcontextgen.py @@ -15,7 +15,7 @@ from linkml._version import __version__ from linkml.utils.deprecation import deprecated_fields -from linkml.utils.generator import Generator, shared_arguments +from linkml.utils.generator import 
Generator, shared_arguments, well_known_prefix_map from linkml_runtime.linkml_model.meta import ClassDefinition, SlotDefinition from linkml_runtime.linkml_model.types import SHEX from linkml_runtime.utils.formatutils import camelcase, underscore @@ -90,6 +90,9 @@ class ContextGenerator(Generator): frame_root: str | None = None def __post_init__(self) -> None: + # Must be set before super().__post_init__() because the parent triggers + # the visitor pattern (visit_schema), which accesses _prefix_remap. + self._prefix_remap: dict[str, str] = {} super().__post_init__() if self.namespaces is None: raise TypeError("Schema text must be supplied to context generator. Preparsed schema will not work") @@ -127,8 +130,14 @@ def _collect_external_elements(sv: SchemaView) -> tuple[set[str], set[str]]: external_slots.update(schema_def.slots.keys()) return external_classes, external_slots + def add_prefix(self, ncname: str) -> None: + """Add a prefix, applying well-known prefix normalisation when enabled.""" + super().add_prefix(self._prefix_remap.get(ncname, ncname)) + def visit_schema(self, base: str | Namespace | None = None, output: str | None = None, **_): - # Add any explicitly declared prefixes + # Add any explicitly declared prefixes. + # Direct .add() is safe here: the normalisation block below explicitly + # rewrites emit_prefixes entries for any renamed prefixes (Cases 1-3). for prefix in self.schema.prefixes.values(): self.emit_prefixes.add(prefix.prefix_prefix) @@ -136,6 +145,68 @@ def visit_schema(self, base: str | Namespace | None = None, output: str | None = for pfx in self.schema.emit_prefixes: self.add_prefix(pfx) + # Normalise well-known prefix names when --normalize-prefixes is set. + # If the schema declares a non-standard alias for a namespace that has + # a well-known standard name (e.g. ``sdo`` for + # ``https://schema.org/``), replace the alias with the standard name + # so that generated JSON-LD contexts use the conventional prefix. 
+ # + # Three cases are handled: + # 1. Standard prefix is not yet bound → just rebind from old to new. + # 2. Standard prefix is bound to a *different* URI: + # a. User-declared (in schema.prefixes) → collision, skip with warning. + # b. Runtime default (e.g. linkml-runtime's ``schema: http://…``) + # → remove stale binding, then rebind. + # 3. Standard prefix is already bound to the *same* URI (duplicate) + # → just drop the non-standard alias. + # + # A remap dict is stored for ``_build_element_id`` because + # ``prefix_suffix()`` splits CURIEs on ``:`` without looking up the + # namespace dict. + self._prefix_remap.clear() + if self.normalize_prefixes: + wk = well_known_prefix_map() + for old_pfx in list(self.namespaces): + url = str(self.namespaces[old_pfx]) + std_pfx = wk.get(url) + if not std_pfx or std_pfx == old_pfx: + continue + if std_pfx in self.namespaces: + if str(self.namespaces[std_pfx]) != url: + # Case 2: std_pfx is bound to a different URI. + # If the user explicitly declared std_pfx in the schema, + # it is intentional — skip to avoid data loss. + if std_pfx in self.schema.prefixes: + self.logger.warning( + "Prefix collision: cannot rename '%s' to '%s' because '%s' is " + "already declared for <%s>; skipping normalisation for <%s>", + old_pfx, + std_pfx, + std_pfx, + str(self.namespaces[std_pfx]), + url, + ) + continue + # Not user-declared (e.g. 
linkml-runtime default) — safe to remove + self.emit_prefixes.discard(std_pfx) + del self.namespaces[std_pfx] + else: + # Case 3: standard prefix already bound to same URI + # — just drop the non-standard alias + del self.namespaces[old_pfx] + if old_pfx in self.emit_prefixes: + self.emit_prefixes.discard(old_pfx) + self.emit_prefixes.add(std_pfx) + self._prefix_remap[old_pfx] = std_pfx + continue + # Case 1 (or Case 2 after stale removal): bind standard name + self.namespaces[std_pfx] = self.namespaces[old_pfx] + del self.namespaces[old_pfx] + if old_pfx in self.emit_prefixes: + self.emit_prefixes.discard(old_pfx) + self.emit_prefixes.add(std_pfx) + self._prefix_remap[old_pfx] = std_pfx + # Add the default prefix if self.schema.default_prefix: dflt = self.namespaces.prefix_for(self.schema.default_prefix) @@ -143,6 +214,8 @@ def visit_schema(self, base: str | Namespace | None = None, output: str | None = self.default_ns = dflt if self.default_ns: default_uri = self.namespaces[self.default_ns] + # Direct .add() is safe: default_ns is already resolved from + # the (possibly normalised) namespace bindings above. self.emit_prefixes.add(self.default_ns) else: default_uri = self.schema.default_prefix @@ -417,6 +490,11 @@ def _build_element_id(self, definition: Any, uri: str) -> None: @return: None """ uri_prefix, uri_suffix = self.namespaces.prefix_suffix(uri) + # Apply well-known prefix normalisation (e.g. sdo → schema). + # prefix_suffix() splits CURIEs on ':' without checking the + # namespace dict, so it may return a stale alias. 
+ if uri_prefix and uri_prefix in self._prefix_remap: + uri_prefix = self._prefix_remap[uri_prefix] is_default_namespace = uri_prefix == self.context_body["@vocab"] or uri_prefix == self.namespaces.prefix_for( self.context_body["@vocab"] ) diff --git a/packages/linkml/src/linkml/generators/jsonldgen.py b/packages/linkml/src/linkml/generators/jsonldgen.py index 0c9c87cbb..0b58aec23 100644 --- a/packages/linkml/src/linkml/generators/jsonldgen.py +++ b/packages/linkml/src/linkml/generators/jsonldgen.py @@ -179,6 +179,11 @@ def end_schema(self, context: str | Sequence[str] | None = None, context_kwargs: # TODO: The _visit function above alters the schema in situ # force some context_kwargs context_kwargs["metadata"] = False + # Forward generator flags so prefix normalisation and deterministic + # output propagate into the inline @context produced for JSON-LD. + for flag in ("normalize_prefixes", "deterministic"): + if hasattr(self, flag): + context_kwargs.setdefault(flag, getattr(self, flag)) add_prefixes = ContextGenerator(self.original_schema, **context_kwargs).serialize() add_prefixes_json = loads(add_prefixes) metamodel_ctx = self.metamodel_context or METAMODEL_CONTEXT_URI diff --git a/packages/linkml/src/linkml/generators/owlgen.py b/packages/linkml/src/linkml/generators/owlgen.py index 418871fa3..22f04eba0 100644 --- a/packages/linkml/src/linkml/generators/owlgen.py +++ b/packages/linkml/src/linkml/generators/owlgen.py @@ -21,7 +21,7 @@ from linkml._version import __version__ from linkml.generators.common.subproperty import is_xsd_anyuri_range from linkml.utils.deprecation import deprecation_warning -from linkml.utils.generator import Generator, shared_arguments +from linkml.utils.generator import Generator, normalize_graph_prefixes, shared_arguments from linkml_runtime import SchemaView from linkml_runtime.linkml_model.meta import ( AnonymousClassExpression, @@ -272,6 +272,10 @@ def as_graph(self) -> Graph: self.graph.bind(prefix, 
self.metamodel.namespaces[prefix]) for pfx in schema.prefixes.values(): self.graph.namespace_manager.bind(pfx.prefix_prefix, URIRef(pfx.prefix_reference)) + if self.normalize_prefixes: + normalize_graph_prefixes( + graph, {str(v.prefix_prefix): str(v.prefix_reference) for v in schema.prefixes.values()} + ) graph.add((base, RDF.type, OWL.Ontology)) # Add main schema elements diff --git a/packages/linkml/src/linkml/generators/shaclgen.py b/packages/linkml/src/linkml/generators/shaclgen.py index ac6afa4cd..7d5bb6c8f 100644 --- a/packages/linkml/src/linkml/generators/shaclgen.py +++ b/packages/linkml/src/linkml/generators/shaclgen.py @@ -13,7 +13,7 @@ from linkml.generators.common.subproperty import get_subproperty_values, is_uri_range from linkml.generators.shacl.shacl_data_type import ShaclDataType from linkml.generators.shacl.shacl_ifabsent_processor import ShaclIfAbsentProcessor -from linkml.utils.generator import Generator, shared_arguments +from linkml.utils.generator import Generator, normalize_graph_prefixes, shared_arguments from linkml_runtime.linkml_model.meta import ClassDefinition, ElementName from linkml_runtime.utils.formatutils import underscore from linkml_runtime.utils.yamlutils import TypedNode, extended_float, extended_int, extended_str @@ -111,6 +111,10 @@ def as_graph(self) -> Graph: for pfx in self.schema.prefixes.values(): g.bind(str(pfx.prefix_prefix), pfx.prefix_reference) + if self.normalize_prefixes: + normalize_graph_prefixes( + g, {str(v.prefix_prefix): str(v.prefix_reference) for v in self.schema.prefixes.values()} + ) for c in sv.all_classes(imports=not self.exclude_imports).values(): diff --git a/packages/linkml/src/linkml/utils/generator.py b/packages/linkml/src/linkml/utils/generator.py index 605d9cec4..548ca7cbf 100644 --- a/packages/linkml/src/linkml/utils/generator.py +++ b/packages/linkml/src/linkml/utils/generator.py @@ -20,6 +20,7 @@ import os import re import sys +import types from collections.abc import Callable, Mapping from 
dataclasses import dataclass, field from functools import lru_cache @@ -62,6 +63,9 @@ from linkml_runtime.utils.formatutils import camelcase, underscore from linkml_runtime.utils.namespaces import Namespaces +if TYPE_CHECKING: + from rdflib import Graph + logger = logging.getLogger(__name__) @@ -357,15 +361,133 @@ def _deep_sort(value: object, parent_key: str = "") -> object: def well_known_prefix_map() -> dict[str, str]: """Return a mapping from namespace URI to standard prefix name. - Uses rdflib's curated default namespace bindings as the source of truth. - For example, ``https://schema.org/`` maps to ``schema``. + Uses a frozen, version-independent map derived from rdflib 7.x curated + defaults (which align with the `prefix.cc <https://prefix.cc>`_ community + consensus registry). The map is **not** computed at runtime from + ``Graph().namespaces()`` because those defaults can change across rdflib + releases (they differ between 6.x and 7.x), which would silently alter + generator output. This allows generators to normalise non-standard prefix aliases (e.g. ``sdo`` for ``https://schema.org/``) to their conventional names. + + Both ``http`` and ``https`` variants of schema.org are included because + the linkml-runtime historically binds ``schema: http://schema.org/`` + while rdflib (and the W3C) prefer ``https://schema.org/``. """ - from rdflib import Graph as RdfGraph + return dict(_WELL_KNOWN_PREFIX_MAP) + + +# Frozen, version-independent map: namespace URI → canonical prefix name. 
+# Source: rdflib 7.x defaults, cross-checked against https://prefix.cc +_WELL_KNOWN_PREFIX_MAP: types.MappingProxyType[str, str] = types.MappingProxyType( + { + "https://brickschema.org/schema/Brick#": "brick", + "http://www.w3.org/ns/csvw#": "csvw", + "http://purl.org/dc/elements/1.1/": "dc", + "http://purl.org/dc/dcam/": "dcam", + "http://www.w3.org/ns/dcat#": "dcat", + "http://purl.org/dc/dcmitype/": "dcmitype", + "http://purl.org/dc/terms/": "dcterms", + "http://usefulinc.com/ns/doap#": "doap", + "http://xmlns.com/foaf/0.1/": "foaf", + "http://www.opengis.net/ont/geosparql#": "geo", + "http://www.w3.org/ns/odrl/2/": "odrl", + "http://www.w3.org/ns/org#": "org", + "http://www.w3.org/2002/07/owl#": "owl", + "http://www.w3.org/ns/dx/prof/": "prof", + "http://www.w3.org/ns/prov#": "prov", + "http://purl.org/linked-data/cube#": "qb", + "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf", + "http://www.w3.org/2000/01/rdf-schema#": "rdfs", + "https://schema.org/": "schema", + "http://schema.org/": "schema", # HTTP variant (linkml-runtime uses this) + "http://www.w3.org/ns/shacl#": "sh", + "http://www.w3.org/2004/02/skos/core#": "skos", + "http://www.w3.org/ns/sosa/": "sosa", + "http://www.w3.org/ns/ssn/": "ssn", + "http://www.w3.org/2006/time#": "time", + "http://purl.org/vocab/vann/": "vann", + "http://rdfs.org/ns/void#": "void", + "https://www.w3.org/2003/01/geo/wgs84_pos#": "wgs", + "http://www.w3.org/2003/01/geo/wgs84_pos#": "wgs", # HTTP variant (W3C canonical) + "http://www.w3.org/XML/1998/namespace": "xml", + "http://www.w3.org/2001/XMLSchema#": "xsd", + } +) + - return {str(ns): str(pfx) for pfx, ns in RdfGraph().namespaces() if str(pfx)} +def normalize_graph_prefixes(graph: "Graph", schema_prefixes: dict[str, str]) -> None: + """Normalise non-standard prefix aliases in an rdflib Graph. + + For each prefix bound in *schema_prefixes* (mapping prefix name → + namespace URI), check whether ``well_known_prefix_map()`` knows a + standard name for that URI. 
If the standard name differs from the + schema-declared name, rebind the namespace to the standard name. + + This is the **shared implementation** used by OWL, SHACL, and (via a + different code-path) JSON-LD context generators so that all serialisation + formats agree on prefix names when ``--normalize-prefixes`` is active. + + :param graph: rdflib Graph whose namespace bindings should be adjusted. + :param schema_prefixes: mapping of prefix name → namespace URI string, + typically from ``schema.prefixes``. + """ + from rdflib import Namespace + + wk = well_known_prefix_map() + + # Phase 1: normalise schema-declared prefixes. + for old_pfx, ns_uri in schema_prefixes.items(): + ns_str = str(ns_uri) + std_pfx = wk.get(ns_str) + if not std_pfx or std_pfx == old_pfx: + continue + # Collision: the user explicitly declared std_pfx for a different + # namespace — do not clobber their binding. + if std_pfx in schema_prefixes and schema_prefixes[std_pfx] != ns_str: + logger.warning( + "Prefix collision: cannot rename '%s' to '%s' because '%s' is already " + "declared for <%s>; skipping normalisation for <%s>", + old_pfx, + std_pfx, + std_pfx, + schema_prefixes[std_pfx], + ns_str, + ) + continue + # Rebind: remove old prefix, add standard prefix. + # ``replace=True`` forces the new prefix even if the prefix name + # is already bound to a different namespace. + graph.bind(std_pfx, Namespace(ns_str), override=True, replace=True) + + # Phase 2: normalise runtime-injected bindings (e.g. metamodel defaults). + # The linkml-runtime / rdflib may inject well-known namespaces under + # non-standard prefix names. After Phase 1 rebinds schema-declared + # prefixes, orphaned runtime bindings can appear as ``schema1``, ``dc0``, + # etc. Scan the graph's current bindings and fix any that map to a + # well-known namespace under a non-standard name, provided the standard + # name isn't already claimed by the user for a different namespace. 
+ # + # Guard: if Phase 1 already bound std_pfx to a different URI (e.g. + # ``schema`` → ``https://schema.org/``), do not clobber it with the + # HTTP variant (``http://schema.org/``). Build a snapshot of the + # current bindings after Phase 1 to detect this. + current_bindings = {str(p): str(n) for p, n in graph.namespaces()} + for pfx, ns in list(graph.namespaces()): + pfx_str, ns_str = str(pfx), str(ns) + std_pfx = wk.get(ns_str) + if not std_pfx or std_pfx == pfx_str: + continue + # Same collision check as Phase 1: respect user-declared prefixes. + if std_pfx in schema_prefixes and schema_prefixes[std_pfx] != ns_str: + continue + # Guard: if std_pfx is already bound to a different (correct) URI + # by Phase 1, do not overwrite it. This prevents the HTTP variant + # of schema.org from clobbering the HTTPS binding. + if std_pfx in current_bindings and current_bindings[std_pfx] != ns_str: + continue + graph.bind(std_pfx, Namespace(ns_str), override=True, replace=True) @dataclass @@ -474,8 +596,10 @@ class Generator(metaclass=abc.ABCMeta): """True means print stack trace, false just error message""" normalize_prefixes: bool = False - """True means normalise non-standard prefix aliases to rdflib's curated default names - (e.g. ``sdo`` → ``schema`` for ``https://schema.org/``).""" + """True means normalise non-standard prefix aliases to well-known names + from the static ``_WELL_KNOWN_PREFIX_MAP`` (derived from rdflib 7.x + defaults / prefix.cc consensus). E.g. 
``sdo`` → ``schema`` for + ``https://schema.org/``.""" include: str | Path | SchemaDefinition | None = None """If set, include extra schema outside of the imports mechanism""" diff --git a/tests/linkml/test_generators/test_jsonldcontextgen.py b/tests/linkml/test_generators/test_jsonldcontextgen.py index 7bae2eaa5..7d5378879 100644 --- a/tests/linkml/test_generators/test_jsonldcontextgen.py +++ b/tests/linkml/test_generators/test_jsonldcontextgen.py @@ -573,6 +573,7 @@ def test_exclude_imports(input_path): assert "BaseClass" not in ctx, "Imported class 'BaseClass' must not appear in exclude-imports context" assert "baseProperty" not in ctx, "Imported slot 'baseProperty' must not appear in exclude-imports context" + @pytest.mark.parametrize("mergeimports", [True, False], ids=["merge", "no-merge"]) def test_exclude_external_imports(tmp_path, mergeimports): """With --exclude-external-imports, elements from URL-based external @@ -1233,3 +1234,118 @@ def test_xsd_anyuri_as_iri_cli_flag(): assert result_iri.exit_code == 0, result_iri.output ctx_iri = json.loads(result_iri.output)["@context"] assert ctx_iri["homepage"]["@type"] == "@id" + + +def test_normalize_prefixes_renames_nonstandard_alias(tmp_path): + """When --normalize-prefixes is set, non-standard aliases are replaced by rdflib defaults. + + rdflib binds ``dc`` to ``http://purl.org/dc/elements/1.1/`` by default. + A schema that declares ``dce`` for the same URI should have it normalised + to ``dc`` when the flag is enabled. + + See: rdflib default namespace bindings. 
+ """ + schema = tmp_path / "schema.yaml" + schema.write_text( + """\ +id: https://example.org/test +name: test_normalize +default_prefix: ex +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + dce: http://purl.org/dc/elements/1.1/ +imports: + - linkml:types +classes: + Record: + class_uri: ex:Record + attributes: + title: + range: string + slot_uri: dce:title +""", + encoding="utf-8", + ) + + # Flag OFF (default): non-standard alias preserved + ctx_off = json.loads(ContextGenerator(str(schema), normalize_prefixes=False).serialize())["@context"] + assert "dce" in ctx_off, "With flag off, original prefix 'dce' must be preserved" + + # Flag ON: rdflib default name used + ctx_on = json.loads(ContextGenerator(str(schema), normalize_prefixes=True).serialize())["@context"] + assert "dc" in ctx_on, "With flag on, 'dce' should be normalised to 'dc'" + assert "dce" not in ctx_on, "With flag on, original alias 'dce' should be removed" + assert ctx_on["dc"] == "http://purl.org/dc/elements/1.1/" + + +def test_normalize_prefixes_default_is_off(tmp_path): + """The --normalize-prefixes flag defaults to False — no prefix renaming. + + Ensures backward compatibility: existing schemas produce identical output. + """ + schema = tmp_path / "schema.yaml" + schema.write_text( + """\ +id: https://example.org/test +name: test_default +default_prefix: ex +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + sdo: https://schema.org/ +imports: + - linkml:types +classes: + Thing: + class_uri: sdo:Thing + attributes: + name: + range: string + slot_uri: sdo:name +""", + encoding="utf-8", + ) + + ctx = json.loads(ContextGenerator(str(schema)).serialize())["@context"] + # Without the flag, the schema's own prefix name must be preserved + assert "sdo" in ctx, "Default behavior must preserve schema-declared prefix 'sdo'" + + +def test_normalize_prefixes_curie_remapping(tmp_path): + """CURIEs in element @id values use the normalised prefix name. 
+ + When ``sdo`` is normalised to ``schema``, slot URIs like ``sdo:name`` + must appear as ``schema:name`` in the generated context. + """ + schema = tmp_path / "schema.yaml" + schema.write_text( + """\ +id: https://example.org/test +name: test_curie +default_prefix: ex +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + sdo: https://schema.org/ +imports: + - linkml:types +classes: + Person: + class_uri: sdo:Person + attributes: + full_name: + range: string + slot_uri: sdo:name +""", + encoding="utf-8", + ) + + ctx = json.loads(ContextGenerator(str(schema), normalize_prefixes=True).serialize())["@context"] + # The prefix declaration must use the standard name + assert "schema" in ctx, "Normalised prefix 'schema' must appear" + # Element @id must use the normalised prefix + person = ctx.get("Person", {}) + assert person.get("@id", "").startswith("schema:"), ( + f"Person @id should use normalised prefix 'schema:', got {person}" + ) diff --git a/tests/linkml/test_generators/test_normalize_prefixes.py b/tests/linkml/test_generators/test_normalize_prefixes.py new file mode 100644 index 000000000..5eb3f5b87 --- /dev/null +++ b/tests/linkml/test_generators/test_normalize_prefixes.py @@ -0,0 +1,555 @@ +"""Tests for the --normalize-prefixes flag across all generators. + +Verifies that non-standard prefix aliases (e.g. ``sdo`` for ``https://schema.org/``) +are normalised to well-known names (e.g. ``schema``) consistently in OWL, SHACL, +and JSON-LD context output. 
+ +References: +- prefix.cc — community consensus RDF prefix registry +- rdflib 7.x curated default namespace bindings +- W3C Turtle §2.4 — prefix declarations are syntactic sugar +""" + +import json +import logging +import re +import textwrap + +# ── Shared test schema ────────────────────────────────────────────── + +SCHEMA_SDO = textwrap.dedent("""\ + id: https://example.org/test + name: test_normalize + default_prefix: ex + prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + sdo: https://schema.org/ + imports: + - linkml:types + classes: + Person: + class_uri: sdo:Person + attributes: + full_name: + range: string + slot_uri: sdo:name +""") + +SCHEMA_DCE = textwrap.dedent("""\ + id: https://example.org/test + name: test_normalize_dce + default_prefix: ex + prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + dce: http://purl.org/dc/elements/1.1/ + imports: + - linkml:types + classes: + Record: + class_uri: ex:Record + attributes: + title: + range: string + slot_uri: dce:title +""") + +# HTTP variant — linkml-runtime historically binds schema: http://schema.org/ +# while rdflib (and the W3C) prefer https://schema.org/. The normalize flag +# must handle both. +SCHEMA_HTTP_SDO = textwrap.dedent("""\ + id: https://example.org/test + name: test_http_schema + default_prefix: ex + prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + sdo: http://schema.org/ + imports: + - linkml:types + classes: + Place: + class_uri: sdo:Place + attributes: + geo: + range: string + slot_uri: sdo:geo +""") + +# Collision scenario: user declares 'foaf' for a custom namespace AND 'myfoaf' +# for http://xmlns.com/foaf/0.1/. Normalisation must NOT clobber the user's 'foaf'. +# Uses 'foaf' instead of 'schema' because 'schema' is declared in linkml:types, +# which causes a SchemaLoader merge conflict before normalisation even runs. 
SCHEMA_COLLISION = textwrap.dedent("""\
    id: https://example.org/test
    name: test_collision
    default_prefix: ex
    prefixes:
      ex: https://example.org/
      linkml: https://w3id.org/linkml/
      foaf: https://something-else.org/
      myfoaf: http://xmlns.com/foaf/0.1/
    imports:
      - linkml:types
    classes:
      Agent:
        class_uri: myfoaf:Agent
        attributes:
          label:
            range: string
            slot_uri: myfoaf:name
""")


def _write_schema(tmp_path, content: str, name: str = "schema.yaml") -> str:
    """Write schema content to a temporary file and return its path as string."""
    p = tmp_path / name
    p.write_text(content, encoding="utf-8")
    return str(p)


def _turtle_prefixes(ttl: str) -> dict[str, str]:
    """Extract @prefix declarations from Turtle output → {prefix: namespace}."""
    # NOTE(review): \w+ does not match prefixes containing '-' or '.'; that is
    # adequate for the fixed prefix names exercised by these tests.
    result = {}
    for m in re.finditer(r"@prefix\s+(\w+):\s+<([^>]+)>", ttl):
        result[m.group(1)] = m.group(2)
    return result


# ── OWL Generator Tests ─────────────────────────────────────────────


class TestOwlNormalizePrefixes:
    """OWL generator prefix normalisation tests."""

    def test_sdo_normalised_to_schema(self, tmp_path):
        """sdo → schema when --normalize-prefixes is active."""
        from linkml.generators.owlgen import OwlSchemaGenerator

        schema_path = _write_schema(tmp_path, SCHEMA_SDO)
        ttl = OwlSchemaGenerator(schema_path, normalize_prefixes=True).serialize()
        pfx = _turtle_prefixes(ttl)
        assert "schema" in pfx, f"Expected 'schema' prefix in OWL output, got: {sorted(pfx)}"
        assert pfx["schema"] == "https://schema.org/"
        assert "sdo" not in pfx, "Non-standard 'sdo' prefix should be removed"

    def test_flag_off_preserves_original(self, tmp_path):
        """Without the flag, schema-declared prefix names are preserved."""
        from linkml.generators.owlgen import OwlSchemaGenerator

        schema_path = _write_schema(tmp_path, SCHEMA_SDO)
        ttl = OwlSchemaGenerator(schema_path, normalize_prefixes=False).serialize()
        pfx = _turtle_prefixes(ttl)
        assert "sdo" in pfx, "With flag off, original prefix 'sdo' must be preserved"

    def test_dce_normalised_to_dc(self, tmp_path):
        """dce → dc for http://purl.org/dc/elements/1.1/ in graph bindings.

        Note: rdflib's Turtle serializer only emits @prefix declarations for
        namespaces actually used in triples. Since the OWL generator may not
        produce triples using dc:elements URIs for simple attribute schemas,
        we verify the graph's namespace bindings directly.
        """
        from linkml.generators.owlgen import OwlSchemaGenerator

        schema_path = _write_schema(tmp_path, SCHEMA_DCE)
        gen = OwlSchemaGenerator(schema_path, normalize_prefixes=True)
        graph = gen.as_graph()
        bound = {str(p): str(n) for p, n in graph.namespaces()}
        assert "dc" in bound, f"Expected 'dc' in graph bindings, got: {sorted(bound)}"
        assert bound["dc"] == "http://purl.org/dc/elements/1.1/"

    def test_custom_prefix_not_affected(self, tmp_path):
        """Domain-specific prefixes (e.g. 'ex') are not touched by normalisation."""
        from linkml.generators.owlgen import OwlSchemaGenerator

        schema_path = _write_schema(tmp_path, SCHEMA_SDO)
        ttl = OwlSchemaGenerator(schema_path, normalize_prefixes=True).serialize()
        pfx = _turtle_prefixes(ttl)
        assert "ex" in pfx, "Custom prefix 'ex' must survive normalisation"
        assert pfx["ex"] == "https://example.org/"

    def test_http_schema_org_normalised(self, tmp_path):
        """http://schema.org/ (HTTP variant) also normalises to 'schema'.

        The linkml-runtime historically binds ``schema: http://schema.org/``
        while the W3C and rdflib prefer ``https://schema.org/``. Both
        variants must be recognised by the static well-known prefix map.
        """
        from linkml.generators.owlgen import OwlSchemaGenerator

        schema_path = _write_schema(tmp_path, SCHEMA_HTTP_SDO)
        ttl = OwlSchemaGenerator(schema_path, normalize_prefixes=True).serialize()
        pfx = _turtle_prefixes(ttl)
        assert "schema" in pfx, f"Expected 'schema' prefix for http://schema.org/, got: {sorted(pfx)}"
        assert "sdo" not in pfx

    def test_no_schema1_from_runtime_http_binding(self, tmp_path):
        """Runtime-injected ``schema: http://schema.org/`` must not create ``schema1``.

        The linkml metamodel (types.yaml) declares ``schema: http://schema.org/``
        (HTTP). When a user schema declares ``sdo: https://schema.org/`` (HTTPS),
        normalisation must clean up *both* variants so the output never contains
        auto-generated suffixed prefixes like ``schema1``.
        """
        from linkml.generators.owlgen import OwlSchemaGenerator

        schema_path = _write_schema(tmp_path, SCHEMA_SDO)
        ttl = OwlSchemaGenerator(schema_path, normalize_prefixes=True).serialize()
        pfx = _turtle_prefixes(ttl)
        # re.match anchors at the start, so this catches schema1, schema2, …
        suffixed = [p for p in pfx if re.match(r"schema\d+", p)]
        assert not suffixed, (
            f"Auto-generated suffixed prefix(es) {suffixed} found — "
            "runtime http://schema.org/ binding was not cleaned up"
        )


# ── SHACL Generator Tests ───────────────────────────────────────────


class TestShaclNormalizePrefixes:
    """SHACL generator prefix normalisation tests."""

    def test_sdo_normalised_to_schema(self, tmp_path):
        """sdo → schema when --normalize-prefixes is active."""
        from linkml.generators.shaclgen import ShaclGenerator

        schema_path = _write_schema(tmp_path, SCHEMA_SDO)
        ttl = ShaclGenerator(schema_path, normalize_prefixes=True).serialize()
        pfx = _turtle_prefixes(ttl)
        assert "schema" in pfx, f"Expected 'schema' prefix in SHACL output, got: {sorted(pfx)}"
        assert pfx["schema"] == "https://schema.org/"
        assert "sdo" not in pfx, "Non-standard 'sdo' prefix should be removed"

    def test_flag_off_preserves_original(self, tmp_path):
        """Without the flag, schema-declared prefix names are preserved."""
        from linkml.generators.shaclgen import ShaclGenerator

        schema_path = _write_schema(tmp_path, SCHEMA_SDO)
        ttl = ShaclGenerator(schema_path, normalize_prefixes=False).serialize()
        pfx = _turtle_prefixes(ttl)
        assert "sdo" in pfx, "With flag off, original prefix 'sdo' must be preserved"

    def test_dce_normalised_to_dc(self, tmp_path):
        """dce → dc for http://purl.org/dc/elements/1.1/."""
        from linkml.generators.shaclgen import ShaclGenerator

        schema_path = _write_schema(tmp_path, SCHEMA_DCE)
        ttl = ShaclGenerator(schema_path, normalize_prefixes=True).serialize()
        pfx = _turtle_prefixes(ttl)
        assert "dc" in pfx, f"Expected 'dc' prefix in SHACL output, got: {sorted(pfx)}"
        assert pfx["dc"] == "http://purl.org/dc/elements/1.1/"
        assert "dce" not in pfx, "Non-standard 'dce' prefix should be removed"

    def test_custom_prefix_not_affected(self, tmp_path):
        """Domain-specific prefixes (e.g. 'ex') are not touched by normalisation.

        Note: rdflib only emits @prefix for namespaces used in triples.
        We verify graph bindings directly.
        """
        from linkml.generators.shaclgen import ShaclGenerator

        schema_path = _write_schema(tmp_path, SCHEMA_SDO)
        gen = ShaclGenerator(schema_path, normalize_prefixes=True)
        graph = gen.as_graph()
        bound = {str(p): str(n) for p, n in graph.namespaces()}
        assert "ex" in bound, f"Custom prefix 'ex' must survive in graph bindings, got: {sorted(bound)}"
        assert bound["ex"] == "https://example.org/"

    def test_http_schema_org_normalised(self, tmp_path):
        """http://schema.org/ (HTTP variant) also normalises to 'schema'."""
        from linkml.generators.shaclgen import ShaclGenerator

        schema_path = _write_schema(tmp_path, SCHEMA_HTTP_SDO)
        ttl = ShaclGenerator(schema_path, normalize_prefixes=True).serialize()
        pfx = _turtle_prefixes(ttl)
        assert "schema" in pfx, f"Expected 'schema' prefix for http://schema.org/, got: {sorted(pfx)}"
        assert "sdo" not in pfx

    def test_no_schema1_from_runtime_http_binding(self, tmp_path):
        """Runtime-injected ``schema: http://schema.org/`` must not create ``schema1``.

        Same scenario as the OWL test: linkml:types imports bring in
        ``schema: http://schema.org/`` while the user schema has
        ``sdo: https://schema.org/``. Phase 2 of normalisation must
        clean up the orphaned HTTP binding.
        """
        from linkml.generators.shaclgen import ShaclGenerator

        schema_path = _write_schema(tmp_path, SCHEMA_SDO)
        ttl = ShaclGenerator(schema_path, normalize_prefixes=True).serialize()
        pfx = _turtle_prefixes(ttl)
        suffixed = [p for p in pfx if re.match(r"schema\d+", p)]
        assert not suffixed, (
            f"Auto-generated suffixed prefix(es) {suffixed} found — "
            "runtime http://schema.org/ binding was not cleaned up"
        )


# ── JSON-LD Context Generator Tests ─────────────────────────────────


class TestContextNormalizePrefixes:
    """JSON-LD context generator prefix normalisation tests (supplements existing tests)."""

    def test_http_schema_org_normalised(self, tmp_path):
        """http://schema.org/ (HTTP variant) normalises to 'schema' in JSON-LD context.

        This covers the edge case where linkml-runtime's ``schema: http://schema.org/``
        conflicts with rdflib's ``schema: https://schema.org/``. The stale binding
        must be removed and replaced with the correct one.
        """
        from linkml.generators.jsonldcontextgen import ContextGenerator

        schema_path = _write_schema(tmp_path, SCHEMA_HTTP_SDO)
        ctx = json.loads(ContextGenerator(schema_path, normalize_prefixes=True).serialize())["@context"]
        assert "schema" in ctx, "HTTP schema.org should normalise to 'schema'"
        assert "sdo" not in ctx, "Non-standard 'sdo' should be removed"
        # The namespace URI must match the schema-declared one (http, not https)
        schema_val = ctx["schema"]
        if isinstance(schema_val, dict):
            schema_val = schema_val.get("@id", "")
        assert schema_val == "http://schema.org/", f"Namespace URI must be preserved: got {schema_val}"


# ── Static Prefix Map Tests ─────────────────────────────────────────


class TestWellKnownPrefixMap:
    """Tests for the frozen static prefix map."""

    def test_returns_dict(self):
        from linkml.utils.generator import well_known_prefix_map

        wk = well_known_prefix_map()
        assert isinstance(wk, dict)
        assert len(wk) >= 29, f"Expected ≥29 entries, got {len(wk)}"

    def test_schema_https(self):
        from linkml.utils.generator import well_known_prefix_map

        wk = well_known_prefix_map()
        assert wk["https://schema.org/"] == "schema"

    def test_schema_http_variant(self):
        """Both http and https schema.org must map to 'schema'."""
        from linkml.utils.generator import well_known_prefix_map

        wk = well_known_prefix_map()
        assert wk["http://schema.org/"] == "schema"

    def test_dc_elements(self):
        from linkml.utils.generator import well_known_prefix_map

        wk = well_known_prefix_map()
        assert wk["http://purl.org/dc/elements/1.1/"] == "dc"

    def test_returns_copy(self):
        """Callers should not be able to mutate the internal map."""
        from linkml.utils.generator import well_known_prefix_map

        wk1 = well_known_prefix_map()
        wk1["http://example.org/"] = "test"
        wk2 = well_known_prefix_map()
        assert "http://example.org/" not in wk2

    def test_matches_rdflib_defaults(self):
        """The static map must be a superset of rdflib's current defaults.

        This test documents the relationship: if rdflib adds new defaults in
        a future version, this test will flag them for inclusion.
        """
        from rdflib import Graph as RdfGraph

        from linkml.utils.generator import well_known_prefix_map

        wk = well_known_prefix_map()
        # NOTE(review): this checks namespace *coverage* only; prefix names are
        # asserted by the dedicated tests above.
        rdflib_map = {str(ns): str(pfx) for pfx, ns in RdfGraph().namespaces() if str(pfx)}
        missing = {ns: pfx for ns, pfx in rdflib_map.items() if ns not in wk}
        assert not missing, f"Static map missing rdflib defaults: {missing}"


# ── Cross-Generator Consistency Tests ───────────────────────────────


class TestCrossGeneratorConsistency:
    """Ensure all generators agree on prefix normalisation."""

    def test_all_generators_normalise_sdo_to_schema(self, tmp_path):
        """OWL, SHACL, and JSON-LD context must all use 'schema' for schema.org."""
        from linkml.generators.jsonldcontextgen import ContextGenerator
        from linkml.generators.owlgen import OwlSchemaGenerator
        from linkml.generators.shaclgen import ShaclGenerator

        schema_path = _write_schema(tmp_path, SCHEMA_SDO)

        owl_ttl = OwlSchemaGenerator(schema_path, normalize_prefixes=True).serialize()
        shacl_ttl = ShaclGenerator(schema_path, normalize_prefixes=True).serialize()
        ctx = json.loads(ContextGenerator(schema_path, normalize_prefixes=True).serialize())["@context"]

        owl_pfx = _turtle_prefixes(owl_ttl)
        shacl_pfx = _turtle_prefixes(shacl_ttl)

        assert "schema" in owl_pfx, "OWL must use 'schema'"
        assert "schema" in shacl_pfx, "SHACL must use 'schema'"
        assert "schema" in ctx, "JSON-LD context must use 'schema'"

        assert "sdo" not in owl_pfx, "OWL must not have 'sdo'"
        assert "sdo" not in shacl_pfx, "SHACL must not have 'sdo'"
        assert "sdo" not in ctx, "JSON-LD context must not have 'sdo'"


# ── Prefix Collision Tests ──────────────────────────────────────────


class TestPrefixCollision:
    """Collision: user claims the standard prefix name for a different namespace."""

    def test_owl_collision_skips_rename(self, tmp_path, caplog):
        """OWL: myfoaf must NOT be renamed to 'foaf' when user claims that name."""
        from linkml.generators.owlgen import OwlSchemaGenerator

        schema_path = _write_schema(tmp_path, SCHEMA_COLLISION)
        with caplog.at_level(logging.WARNING):
            gen = OwlSchemaGenerator(schema_path, normalize_prefixes=True)
            graph = gen.as_graph()
        bound = {str(p): str(n) for p, n in graph.namespaces()}
        # myfoaf must NOT have been renamed to 'foaf'
        assert "myfoaf" in bound, "Non-standard 'myfoaf' must remain when collision prevents renaming"
        assert bound["myfoaf"] == "http://xmlns.com/foaf/0.1/"
        # Warning emitted
        assert "collision" in caplog.text.lower(), f"Expected collision warning, got: {caplog.text}"

    def test_shacl_collision_skips_rename(self, tmp_path, caplog):
        """SHACL: myfoaf must NOT be renamed to 'foaf' when user claims that name."""
        from linkml.generators.shaclgen import ShaclGenerator

        schema_path = _write_schema(tmp_path, SCHEMA_COLLISION)
        with caplog.at_level(logging.WARNING):
            gen = ShaclGenerator(schema_path, normalize_prefixes=True)
            graph = gen.as_graph()
        bound = {str(p): str(n) for p, n in graph.namespaces()}
        assert "myfoaf" in bound, "Non-standard 'myfoaf' must remain when collision prevents renaming"
        assert bound["myfoaf"] == "http://xmlns.com/foaf/0.1/"
        assert "collision" in caplog.text.lower(), f"Expected collision warning, got: {caplog.text}"

    def test_context_collision_preserves_user_prefix(self, tmp_path, caplog):
        """JSON-LD: user's 'foaf: https://something-else.org/' must survive."""
        from linkml.generators.jsonldcontextgen import ContextGenerator

        schema_path = _write_schema(tmp_path, SCHEMA_COLLISION)
        with caplog.at_level(logging.WARNING):
            ctx = json.loads(ContextGenerator(schema_path, normalize_prefixes=True).serialize())["@context"]
        # User's 'foaf' binding preserved
        foaf_val = ctx.get("foaf")
        if isinstance(foaf_val, dict):
            foaf_val = foaf_val.get("@id", "")
        assert foaf_val == "https://something-else.org/", f"User's 'foaf' binding must be preserved, got: {foaf_val}"
        # myfoaf must remain (not renamed to foaf)
        assert "myfoaf" in ctx, "Non-standard 'myfoaf' must remain when collision prevents renaming"
        # Warning emitted
        assert "collision" in caplog.text.lower(), f"Expected collision warning, got: {caplog.text}"


# ── JSONLDGenerator Flag Forwarding Tests ───────────────────────────


class TestJSONLDGeneratorForwarding:
    """Verify JSONLDGenerator propagates flags to its embedded ContextGenerator."""

    def test_normalize_prefixes_forwarded(self, tmp_path):
        """JSONLDGenerator must pass normalize_prefixes to embedded ContextGenerator.

        Without forwarding, the inline @context in JSON-LD output would keep
        non-standard prefix aliases even when --normalize-prefixes is set.
        """
        from linkml.generators.jsonldgen import JSONLDGenerator

        schema_path = _write_schema(tmp_path, SCHEMA_SDO)
        out = JSONLDGenerator(schema_path, normalize_prefixes=True).serialize()
        parsed = json.loads(out)
        # The @context may be a list; find the dict entry
        ctx = parsed.get("@context", {})
        if isinstance(ctx, list):
            for item in ctx:
                if isinstance(item, dict):
                    ctx = item
                    break
        assert "sdo" not in ctx, "normalize_prefixes not forwarded: 'sdo' still in embedded @context"


# ── Phase 2 HTTP/HTTPS Overwrite Bug Tests ──────────────────────────


class TestPhase2HttpsPreservation:
    """Phase 2 must not overwrite Phase 1 HTTPS bindings with HTTP variants."""

    def test_phase2_does_not_overwrite_https_with_http(self, tmp_path):
        """When Phase 1 binds schema → https://schema.org/, Phase 2 must not
        overwrite it with http://schema.org/ from the runtime metamodel.

        Reproduction: linkml:types imports bring schema: http://schema.org/
        (HTTP) while the user schema has sdo: https://schema.org/ (HTTPS).
        Phase 1 normalises sdo → schema (HTTPS). Phase 2 must not then
        rebind schema → http://schema.org/ when it encounters the runtime
        HTTP binding.
        """
        from linkml.generators.owlgen import OwlSchemaGenerator

        schema_path = _write_schema(tmp_path, SCHEMA_SDO)
        gen = OwlSchemaGenerator(schema_path, normalize_prefixes=True)
        graph = gen.as_graph()
        bound = {str(p): str(n) for p, n in graph.namespaces()}
        assert "schema" in bound, f"Expected 'schema' in bindings, got: {sorted(bound)}"
        # MUST be HTTPS (from the user's schema), not HTTP (from runtime)
        assert bound["schema"] == "https://schema.org/", (
            f"Phase 2 overwrote HTTPS with HTTP: schema bound to {bound['schema']}"
        )

    def test_normalize_graph_prefixes_phase2_guard(self):
        """Direct unit test for the Phase 2 guard in normalize_graph_prefixes.

        Simulates the exact scenario: Phase 1 binds schema → https://schema.org/,
        then Phase 2 encounters schema1 → http://schema.org/ and must NOT rebind.
        """
        from rdflib import Graph, Namespace, URIRef

        from linkml.utils.generator import normalize_graph_prefixes

        g = Graph(bind_namespaces="none")
        # Simulate Phase 1 result
        g.bind("schema", Namespace("https://schema.org/"))
        # Simulate runtime-injected HTTP variant (would appear as schema1)
        g.bind("schema1", Namespace("http://schema.org/"))
        # Add a triple so the graph isn't empty
        g.add((URIRef("https://example.org/s"), URIRef("https://schema.org/name"), URIRef("https://example.org/o")))

        normalize_graph_prefixes(g, {"sdo": "https://schema.org/"})

        bound = {str(p): str(n) for p, n in g.namespaces()}
        assert bound.get("schema") == "https://schema.org/", (
            f"Phase 2 guard failed: schema bound to {bound.get('schema')}"
        )

    def test_empty_schema_no_crash(self, tmp_path):
        """A schema with no custom prefixes must not crash normalize_graph_prefixes."""
        from linkml.generators.owlgen import OwlSchemaGenerator

        (tmp_path / "empty.yaml").write_text(
            textwrap.dedent("""\
                id: https://example.org/empty
                name: empty
                default_prefix: ex
                prefixes:
                  linkml: https://w3id.org/linkml/
                  ex: https://example.org/
                imports:
                  - linkml:types
            """),
            encoding="utf-8",
        )
        # Should not raise
        gen = OwlSchemaGenerator(str(tmp_path / "empty.yaml"), normalize_prefixes=True)
        ttl = gen.serialize()
        assert len(ttl) > 0