diff --git a/packages/linkml/src/linkml/generators/common/subproperty.py b/packages/linkml/src/linkml/generators/common/subproperty.py index 4687c3821..9b136e242 100644 --- a/packages/linkml/src/linkml/generators/common/subproperty.py +++ b/packages/linkml/src/linkml/generators/common/subproperty.py @@ -15,6 +15,10 @@ CURIE_TYPES: frozenset[str] = frozenset({"uriorcurie", "curie"}) URI_TYPES: frozenset[str] = frozenset({"uri"}) +# Types whose XSD mapping is xsd:anyURI (not xsd:string). +# ``curie`` maps to xsd:string and is deliberately excluded. +_ANYURI_TYPES: frozenset[str] = frozenset({"uri", "uriorcurie"}) + def is_uri_range(sv: SchemaView, range_type: str | None) -> bool: """ @@ -63,6 +67,35 @@ def is_curie_range(sv: SchemaView, range_type: str | None) -> bool: return False +def is_xsd_anyuri_range(sv: SchemaView, range_type: str | None) -> bool: + """Check if range type resolves to ``xsd:anyURI``. + + Returns True for ``uri``, ``uriorcurie``, and types that inherit from them. + Returns False for ``curie`` (which maps to ``xsd:string``). + + This is the correct predicate for the ``--xsd-anyuri-as-iri`` flag: only + types whose XSD representation is ``xsd:anyURI`` should be promoted from + literal to IRI semantics. ``curie`` is a compact string representation + that resolves to ``xsd:string`` and must not be affected. + + :param sv: SchemaView for type ancestry lookup + :param range_type: The range type to check + :return: True if range type maps to xsd:anyURI + """ + if range_type is None: + return False + + if range_type in _ANYURI_TYPES: + return True + + if range_type in sv.all_types(): + type_ancestors = set(sv.type_ancestors(range_type)) + if type_ancestors & _ANYURI_TYPES: + return True + + return False + + def format_slot_value_for_range(sv: SchemaView, slot_name: str, range_type: str | None) -> str: """ Format slot value according to the declared range type. 
diff --git a/packages/linkml/src/linkml/generators/jsonldcontextgen.py b/packages/linkml/src/linkml/generators/jsonldcontextgen.py index 60eaa9ffd..101e773ff 100644 --- a/packages/linkml/src/linkml/generators/jsonldcontextgen.py +++ b/packages/linkml/src/linkml/generators/jsonldcontextgen.py @@ -15,7 +15,7 @@ from linkml._version import __version__ from linkml.utils.deprecation import deprecated_fields -from linkml.utils.generator import Generator, shared_arguments +from linkml.utils.generator import Generator, shared_arguments, well_known_prefix_map from linkml_runtime.linkml_model.meta import ClassDefinition, SlotDefinition from linkml_runtime.linkml_model.types import SHEX from linkml_runtime.utils.formatutils import camelcase, underscore @@ -23,6 +23,10 @@ URI_RANGES = (SHEX.nonliteral, SHEX.bnode, SHEX.iri) +# Extended URI_RANGES that also treats xsd:anyURI as an IRI reference (@id) +# rather than a typed literal. Opt-in via --xsd-anyuri-as-iri flag. +URI_RANGES_WITH_XSD = (*URI_RANGES, XSD.anyURI) + ENUM_CONTEXT = { "text": "skos:notation", "description": "skos:prefLabel", @@ -56,8 +60,28 @@ class ContextGenerator(Generator): fix_multivalue_containers: bool | None = False exclude_imports: bool = False """If True, elements from imported schemas won't be included in the generated context""" + exclude_external_imports: bool = False + """If True, elements from URL-based external vocabulary imports are excluded. + + Local file imports and linkml standard imports are kept. This is useful + when extending an external ontology (e.g. W3C Verifiable Credentials) + whose terms are ``@protected`` in their own JSON-LD context — redefining + them locally would violate JSON-LD 1.1 §4.1.11. + + Note: this flag has no effect when ``mergeimports=False`` because + non-local elements are already absent from the visitor iteration + in that mode. 
+ """ _local_classes: set | None = field(default=None, repr=False) _local_slots: set | None = field(default=None, repr=False) + _external_classes: set | None = field(default=None, repr=False) + _external_slots: set | None = field(default=None, repr=False) + xsd_anyuri_as_iri: bool = False + """Map xsd:anyURI-typed ranges (uri, uriorcurie) to ``@type: @id`` instead of ``@type: xsd:anyURI``. + + This aligns the JSON-LD context with the SHACL generator, which emits + ``sh:nodeKind sh:IRI`` for the same types. + """ # Framing (opt-in via CLI flag) emit_frame: bool = False @@ -66,10 +90,13 @@ class ContextGenerator(Generator): frame_root: str | None = None def __post_init__(self) -> None: + # Must be set before super().__post_init__() because the parent triggers + # the visitor pattern (visit_schema), which accesses _prefix_remap. + self._prefix_remap: dict[str, str] = {} super().__post_init__() if self.namespaces is None: raise TypeError("Schema text must be supplied to context generator. Preparsed schema will not work") - if self.exclude_imports: + if self.exclude_imports or self.exclude_external_imports: if self.schemaview: sv = self.schemaview else: @@ -77,11 +104,40 @@ def __post_init__(self) -> None: if isinstance(source, str) and self.base_dir and not Path(source).is_absolute(): source = str(Path(self.base_dir) / source) sv = SchemaView(source, importmap=self.importmap, base_dir=self.base_dir) - self._local_classes = set(sv.all_classes(imports=False).keys()) - self._local_slots = set(sv.all_slots(imports=False).keys()) + if self.exclude_imports: + self._local_classes = set(sv.all_classes(imports=False).keys()) + self._local_slots = set(sv.all_slots(imports=False).keys()) + if self.exclude_external_imports: + self._external_classes, self._external_slots = self._collect_external_elements(sv) + + @staticmethod + def _collect_external_elements(sv: SchemaView) -> tuple[set[str], set[str]]: + """Identify classes and slots from URL-based external vocabulary imports. 
+ + Walks the SchemaView ``schema_map`` (populated by ``imports_closure``) + and collects element names from schemas whose import key starts with + ``http://`` or ``https://``. Local file imports and ``linkml:`` + standard imports are left untouched. + """ + sv.imports_closure() + external_classes: set[str] = set() + external_slots: set[str] = set() + for schema_key, schema_def in sv.schema_map.items(): + if schema_key == sv.schema.name: + continue + if schema_key.startswith("http://") or schema_key.startswith("https://"): + external_classes.update(schema_def.classes.keys()) + external_slots.update(schema_def.slots.keys()) + return external_classes, external_slots + + def add_prefix(self, ncname: str) -> None: + """Add a prefix, applying well-known prefix normalisation when enabled.""" + super().add_prefix(self._prefix_remap.get(ncname, ncname)) def visit_schema(self, base: str | Namespace | None = None, output: str | None = None, **_): - # Add any explicitly declared prefixes + # Add any explicitly declared prefixes. + # Direct .add() is safe here: the normalisation block below explicitly + # rewrites emit_prefixes entries for any renamed prefixes (Cases 1-3). for prefix in self.schema.prefixes.values(): self.emit_prefixes.add(prefix.prefix_prefix) @@ -89,6 +145,68 @@ def visit_schema(self, base: str | Namespace | None = None, output: str | None = for pfx in self.schema.emit_prefixes: self.add_prefix(pfx) + # Normalise well-known prefix names when --normalize-prefixes is set. + # If the schema declares a non-standard alias for a namespace that has + # a well-known standard name (e.g. ``sdo`` for + # ``https://schema.org/``), replace the alias with the standard name + # so that generated JSON-LD contexts use the conventional prefix. + # + # Three cases are handled: + # 1. Standard prefix is not yet bound → just rebind from old to new. + # 2. Standard prefix is bound to a *different* URI: + # a. User-declared (in schema.prefixes) → collision, skip with warning. 
+ # b. Runtime default (e.g. linkml-runtime's ``schema: http://…``) + # → remove stale binding, then rebind. + # 3. Standard prefix is already bound to the *same* URI (duplicate) + # → just drop the non-standard alias. + # + # A remap dict is stored for ``_build_element_id`` because + # ``prefix_suffix()`` splits CURIEs on ``:`` without looking up the + # namespace dict. + self._prefix_remap.clear() + if self.normalize_prefixes: + wk = well_known_prefix_map() + for old_pfx in list(self.namespaces): + url = str(self.namespaces[old_pfx]) + std_pfx = wk.get(url) + if not std_pfx or std_pfx == old_pfx: + continue + if std_pfx in self.namespaces: + if str(self.namespaces[std_pfx]) != url: + # Case 2: std_pfx is bound to a different URI. + # If the user explicitly declared std_pfx in the schema, + # it is intentional — skip to avoid data loss. + if std_pfx in self.schema.prefixes: + self.logger.warning( + "Prefix collision: cannot rename '%s' to '%s' because '%s' is " + "already declared for <%s>; skipping normalisation for <%s>", + old_pfx, + std_pfx, + std_pfx, + str(self.namespaces[std_pfx]), + url, + ) + continue + # Not user-declared (e.g. 
linkml-runtime default) — safe to remove + self.emit_prefixes.discard(std_pfx) + del self.namespaces[std_pfx] + else: + # Case 3: standard prefix already bound to same URI + # — just drop the non-standard alias + del self.namespaces[old_pfx] + if old_pfx in self.emit_prefixes: + self.emit_prefixes.discard(old_pfx) + self.emit_prefixes.add(std_pfx) + self._prefix_remap[old_pfx] = std_pfx + continue + # Case 1 (or Case 2 after stale removal): bind standard name + self.namespaces[std_pfx] = self.namespaces[old_pfx] + del self.namespaces[old_pfx] + if old_pfx in self.emit_prefixes: + self.emit_prefixes.discard(old_pfx) + self.emit_prefixes.add(std_pfx) + self._prefix_remap[old_pfx] = std_pfx + # Add the default prefix if self.schema.default_prefix: dflt = self.namespaces.prefix_for(self.schema.default_prefix) @@ -96,6 +214,8 @@ def visit_schema(self, base: str | Namespace | None = None, output: str | None = self.default_ns = dflt if self.default_ns: default_uri = self.namespaces[self.default_ns] + # Direct .add() is safe: default_ns is already resolved from + # the (possibly normalised) namespace bindings above. self.emit_prefixes.add(self.default_ns) else: default_uri = self.schema.default_prefix @@ -189,11 +309,67 @@ def end_schema( with open(frame_path, "w", encoding="UTF-8") as f: json.dump(frame, f, indent=2, ensure_ascii=False) + if self.deterministic: + return self._deterministic_context_json(json.loads(str(as_json(context))), indent=3) + "\n" return str(as_json(context)) + "\n" + @staticmethod + def _deterministic_context_json(data: dict, indent: int = 3) -> str: + """Serialize a JSON-LD context with deterministic key ordering. + + Preserves the conventional JSON-LD context structure: + 1. ``comments`` block first (metadata) + 2. ``@context`` block second, with: + a. ``@``-prefixed directives (``@vocab``, ``@base``) first + b. Prefix declarations (string values) second + c. Class/property term entries (object values) last + 3. 
Each group sorted alphabetically within itself + + Unlike :func:`deterministic_json`, this understands JSON-LD + conventions so that the output remains human-readable while + still being byte-identical across invocations. + """ + from linkml.utils.generator import deterministic_json + + ordered = {} + + # 1. "comments" first (if present) + if "comments" in data: + ordered["comments"] = data["comments"] + + # 2. "@context" with structured internal ordering + if "@context" in data: + ctx = data["@context"] + ordered_ctx = {} + + # 2a. @-prefixed directives (@vocab, @base, etc.) + for k in sorted(k for k in ctx if k.startswith("@")): + ordered_ctx[k] = ctx[k] + + # 2b. Prefix declarations (string values — short namespace URIs) + for k in sorted(k for k in ctx if not k.startswith("@") and isinstance(ctx[k], str)): + ordered_ctx[k] = ctx[k] + + # 2c. Term definitions (object values) — deep-sorted for determinism + term_entries = {k: v for k, v in ctx.items() if not k.startswith("@") and not isinstance(v, str)} + sorted_terms = json.loads(deterministic_json(term_entries)) + for k in sorted(sorted_terms): + ordered_ctx[k] = sorted_terms[k] + + ordered["@context"] = ordered_ctx + + # 3. Any remaining top-level keys + for k in sorted(data): + if k not in ordered: + ordered[k] = data[k] + + return json.dumps(ordered, indent=indent, ensure_ascii=False) + def visit_class(self, cls: ClassDefinition) -> bool: if self.exclude_imports and cls.name not in self._local_classes: return False + if self.exclude_external_imports and cls.name in self._external_classes: + return False class_def = {} cn = camelcase(cls.name) @@ -224,6 +400,7 @@ def _literal_coercion_for_ranges(self, ranges: list[str]) -> tuple[bool, str | N and "could not resolve safely because the branches disagree". 
""" coercions: set[str | None] = set() + uri_ranges = URI_RANGES_WITH_XSD if self.xsd_anyuri_as_iri else URI_RANGES for range_name in ranges: if range_name not in self.schema.types: continue @@ -232,7 +409,7 @@ def _literal_coercion_for_ranges(self, ranges: list[str]) -> tuple[bool, str | N range_uri = self.namespaces.uri_for(range_type.uri) if range_uri == XSD.string: coercions.add(None) - elif range_uri in URI_RANGES: + elif range_uri in uri_ranges: coercions.add("@id") else: coercions.add(range_type.uri) @@ -246,6 +423,8 @@ def _literal_coercion_for_ranges(self, ranges: list[str]) -> tuple[bool, str | N def visit_slot(self, aliased_slot_name: str, slot: SlotDefinition) -> None: if self.exclude_imports and slot.name not in self._local_slots: return + if self.exclude_external_imports and slot.name in self._external_slots: + return if slot.identifier: slot_def = "@id" @@ -275,9 +454,10 @@ def visit_slot(self, aliased_slot_name: str, slot: SlotDefinition) -> None: self.emit_prefixes.add(skos) else: range_type = self.schema.types[slot.range] + uri_ranges = URI_RANGES_WITH_XSD if self.xsd_anyuri_as_iri else URI_RANGES if self.namespaces.uri_for(range_type.uri) == XSD.string: pass - elif self.namespaces.uri_for(range_type.uri) in URI_RANGES: + elif self.namespaces.uri_for(range_type.uri) in uri_ranges: slot_def["@type"] = "@id" else: slot_def["@type"] = range_type.uri @@ -310,6 +490,11 @@ def _build_element_id(self, definition: Any, uri: str) -> None: @return: None """ uri_prefix, uri_suffix = self.namespaces.prefix_suffix(uri) + # Apply well-known prefix normalisation (e.g. sdo → schema). + # prefix_suffix() splits CURIEs on ':' without checking the + # namespace dict, so it may return a stale alias. 
+ if uri_prefix and uri_prefix in self._prefix_remap: + uri_prefix = self._prefix_remap[uri_prefix] is_default_namespace = uri_prefix == self.context_body["@vocab"] or uri_prefix == self.namespaces.prefix_for( self.context_body["@vocab"] ) @@ -390,6 +575,19 @@ def serialize( help="Use --exclude-imports to exclude imported elements from the generated JSON-LD context. This is useful when " "extending an ontology whose terms already have context definitions in their own JSON-LD context file.", ) +@click.option( + "--exclude-external-imports/--no-exclude-external-imports", + default=False, + show_default=True, + help="Exclude elements from URL-based external vocabulary imports while keeping local file imports. " + "Useful when extending ontologies (e.g. W3C VC v2) whose terms are @protected in their own JSON-LD context.", +) +@click.option( + "--xsd-anyuri-as-iri/--no-xsd-anyuri-as-iri", + default=False, + show_default=True, + help="Map xsd:anyURI-typed ranges (uri, uriorcurie) to @type: @id instead of @type: xsd:anyURI.", +) @click.version_option(__version__, "-V", "--version") def cli(yamlfile, emit_frame, embed_context_in_frame, output, **args): """Generate jsonld @context definition from LinkML model""" diff --git a/packages/linkml/src/linkml/generators/jsonldgen.py b/packages/linkml/src/linkml/generators/jsonldgen.py index c974e762d..0b58aec23 100644 --- a/packages/linkml/src/linkml/generators/jsonldgen.py +++ b/packages/linkml/src/linkml/generators/jsonldgen.py @@ -1,5 +1,6 @@ """Generate JSONld from a LinkML schema.""" +import json import os from collections.abc import Sequence from copy import deepcopy @@ -178,6 +179,11 @@ def end_schema(self, context: str | Sequence[str] | None = None, context_kwargs: # TODO: The _visit function above alters the schema in situ # force some context_kwargs context_kwargs["metadata"] = False + # Forward generator flags so prefix normalisation and deterministic + # output propagate into the inline @context produced for JSON-LD. 
+ for flag in ("normalize_prefixes", "deterministic"): + if hasattr(self, flag): + context_kwargs.setdefault(flag, getattr(self, flag)) add_prefixes = ContextGenerator(self.original_schema, **context_kwargs).serialize() add_prefixes_json = loads(add_prefixes) metamodel_ctx = self.metamodel_context or METAMODEL_CONTEXT_URI @@ -202,6 +208,10 @@ def end_schema(self, context: str | Sequence[str] | None = None, context_kwargs: self.schema["@context"].append({"@base": base_prefix}) # json_obj["@id"] = self.schema.id out = str(as_json(self.schema, indent=" ")) + "\n" + if self.deterministic: + from linkml.utils.generator import deterministic_json + + out = deterministic_json(json.loads(out), indent=2) + "\n" self.schema = self.original_schema return out diff --git a/packages/linkml/src/linkml/generators/owlgen.py b/packages/linkml/src/linkml/generators/owlgen.py index 33c58b0ec..22f04eba0 100644 --- a/packages/linkml/src/linkml/generators/owlgen.py +++ b/packages/linkml/src/linkml/generators/owlgen.py @@ -19,8 +19,9 @@ from linkml import METAMODEL_NAMESPACE_NAME from linkml._version import __version__ +from linkml.generators.common.subproperty import is_xsd_anyuri_range from linkml.utils.deprecation import deprecation_warning -from linkml.utils.generator import Generator, shared_arguments +from linkml.utils.generator import Generator, normalize_graph_prefixes, shared_arguments from linkml_runtime import SchemaView from linkml_runtime.linkml_model.meta import ( AnonymousClassExpression, @@ -41,6 +42,7 @@ ) from linkml_runtime.utils.formatutils import camelcase, underscore from linkml_runtime.utils.introspection import package_schemaview +from linkml_runtime.utils.yamlutils import YAMLRoot logger = logging.getLogger(__name__) @@ -50,6 +52,21 @@ SWRLB = rdflib.Namespace("http://www.w3.org/2003/11/swrlb#") +def _expression_sort_key(expr: YAMLRoot) -> str: + """Return a stable sort key for LinkML anonymous expressions. 
+ + Used by ``--deterministic`` to order ``any_of``, ``all_of``, + ``none_of``, and ``exactly_one_of`` members reproducibly. + + This relies on ``YAMLRoot.__repr__()`` which formats objects using + their **field values** (not memory addresses). All anonymous + expression dataclasses in ``linkml_runtime.linkml_model.meta`` + use ``@dataclass(repr=False)`` and inherit this field-based repr, + so the output is deterministic across runs. + """ + return repr(expr) + + @unique class MetadataProfile(Enum): """ @@ -201,7 +218,29 @@ class OwlSchemaGenerator(Generator): one direct ``is_a`` child, the generator adds ``AbstractClass rdfs:subClassOf (Child1 or Child2 or …)``, expressing the open-world covering constraint that every instance of the abstract class must also be an instance of one of its - direct subclasses.""" + direct subclasses. + + .. note:: A warning is emitted when an abstract class has no children (no axiom generated) + or only one child (covering axiom degenerates to equivalence Parent ≡ Child). + Use this flag to suppress covering axioms entirely if equivalence is undesired.""" + + xsd_anyuri_as_iri: bool = False + """Treat ``range: uri`` / ``range: uriorcurie`` slots as ``owl:ObjectProperty`` + instead of ``owl:DatatypeProperty`` with ``rdfs:range xsd:anyURI``. + + This aligns the OWL output with the SHACL generator (which emits + ``sh:nodeKind sh:IRI``) and the JSON-LD context generator (which emits + ``@type: @id`` when its own ``--xsd-anyuri-as-iri`` flag is set). + + Without this flag, ``range: uri`` produces a semantic inconsistency: + OWL says the value is a literal (``DatatypeProperty``), while SHACL and + JSON-LD say it is an IRI node. Enabling the flag makes all three + generators consistent. 
+ + When enabled, URI-range slots: + - become ``owl:ObjectProperty`` (not ``owl:DatatypeProperty``) + - have no ``rdfs:range`` restriction (any IRI is valid) + """ def as_graph(self) -> Graph: """ @@ -233,6 +272,10 @@ def as_graph(self) -> Graph: self.graph.bind(prefix, self.metamodel.namespaces[prefix]) for pfx in schema.prefixes.values(): self.graph.namespace_manager.bind(pfx.prefix_prefix, URIRef(pfx.prefix_reference)) + if self.normalize_prefixes: + normalize_graph_prefixes( + graph, {str(v.prefix_prefix): str(v.prefix_reference) for v in schema.prefixes.values()} + ) graph.add((base, RDF.type, OWL.Ontology)) # Add main schema elements @@ -267,7 +310,14 @@ def serialize(self, **kwargs) -> str: :return: """ self.as_graph() - data = self.graph.serialize(format="turtle" if self.format in ["owl", "ttl"] else self.format) + fmt = "turtle" if self.format in ["owl", "ttl"] else self.format + if self.deterministic and fmt == "turtle": + # Deferred to avoid circular import (generator.py imports from this package) + from linkml.utils.generator import deterministic_turtle + + data = deterministic_turtle(self.graph) + else: + data = self.graph.serialize(format=fmt) return data def add_metadata(self, e: Definition | PermissibleValue, uri: URIRef) -> None: @@ -471,6 +521,26 @@ def condition_to_bnode(expr: AnonymousClassExpression) -> BNode | None: # must be an instance of at least one of its direct subclasses. if cls.abstract and not self.skip_abstract_class_as_unionof_subclasses: children = sorted(sv.class_children(cls.name, imports=self.mergeimports, mixins=False, is_a=True)) + if not children: + logger.warning( + "Abstract class '%s' has no children. No covering axiom will be generated.", + cls.name, + ) + elif len(children) == 1: + # Warn: with one child C, the covering axiom degenerates to + # Parent ⊑ C which, combined with C ⊑ Parent (from is_a), + # creates Parent ≡ C (equivalence). 
This is semantically + # correct per OWL 2 but may be surprising for extensible + # ontologies where more children are added later. + logger.warning( + "Abstract class '%s' has only 1 direct child ('%s'). " + "The covering axiom makes them equivalent (%s ≡ %s). " + "Use --skip-abstract-class-as-unionof-subclasses to suppress.", + cls.name, + children[0], + cls.name, + children[0], + ) if children: child_uris = [self._class_uri(child) for child in children] union_node = self._union_of(child_uris) @@ -536,9 +606,15 @@ def transform_class_expression( own_slots = self.get_own_slots(cls) owl_exprs = [] if cls.any_of: - owl_exprs.append(self._union_of([self.transform_class_expression(x) for x in cls.any_of])) + members = list(cls.any_of) + if self.deterministic: + members = sorted(members, key=_expression_sort_key) + owl_exprs.append(self._union_of([self.transform_class_expression(x) for x in members])) if cls.exactly_one_of: - sub_exprs = [self.transform_class_expression(x) for x in cls.exactly_one_of] + members = list(cls.exactly_one_of) + if self.deterministic: + members = sorted(members, key=_expression_sort_key) + sub_exprs = [self.transform_class_expression(x) for x in members] if isinstance(cls, ClassDefinition): cls_uri = self._class_uri(cls.name) listnode = BNode() @@ -546,17 +622,23 @@ def transform_class_expression( graph.add((cls_uri, OWL.disjointUnionOf, listnode)) else: sub_sub_exprs = [] - for i, x in enumerate(cls.exactly_one_of): - rest = cls.exactly_one_of[0:i] + cls.exactly_one_of[i + 1 :] + for i, x in enumerate(members): + rest = members[0:i] + members[i + 1 :] neg_expr = self._complement_of_union_of([self.transform_class_expression(nx) for nx in rest]) pos_expr = self._intersection_of([self.transform_class_expression(x), neg_expr]) sub_sub_exprs.append(pos_expr) owl_exprs.append(self._union_of(sub_sub_exprs)) # owl_exprs.extend(sub_exprs) if cls.all_of: - owl_exprs.append(self._intersection_of([self.transform_class_expression(x) for x in 
cls.all_of])) + members = list(cls.all_of) + if self.deterministic: + members = sorted(members, key=_expression_sort_key) + owl_exprs.append(self._intersection_of([self.transform_class_expression(x) for x in members])) if cls.none_of: - owl_exprs.append(self._complement_of_union_of([self.transform_class_expression(x) for x in cls.none_of])) + members = list(cls.none_of) + if self.deterministic: + members = sorted(members, key=_expression_sort_key) + owl_exprs.append(self._complement_of_union_of([self.transform_class_expression(x) for x in members])) for slot in own_slots: if slot.name: owltypes = self.slot_node_owltypes(sv.get_slot(slot.name), owning_class=cls) @@ -709,27 +791,37 @@ def transform_class_slot_expression( owl_exprs.append(self.transform_class_slot_expression(cls, slot.all_members, main_slot, owl_types)) if slot.any_of: + members = list(slot.any_of) + if self.deterministic: + members = sorted(members, key=_expression_sort_key) owl_exprs.append( - self._union_of( - [self.transform_class_slot_expression(cls, x, main_slot, owl_types) for x in slot.any_of] - ) + self._union_of([self.transform_class_slot_expression(cls, x, main_slot, owl_types) for x in members]) ) if slot.all_of: + members = list(slot.all_of) + if self.deterministic: + members = sorted(members, key=_expression_sort_key) owl_exprs.append( self._intersection_of( - [self.transform_class_slot_expression(cls, x, main_slot, owl_types) for x in slot.all_of] + [self.transform_class_slot_expression(cls, x, main_slot, owl_types) for x in members] ) ) if slot.none_of: + members = list(slot.none_of) + if self.deterministic: + members = sorted(members, key=_expression_sort_key) owl_exprs.append( self._complement_of_union_of( - [self.transform_class_slot_expression(cls, x, main_slot, owl_types) for x in slot.none_of] + [self.transform_class_slot_expression(cls, x, main_slot, owl_types) for x in members] ) ) if slot.exactly_one_of: + members = list(slot.exactly_one_of) + if self.deterministic: + members 
= sorted(members, key=_expression_sort_key) disj_exprs = [] - for i, operand in enumerate(slot.exactly_one_of): - rest = slot.exactly_one_of[0:i] + slot.exactly_one_of[i + 1 :] + for i, operand in enumerate(members): + rest = members[0:i] + members[i + 1 :] neg_expr = self._complement_of_union_of( [self.transform_class_slot_expression(cls, x, main_slot, owl_types) for x in rest], owl_types=owl_types, @@ -746,14 +838,19 @@ def transform_class_slot_expression( this_owl_types = set() if range: if range in sv.all_types(imports=True): - self.slot_is_literal_map[main_slot.name].add(True) - this_owl_types.add(RDFS.Literal) - typ = sv.get_type(range) - if self.type_objects: - # TODO - owl_exprs.append(self._type_uri(typ.name)) + if self.xsd_anyuri_as_iri and is_xsd_anyuri_range(sv, range): + # xsd:anyURI ranges become ObjectProperty with no rdfs:range + self.slot_is_literal_map[main_slot.name].add(False) + this_owl_types.add(OWL.Thing) else: - owl_exprs.append(self._type_uri(typ.name)) + self.slot_is_literal_map[main_slot.name].add(True) + this_owl_types.add(RDFS.Literal) + typ = sv.get_type(range) + if self.type_objects: + # TODO + owl_exprs.append(self._type_uri(typ.name)) + else: + owl_exprs.append(self._type_uri(typ.name)) elif range in sv.all_enums(imports=True): # TODO: enums fill this in owl_exprs.append(self._enum_uri(EnumDefinitionName(range))) @@ -998,7 +1095,10 @@ def add_enum(self, e: EnumDefinition) -> None: owl_types = [] enum_owl_type = self._get_metatype(e, self.default_permissible_value_type) - for pv in e.permissible_values.values(): + pvs = e.permissible_values.values() + if self.deterministic: + pvs = sorted(pvs, key=lambda x: x.text) + for pv in pvs: pv_owl_type = self._get_metatype(pv, enum_owl_type) owl_types.append(pv_owl_type) if pv_owl_type == RDFS.Literal: @@ -1330,8 +1430,9 @@ def _boolean_expression( def _range_is_datatype(self, slot: SlotDefinition) -> bool: if self.type_objects: return False - else: - return slot.range in self.schema.types + 
if self.xsd_anyuri_as_iri and is_xsd_anyuri_range(self.schemaview, slot.range): + return False + return slot.range in self.schema.types def _range_uri(self, slot: SlotDefinition) -> URIRef: if slot.range in self.schema.types: @@ -1450,6 +1551,8 @@ def slot_owl_type(self, slot: SlotDefinition) -> URIRef: elif range in sv.all_enums(): return OWL.ObjectProperty elif range in sv.all_types(): + if self.xsd_anyuri_as_iri and is_xsd_anyuri_range(sv, range): + return OWL.ObjectProperty return OWL.DatatypeProperty else: raise Exception(f"Unknown range: {slot.range}") @@ -1569,7 +1672,19 @@ def slot_owl_type(self, slot: SlotDefinition) -> URIRef: show_default=True, help=( "If true, suppress rdfs:subClassOf owl:unionOf(subclasses) covering axioms for abstract classes. " - "By default such axioms are emitted for every abstract class that has direct is_a children." + "By default such axioms are emitted for every abstract class that has direct is_a children. " + "Note: warnings are emitted for abstract classes with zero children (no axiom) or one child (equivalence)." + ), +) +@click.option( + "--xsd-anyuri-as-iri/--no-xsd-anyuri-as-iri", + default=False, + show_default=True, + help=( + "Treat range: uri / range: uriorcurie slots as owl:ObjectProperty (IRI node) " + "instead of owl:DatatypeProperty with rdfs:range xsd:anyURI (literal). " + "Aligns OWL output with the SHACL generator (sh:nodeKind sh:IRI) and " + "the JSON-LD context generator (--xsd-anyuri-as-iri → @type: @id)." 
), ) @click.version_option(__version__, "-V", "--version") diff --git a/packages/linkml/src/linkml/generators/shaclgen.py b/packages/linkml/src/linkml/generators/shaclgen.py index 5425051e3..7d5bb6c8f 100644 --- a/packages/linkml/src/linkml/generators/shaclgen.py +++ b/packages/linkml/src/linkml/generators/shaclgen.py @@ -13,7 +13,7 @@ from linkml.generators.common.subproperty import get_subproperty_values, is_uri_range from linkml.generators.shacl.shacl_data_type import ShaclDataType from linkml.generators.shacl.shacl_ifabsent_processor import ShaclIfAbsentProcessor -from linkml.utils.generator import Generator, shared_arguments +from linkml.utils.generator import Generator, normalize_graph_prefixes, shared_arguments from linkml_runtime.linkml_model.meta import ClassDefinition, ElementName from linkml_runtime.utils.formatutils import underscore from linkml_runtime.utils.yamlutils import TypedNode, extended_float, extended_int, extended_str @@ -93,7 +93,13 @@ def generate_header(self) -> str: def serialize(self, **args) -> str: g = self.as_graph() - data = g.serialize(format="turtle" if self.format in ["owl", "ttl"] else self.format) + fmt = "turtle" if self.format in ["owl", "ttl"] else self.format + if self.deterministic and fmt == "turtle": + from linkml.utils.generator import deterministic_turtle + + data = deterministic_turtle(g) + else: + data = g.serialize(format=fmt) return data def as_graph(self) -> Graph: @@ -105,6 +111,10 @@ def as_graph(self) -> Graph: for pfx in self.schema.prefixes.values(): g.bind(str(pfx.prefix_prefix), pfx.prefix_reference) + if self.normalize_prefixes: + normalize_graph_prefixes( + g, {str(v.prefix_prefix): str(v.prefix_reference) for v in self.schema.prefixes.values()} + ) for c in sv.all_classes(imports=not self.exclude_imports).values(): @@ -309,13 +319,13 @@ def _add_enum(self, g: Graph, func: Callable, r: ElementName) -> None: sv = self.schemaview enum = sv.get_enum(r) pv_node = BNode() + pv_items = 
list(enum.permissible_values.items()) + if self.deterministic: + pv_items = sorted(pv_items, key=lambda x: x[0]) Collection( g, pv_node, - [ - URIRef(sv.expand_curie(pv.meaning)) if pv.meaning else Literal(pv_name) - for pv_name, pv in enum.permissible_values.items() - ], + [URIRef(sv.expand_curie(pv.meaning)) if pv.meaning else Literal(pv_name) for pv_name, pv in pv_items], ) func(SH["in"], pv_node) @@ -469,7 +479,10 @@ def collect_child_properties(class_name: str, output: set) -> None: list_node = BNode() ignored_properties.add(RDF.type) - Collection(g, list_node, list(ignored_properties)) + props = list(ignored_properties) + if self.deterministic: + props = sorted(props, key=str) + Collection(g, list_node, props) return list_node diff --git a/packages/linkml/src/linkml/utils/generator.py b/packages/linkml/src/linkml/utils/generator.py index 88fc48585..548ca7cbf 100644 --- a/packages/linkml/src/linkml/utils/generator.py +++ b/packages/linkml/src/linkml/utils/generator.py @@ -20,11 +20,12 @@ import os import re import sys +import types from collections.abc import Callable, Mapping from dataclasses import dataclass, field from functools import lru_cache from pathlib import Path -from typing import ClassVar, TextIO, Union, cast +from typing import TYPE_CHECKING, ClassVar, TextIO, Union, cast import click from click import Argument, Command, Option @@ -37,6 +38,10 @@ from linkml.utils.schemaloader import SchemaLoader from linkml.utils.typereferences import References from linkml_runtime import SchemaView + +if TYPE_CHECKING: + from rdflib import Graph as RdfGraph + from linkml_runtime.linkml_model.meta import ( ClassDefinition, ClassDefinitionName, @@ -58,6 +63,9 @@ from linkml_runtime.utils.formatutils import camelcase, underscore from linkml_runtime.utils.namespaces import Namespaces +if TYPE_CHECKING: + from rdflib import Graph + logger = logging.getLogger(__name__) @@ -78,6 +86,410 @@ def _resolved_metamodel(mergeimports): return metamodel +def _wl_signatures( + 
quads: list, + iterations: int = 4, +) -> dict[str, str]: + """Compute Weisfeiler-Lehman structural signatures for blank nodes. + + Uses 1-dimensional WL colour refinement [1]_ to assign each blank + node a deterministic signature derived from its multi-hop + neighbourhood structure. The signature depends only on predicate + IRIs, literal values, and named-node IRIs — **not** on blank-node + identifiers — so it remains stable when unrelated triples are added + or removed. + + Parameters + ---------- + quads : list + Canonical quads from pyoxigraph (after RDFC-1.0). + iterations : int + Number of WL refinement rounds (default 4). + + Returns + ------- + dict[str, str] + Mapping from canonical blank-node ID (e.g. ``c14n42``) to a + truncated SHA-256 hash suitable for use as a stable blank-node + label. + + References + ---------- + .. [1] Weisfeiler, B. & Leman, A. (1968). "The reduction of a graph + to canonical form and the algebra which appears therein." + """ + import hashlib + + import pyoxigraph # guaranteed available — caller (deterministic_turtle) checks + + # Collect all blank node IDs and build adjacency index. + bnode_ids: set[str] = set() + # outgoing[b] = list of (predicate_str, object_str_or_bnode_id, is_bnode) + outgoing: dict[str, list[tuple[str, str, bool]]] = {} + # incoming[b] = list of (subject_str_or_bnode_id, predicate_str, is_bnode) + incoming: dict[str, list[tuple[str, str, bool]]] = {} + + for q in quads: + s, p, o = q.subject, q.predicate, q.object + s_is_bn = isinstance(s, pyoxigraph.BlankNode) + o_is_bn = isinstance(o, pyoxigraph.BlankNode) + p_str = str(p) + + if s_is_bn: + bnode_ids.add(s.value) + outgoing.setdefault(s.value, []).append((p_str, o.value if o_is_bn else str(o), o_is_bn)) + if o_is_bn: + bnode_ids.add(o.value) + incoming.setdefault(o.value, []).append((s.value if s_is_bn else str(s), p_str, s_is_bn)) + + # Initialise signatures: named-node edges only (no bnode IDs). 
+ sig: dict[str, str] = {} + for bid in bnode_ids: + parts = [] + for p_str, o_str, o_is_bn in outgoing.get(bid, []): + if not o_is_bn: + parts.append(f"+{p_str}={o_str}") + for s_str, p_str, s_is_bn in incoming.get(bid, []): + if not s_is_bn: + parts.append(f"-{s_str}={p_str}") + sig[bid] = "|".join(sorted(parts)) + + # Iterative refinement: incorporate neighbour signatures. + for _ in range(iterations): + new_sig: dict[str, str] = {} + for bid in bnode_ids: + parts = [sig[bid]] + for p_str, o_str, o_is_bn in outgoing.get(bid, []): + if o_is_bn: + parts.append(f"+{p_str}={sig.get(o_str, '')}") + for s_str, p_str, s_is_bn in incoming.get(bid, []): + if s_is_bn: + parts.append(f"-{sig.get(s_str, '')}={p_str}") + new_sig[bid] = "|".join(sorted(parts)) + sig = new_sig + + # Convert signatures to truncated SHA-256 hashes. + # Use 12 hex chars (48 bits) — birthday-bound collision probability + # is ~n²/2^49: ~0.002% at 100k nodes. Collisions are handled by + # appending a counter (see below), so correctness is preserved. + hash_map: dict[str, str] = {} + seen_hashes: dict[str, int] = {} + for bid in sorted(bnode_ids): + digest = hashlib.sha256(sig[bid].encode("utf-8")).hexdigest()[:12] + # Handle collisions by appending a counter. + count = seen_hashes.get(digest, 0) + seen_hashes[digest] = count + 1 + label = f"b{digest}" if count == 0 else f"b{digest}_{count}" + hash_map[bid] = label + + return hash_map + + +def deterministic_turtle(graph: "RdfGraph") -> str: + """Serialize an RDF graph to Turtle with deterministic output ordering. + + Uses a three-phase hybrid pipeline for **correctness**, **diff + stability**, and **readability**: + + 1. **RDFC-1.0** [1]_ (via ``pyoxigraph``) canonicalizes the graph, + ensuring isomorphic inputs produce identical triple sets. + 2. **Weisfeiler-Lehman structural hashing** replaces the sequential + ``_:c14nN`` identifiers with content-based hashes derived from + each blank node's multi-hop neighbourhood. 
These hashes depend + only on predicate IRIs, literal values, and named-node IRIs — + not on blank-node numbering — so adding or removing a triple + only affects the identifiers of directly involved blank nodes. + 3. **Hybrid rdflib re-serialization** parses the canonicalized, + WL-hashed triples back into an rdflib ``Graph`` and serializes + with rdflib's native Turtle writer. This recovers idiomatic + Turtle features that pyoxigraph cannot emit: + + - **Inline blank nodes** (``[ … ]``) for singly-referenced + blank nodes (Turtle §2.7 [2]_), instead of verbose named + ``_:bHASH`` syntax. + - **Collection syntax** (``( … )``) for ``rdf:List`` chains + (Turtle §2.8 [2]_). + - **Prefix filtering**: only prefixes actually used in the + graph's IRIs are declared, following the practice of Apache + Jena, Eclipse RDF4J, and Raptor. + + All triples from the source graph are preserved — the hybrid step + only changes syntactic form, never semantic content. + + Parameters + ---------- + graph : rdflib.Graph + An rdflib Graph to serialize. + + Returns + ------- + str + Deterministic Turtle string with ``@prefix`` declarations. + + References + ---------- + .. [1] W3C (2024). "RDF Dataset Canonicalization (RDFC-1.0)." + W3C Recommendation. https://www.w3.org/TR/rdf-canon/ + .. [2] W3C (2014). "RDF 1.1 Turtle — Terse RDF Triple Language." + W3C Recommendation. https://www.w3.org/TR/turtle/ + """ + try: + import pyoxigraph + except ImportError as exc: + raise ImportError( + "pyoxigraph >= 0.4.0 is required for --deterministic output. 
" + "Install it with: pip install 'pyoxigraph>=0.4.0'" + ) from exc + + from rdflib import BNode, Graph, Literal, URIRef + + # ── Phase 1: RDFC-1.0 canonicalization ────────────────────────── + nt_data = graph.serialize(format="nt") + + dataset = pyoxigraph.Dataset(pyoxigraph.parse(nt_data, format=pyoxigraph.RdfFormat.N_TRIPLES)) + dataset.canonicalize(pyoxigraph.CanonicalizationAlgorithm.RDFC_1_0) + + canonical_quads = list(dataset) + + # ── Phase 2: WL structural hashing for diff-stable blank node IDs + wl_map = _wl_signatures(canonical_quads) + + def _remap(term): + if isinstance(term, pyoxigraph.BlankNode) and term.value in wl_map: + return pyoxigraph.BlankNode(wl_map[term.value]) + return term + + remapped = [pyoxigraph.Triple(_remap(q.subject), q.predicate, _remap(q.object)) for q in canonical_quads] + + # ── Phase 3: Hybrid rdflib re-serialization ───────────────────── + # Convert pyoxigraph terms to rdflib terms and populate a clean + # Graph that only carries explicitly-bound prefixes. + def _to_rdflib(term): + """Convert a pyoxigraph term to the equivalent rdflib term.""" + if isinstance(term, pyoxigraph.NamedNode): + return URIRef(term.value) + if isinstance(term, pyoxigraph.BlankNode): + return BNode(term.value) + if isinstance(term, pyoxigraph.Literal): + if term.language: + return Literal(term.value, lang=term.language) + if term.datatype: + dt_iri = term.datatype.value + # In RDF 1.1, simple literals are syntactic sugar for + # xsd:string (Turtle §2.5.1). Preserve the shorter form + # to match the original owlgen output and avoid spurious + # diffs on every string literal. 
+ if dt_iri == "http://www.w3.org/2001/XMLSchema#string": + return Literal(term.value) + return Literal(term.value, datatype=URIRef(dt_iri)) + return Literal(term.value) + raise TypeError(f"Unexpected pyoxigraph term type: {type(term).__name__}: {term}") + + result_graph = Graph(bind_namespaces="none") + for triple in remapped: + result_graph.add( + ( + _to_rdflib(triple.subject), + _to_rdflib(triple.predicate), + _to_rdflib(triple.object), + ) + ) + + # Bind only prefixes whose namespace IRI is actually referenced + # by at least one subject, predicate, or object in the graph. + # This filters out rdflib's ~27 built-in default bindings + # (brick, csvw, doap, …) that leak through Graph() even when + # the schema never declared them. + used_iris: set[str] = set() + for s, p, o in result_graph: + for term in (s, p, o): + if isinstance(term, URIRef): + used_iris.add(str(term)) + + for pfx, ns in sorted(graph.namespaces()): + pfx_s, ns_s = str(pfx), str(ns) + if pfx_s and any(iri.startswith(ns_s) for iri in used_iris): + result_graph.bind(pfx_s, ns_s) + + return result_graph.serialize(format="turtle") + + +def deterministic_json(obj: object, indent: int = 3, preserve_list_order_keys: frozenset[str] | None = None) -> str: + """Serialize a JSON-compatible object with deterministic ordering. + + Recursively sorts all dict keys *and* list elements to produce + stable output across Python versions and process invocations. + + List elements are sorted by their canonical JSON representation + (``json.dumps(item, sort_keys=True)``), which handles lists of + dicts, strings, and mixed types. + + :param obj: A JSON-serializable object (typically parsed from ``as_json``). + :param indent: Number of spaces for indentation. + :param preserve_list_order_keys: Dict keys whose list values must NOT be + sorted (e.g. ``@context``, ``@list`` in JSON-LD where array order is + semantic). Defaults to ``_JSONLD_ORDERED_KEYS``. + :returns: Deterministic JSON string. 
+ """ + import json + + skip = preserve_list_order_keys if preserve_list_order_keys is not None else _JSONLD_ORDERED_KEYS + + def _deep_sort(value: object, parent_key: str = "") -> object: + if isinstance(value, dict): + return {k: _deep_sort(v, parent_key=k) for k, v in sorted(value.items())} + if isinstance(value, list): + sorted_items = [_deep_sort(item) for item in value] + if parent_key in skip: + return sorted_items + try: + return sorted(sorted_items, key=lambda x: json.dumps(x, sort_keys=True, ensure_ascii=False)) + except TypeError: + return sorted_items + return value + + return json.dumps(_deep_sort(obj), indent=indent, ensure_ascii=False) + + +# JSON-LD keys whose array values carry ordering semantics and must not +# be sorted. @context arrays define an override cascade (JSON-LD 1.1 +# §4.1); @list containers are explicitly ordered; @graph and @set are +# included defensively. +_JSONLD_ORDERED_KEYS: frozenset[str] = frozenset({"@context", "@list", "@graph", "@set", "imports"}) + + +def well_known_prefix_map() -> dict[str, str]: + """Return a mapping from namespace URI to standard prefix name. + + Uses a frozen, version-independent map derived from rdflib 7.x curated + defaults (which align with the `prefix.cc `_ community + consensus registry). The map is **not** computed at runtime from + ``Graph().namespaces()`` because those defaults can change across rdflib + releases (they differ between 6.x and 7.x), which would silently alter + generator output. + + This allows generators to normalise non-standard prefix aliases + (e.g. ``sdo`` for ``https://schema.org/``) to their conventional names. + + Both ``http`` and ``https`` variants of schema.org are included because + the linkml-runtime historically binds ``schema: http://schema.org/`` + while rdflib (and the W3C) prefer ``https://schema.org/``. + """ + return dict(_WELL_KNOWN_PREFIX_MAP) + + +# Frozen, version-independent map: namespace URI → canonical prefix name. 
# Source: rdflib 7.x defaults, cross-checked against https://prefix.cc
_WELL_KNOWN_PREFIX_MAP: types.MappingProxyType[str, str] = types.MappingProxyType(
    {
        "https://brickschema.org/schema/Brick#": "brick",
        "http://www.w3.org/ns/csvw#": "csvw",
        "http://purl.org/dc/elements/1.1/": "dc",
        "http://purl.org/dc/dcam/": "dcam",
        "http://www.w3.org/ns/dcat#": "dcat",
        "http://purl.org/dc/dcmitype/": "dcmitype",
        "http://purl.org/dc/terms/": "dcterms",
        "http://usefulinc.com/ns/doap#": "doap",
        "http://xmlns.com/foaf/0.1/": "foaf",
        "http://www.opengis.net/ont/geosparql#": "geo",
        "http://www.w3.org/ns/odrl/2/": "odrl",
        "http://www.w3.org/ns/org#": "org",
        "http://www.w3.org/2002/07/owl#": "owl",
        "http://www.w3.org/ns/dx/prof/": "prof",
        "http://www.w3.org/ns/prov#": "prov",
        "http://purl.org/linked-data/cube#": "qb",
        "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
        "http://www.w3.org/2000/01/rdf-schema#": "rdfs",
        "https://schema.org/": "schema",
        "http://schema.org/": "schema",  # HTTP variant (linkml-runtime uses this)
        "http://www.w3.org/ns/shacl#": "sh",
        "http://www.w3.org/2004/02/skos/core#": "skos",
        "http://www.w3.org/ns/sosa/": "sosa",
        "http://www.w3.org/ns/ssn/": "ssn",
        "http://www.w3.org/2006/time#": "time",
        "http://purl.org/vocab/vann/": "vann",
        "http://rdfs.org/ns/void#": "void",
        "https://www.w3.org/2003/01/geo/wgs84_pos#": "wgs",
        "http://www.w3.org/2003/01/geo/wgs84_pos#": "wgs",  # HTTP variant (W3C canonical)
        "http://www.w3.org/XML/1998/namespace": "xml",
        "http://www.w3.org/2001/XMLSchema#": "xsd",
    }
)


def normalize_graph_prefixes(graph: "Graph", schema_prefixes: dict[str, str]) -> None:
    """Normalise non-standard prefix aliases in an rdflib Graph.

    For each prefix bound in *schema_prefixes* (mapping prefix name →
    namespace URI), check whether ``well_known_prefix_map()`` knows a
    standard name for that URI. If the standard name differs from the
    schema-declared name, rebind the namespace to the standard name.
    Afterwards the graph's remaining bindings are scanned the same way so
    that runtime-injected aliases are normalised too.

    A standard name is never taken away from a namespace that already
    holds it: neither from one the schema declared explicitly, nor from
    one bound earlier during this call.

    :param graph: rdflib Graph whose namespace bindings should be adjusted
        (modified in place).
    :param schema_prefixes: mapping of prefix name → namespace URI string,
        typically from ``schema.prefixes``.
    """
    from rdflib import Namespace

    wk = well_known_prefix_map()
    # Compare namespaces as plain strings on both sides of every check.
    declared_ns = {pfx: str(ns) for pfx, ns in schema_prefixes.items()}

    # Phase 1: normalise schema-declared prefixes.
    for old_pfx, ns_str in declared_ns.items():
        std_pfx = wk.get(ns_str)
        if not std_pfx or std_pfx == old_pfx:
            continue
        # Collision: the user explicitly declared std_pfx for a different
        # namespace — do not clobber their binding.
        if std_pfx in declared_ns and declared_ns[std_pfx] != ns_str:
            logger.warning(
                "Prefix collision: cannot rename '%s' to '%s' because '%s' is already "
                "declared for <%s>; skipping normalisation for <%s>",
                old_pfx,
                std_pfx,
                std_pfx,
                declared_ns[std_pfx],
                ns_str,
            )
            continue
        # Rebind: remove old prefix, add standard prefix.
        # ``replace=True`` forces the new prefix even if the prefix name
        # is already bound to a different namespace.
        graph.bind(std_pfx, Namespace(ns_str), override=True, replace=True)

    # Phase 2: normalise runtime-injected bindings (e.g. metamodel defaults).
    # The linkml-runtime / rdflib may inject well-known namespaces under
    # non-standard prefix names. After Phase 1 rebinds schema-declared
    # prefixes, orphaned runtime bindings can appear as ``schema1``, ``dc0``,
    # etc. Scan the graph's bindings and fix any that map to a well-known
    # namespace under a non-standard name.
    for pfx, ns in list(graph.namespaces()):
        pfx_str, ns_str = str(pfx), str(ns)
        std_pfx = wk.get(ns_str)
        if not std_pfx or std_pfx == pfx_str:
            continue
        # Same collision check as Phase 1: respect user-declared prefixes.
        if std_pfx in declared_ns and declared_ns[std_pfx] != ns_str:
            continue
        # Guard: if std_pfx is currently bound to a different URI — by
        # Phase 1 or by an earlier iteration of this loop — do not
        # overwrite it. The bindings are re-read here rather than
        # snapshotted once, so a rebind made a moment ago is honoured
        # (e.g. the HTTP variant of schema.org must not clobber HTTPS).
        live_bindings = {str(p): str(n) for p, n in graph.namespaces()}
        if live_bindings.get(std_pfx, ns_str) != ns_str:
            continue
        graph.bind(std_pfx, Namespace(ns_str), override=True, replace=True)
``sdo`` → ``schema`` for + ``https://schema.org/``.""" + include: str | Path | SchemaDefinition | None = None """If set, include extra schema outside of the imports mechanism""" @@ -986,6 +1407,26 @@ def decorator(f: Command) -> Command: callback=stacktrace_callback, ) ) + f.params.append( + Option( + ("--deterministic/--no-deterministic",), + default=False, + show_default=True, + help="Generate stable, reproducible output with sorted keys and canonical blank-node ordering. " + "Supported by OWL, SHACL, JSON-LD, and JSON-LD Context generators. " + "Useful when generated artifacts are stored in version control.", + ) + ) + f.params.append( + Option( + ("--normalize-prefixes/--no-normalize-prefixes",), + default=False, + show_default=True, + help="Normalise non-standard prefix aliases to rdflib's curated default names " + "(e.g. sdo → schema for https://schema.org/). " + "Supported by OWL, SHACL, and JSON-LD Context generators.", + ) + ) return f diff --git a/tests/linkml/test_generators/test_deterministic_benchmark.py b/tests/linkml/test_generators/test_deterministic_benchmark.py new file mode 100644 index 000000000..b7488a8dd --- /dev/null +++ b/tests/linkml/test_generators/test_deterministic_benchmark.py @@ -0,0 +1,356 @@ +"""Benchmark: deterministic Turtle serializer on real-world ontologies. + +Evaluates the ``--deterministic`` flag against schema.org (~16 000 triples, +~800 classes, ~1 400 properties) and the kitchen_sink LinkML schema to +demonstrate four properties: + +1. **Semantic equivalence** — ``rdflib.compare.isomorphic()`` confirms that + deterministic and non-deterministic outputs encode the same RDF graph. +2. **Byte-level stability** — SHA-256 identity across repeated runs proves + that deterministic output is truly reproducible. +3. **Diff quality** — controlled mutations show that small schema changes + produce small, focused diffs (high signal-to-noise ratio). +4. 
def _sha256(text: str) -> str:
    """Return the hexadecimal SHA-256 digest of *text* (encoded as UTF-8)."""
    return hashlib.sha256(text.encode()).hexdigest()


def _diff_line_count(a: str, b: str) -> int:
    """Count lines present in *b* but not in *a* (unified-diff additions).

    Both inputs are stripped and split into lines first. The count is taken
    from ``difflib.SequenceMatcher`` opcodes — the same matcher
    ``difflib.unified_diff`` is built on — so it equals the number of ``+``
    lines a unified diff would show. Counting opcodes instead of filtering
    rendered diff text means an added line whose own content begins with
    ``++`` is not mistaken for the ``+++`` file header.
    """
    al = a.strip().splitlines()
    bl = b.strip().splitlines()
    matcher = difflib.SequenceMatcher(None, al, bl)
    # "replace" and "insert" are the two opcodes that contribute lines from b.
    return sum(j2 - j1 for tag, _i1, _i2, j1, j2 in matcher.get_opcodes() if tag in ("replace", "insert"))
+ """ + try: + import urllib.request + + with urllib.request.urlopen(SCHEMA_ORG_URL, timeout=60) as resp: + data = resp.read().decode("utf-8") + except Exception as exc: + pytest.skip(f"Could not fetch schema.org: {exc}") + + g = Graph() + g.parse(data=data, format="turtle") + return g + + +@pytest.mark.network +class TestSchemaOrgDeterministicSerializer: + """Benchmark ``deterministic_turtle()`` on schema.org OWL ontology.""" + + def test_semantic_equivalence(self, schema_org_graph): + """Deterministic serialization must be isomorphic to the original graph.""" + det_ttl = deterministic_turtle(schema_org_graph) + + g_det = Graph() + g_det.parse(data=det_ttl, format="turtle") + + assert len(g_det) == len(schema_org_graph), ( + f"Triple count mismatch: original={len(schema_org_graph)}, deterministic={len(g_det)}" + ) + assert isomorphic(g_det, schema_org_graph), ( + "Deterministic output is NOT isomorphic to original schema.org graph" + ) + + def test_byte_stability(self, schema_org_graph): + """Two deterministic runs must produce byte-identical output.""" + run1 = deterministic_turtle(schema_org_graph) + run2 = deterministic_turtle(schema_org_graph) + assert _sha256(run1) == _sha256(run2), "Deterministic serializer produced different output across runs" + + def test_prefix_filtering(self, schema_org_graph): + """Only prefixes actually used in the graph should be declared.""" + det_ttl = deterministic_turtle(schema_org_graph) + + # Extract declared prefixes + declared = {} + for line in det_ttl.splitlines(): + if line.startswith("@prefix"): + parts = line.split() + pfx = parts[1].rstrip(":") + ns = parts[2].strip("<>") + declared[pfx] = ns + + # Collect all IRIs in the graph + from rdflib import URIRef + + used_iris = set() + for s, p, o in schema_org_graph: + for term in (s, p, o): + if isinstance(term, URIRef): + used_iris.add(str(term)) + + # Every declared prefix must have at least one IRI using it + for pfx, ns in declared.items(): + assert 
def _mutate_kitchen_sink(description_suffix: str = "", add_slot: bool = False) -> str:
    """Create a mutated copy of kitchen_sink.yaml **in the same directory** and return its path.

    The copy must live alongside the original so that LinkML relative imports
    (``linkml:types``, ``core``, etc.) resolve correctly.

    Uses a filename containing ``os.getpid()`` so that separate worker
    processes (e.g. under pytest-xdist) do not write to the same file.
    The caller is responsible for deleting the returned file.

    Parameters
    ----------
    description_suffix
        Text appended to the description of the first class that has one.
    add_slot
        If True, adds a synthetic ``benchmark_notes`` slot to that same class
        and defines the slot at schema level.

    Returns
    -------
    str
        Path of the written YAML file.
    """
    import os

    ks_path = Path(KITCHEN_SINK)
    # Read with the encoding used for writing below, not the platform default.
    schema = yaml.safe_load(ks_path.read_text(encoding="utf-8"))

    if description_suffix or add_slot:
        # ``or {}`` / ``or []`` tolerate keys that are present but null in YAML.
        for cls_def in (schema.get("classes") or {}).values():
            # Only the first class that carries a description is mutated.
            if not (isinstance(cls_def, dict) and cls_def.get("description")):
                continue
            if description_suffix:
                cls_def["description"] += description_suffix
            if add_slot:
                cls_def["slots"] = [*(cls_def.get("slots") or []), "benchmark_notes"]
            break

    # Define the synthetic slot if adding one
    if add_slot:
        slots_dict = schema.get("slots") or {}
        slots_dict["benchmark_notes"] = {
            "description": "Synthetic benchmark slot for diff quality testing.",
            "range": "string",
        }
        schema["slots"] = slots_dict

    # Write in the same directory so relative imports resolve.
    out_path = ks_path.parent / f"_benchmark_mutated_{os.getpid()}_kitchen_sink.yaml"
    out_path.write_text(
        yaml.dump(schema, default_flow_style=False, allow_unicode=True),
        encoding="utf-8",
    )
    return str(out_path)
+ """ + base = generator_cls(KITCHEN_SINK, deterministic=True).serialize() + mutated_path = _mutate_kitchen_sink(description_suffix=" (benchmark edit)") + try: + mutated = generator_cls(mutated_path, deterministic=True).serialize() + finally: + Path(mutated_path).unlink(missing_ok=True) + + det_diff = _diff_line_count(base, mutated) + + # Non-deterministic baseline for comparison + non_base = generator_cls(KITCHEN_SINK, deterministic=False).serialize() + non_mutated_path = _mutate_kitchen_sink(description_suffix=" (benchmark edit)") + try: + non_mutated = generator_cls(non_mutated_path, deterministic=False).serialize() + finally: + Path(non_mutated_path).unlink(missing_ok=True) + + non_diff = _diff_line_count(non_base, non_mutated) + + # The deterministic diff must be small (description + any SHACL mirrors) + assert det_diff <= 20, ( + f"Deterministic diff too large for a 1-description change: {det_diff} lines (expected ≤20)" + ) + # Signal-to-noise: deterministic must be at least 5× smaller + if non_diff > 0: + ratio = non_diff / max(det_diff, 1) + assert ratio >= 5, ( + f"Insufficient noise reduction: det={det_diff}, non-det={non_diff}, ratio={ratio:.1f}× (expected ≥5×)" + ) + + print( + f"\n {generator_cls.__name__} description mutation: " + f"det={det_diff} lines, non-det={non_diff} lines, " + f"noise reduction={non_diff / max(det_diff, 1):.0f}×" + ) + + def test_mutation_add_slot(self, generator_cls): + """Adding a new slot must produce a proportionally small diff. + + A new slot adds ~10-20 triples (label, range, domain, restrictions). + The diff should be roughly proportional to the new content, not a + full-file rewrite. 
@pytest.mark.parametrize(
    "generator_cls",
    [OwlSchemaGenerator, ShaclGenerator],
    ids=["owl", "shacl"],
)
class TestKitchenSinkEquivalence:
    """Compare deterministic and default generator output on kitchen_sink."""

    def test_triple_count_matches(self, generator_cls):
        """Deterministic and default output must parse to equally many triples."""
        det = generator_cls(KITCHEN_SINK, deterministic=True).serialize()
        nondet = generator_cls(KITCHEN_SINK, deterministic=False).serialize()

        parsed = []
        for turtle in (det, nondet):
            graph = Graph()
            graph.parse(data=turtle, format="turtle")
            parsed.append(graph)
        g_det, g_nondet = parsed

        assert len(g_det) == len(g_nondet), (
            f"Triple count mismatch: deterministic={len(g_det)}, non-deterministic={len(g_nondet)}"
        )

    def test_byte_stability_across_runs(self, generator_cls):
        """Three deterministic runs must hash to the same SHA-256 digest."""
        hashes = []
        for _ in range(3):
            output = generator_cls(KITCHEN_SINK, deterministic=True).serialize()
            hashes.append(_sha256(output))
        assert len(set(hashes)) == 1, f"Deterministic output varies across runs: {hashes}"

    def test_non_deterministic_instability(self, generator_cls):
        """Report whether three default-mode runs happen to be byte-identical.

        Advisory only: nothing is asserted, the result is just printed.
        """
        hashes = []
        for _ in range(3):
            output = generator_cls(KITCHEN_SINK, deterministic=False).serialize()
            hashes.append(_sha256(output))
        identical = len(set(hashes)) == 1
        print(
            f"\n {generator_cls.__name__} non-det stable: {identical} "
            f"(expected: False for Turtle due to bnode/ordering instability)"
        )
@pytest.mark.parametrize(
    "generator_cls,kwargs",
    [
        (OwlSchemaGenerator, {}),
        (ShaclGenerator, {}),
        (ContextGenerator, {}),
        (JSONLDGenerator, {}),
    ],
    ids=["owl", "shacl", "context", "jsonld"],
)
def test_deterministic_output_is_identical_across_runs(generator_cls, kwargs):
    """Two independent deterministic runs must serialize to the same text."""
    outputs = []
    for _ in range(2):
        outputs.append(generator_cls(SCHEMA, deterministic=True, **kwargs).serialize())

    # JSONLDGenerator embeds a generation_date timestamp — mask it so the
    # comparison is not affected by the clock ticking between the two runs.
    if generator_cls is JSONLDGenerator:
        import re

        timestamp = re.compile(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}")
        outputs = [timestamp.sub("TIMESTAMP", text) for text in outputs]

    first, second = outputs
    assert first == second, f"{generator_cls.__name__} produced different output across runs"
    assert len(first) > 100, "Output suspiciously short — generator may have failed silently"
+@pytest.mark.parametrize(
+    "generator_cls",
+    [ContextGenerator, JSONLDGenerator],
+    ids=["context", "jsonld"],
+)
+def test_deterministic_json_has_sorted_keys(generator_cls):
+    """When deterministic=True, JSON dict keys should be sorted at all levels.
+
+    For the ContextGenerator, @context keys use grouped ordering (prefixes
+    before term entries) — each group is sorted, but not globally.
+    """
+    out = generator_cls(SCHEMA, deterministic=True).serialize()
+    parsed = json.loads(out)
+
+    is_context_gen = generator_cls is ContextGenerator
+
+    def _check_sorted_keys(obj, path="root"):
+        if isinstance(obj, dict):
+            keys = list(obj.keys())
+            # Context generator groups @context keys: @-directives, prefixes, terms
+            if is_context_gen and path == "root.@context":
+                at_keys = [k for k in keys if k.startswith("@")]
+                prefix_keys = [k for k in keys if not k.startswith("@") and isinstance(obj[k], str)]
+                term_keys = [k for k in keys if not k.startswith("@") and not isinstance(obj[k], str)]
+                assert at_keys == sorted(at_keys), f"@-keys not sorted: {at_keys}"
+                assert prefix_keys == sorted(prefix_keys), f"Prefix keys not sorted: {prefix_keys}"
+                assert term_keys == sorted(term_keys), f"Term keys not sorted: {term_keys}"
+            else:
+                assert keys == sorted(keys), f"Keys not sorted at {path}: {keys}"
+            for k, v in obj.items():
+                _check_sorted_keys(v, f"{path}.{k}")
+        elif isinstance(obj, list):
+            for i, item in enumerate(obj):
+                _check_sorted_keys(item, f"{path}[{i}]")
+
+    _check_sorted_keys(parsed)
+
+
+@pytest.mark.parametrize(
+    "generator_cls",
+    [ContextGenerator, JSONLDGenerator],
+    ids=["context", "jsonld"],
+)
+def test_deterministic_json_lists_are_sorted(generator_cls):
+    """When deterministic=True, JSON list elements should be sorted.
+
+    Lists under JSON-LD structural keys (``@context``, ``@list``, ``imports``,
+    etc.) are exempt because their ordering carries semantic meaning.
+    """
+    out = generator_cls(SCHEMA, deterministic=True).serialize()
+    parsed = json.loads(out)
+
+    # JSON-LD keys whose array values carry ordering semantics.
+    _ORDERED_KEYS = {"@context", "@list", "@graph", "@set", "imports"}
+
+    def _check_sorted_lists(obj, path="root", parent_key=""):
+        if isinstance(obj, dict):
+            for k, v in obj.items():
+                _check_sorted_lists(v, f"{path}.{k}", parent_key=k)
+        elif isinstance(obj, list):
+            if parent_key not in _ORDERED_KEYS:
+                str_items = [json.dumps(item, sort_keys=True, ensure_ascii=False) for item in obj]
+                assert str_items == sorted(str_items), f"List not sorted at {path}"
+            for i, item in enumerate(obj):
+                _check_sorted_lists(item, f"{path}[{i}]")
+
+    _check_sorted_lists(parsed)
+
+
+@pytest.mark.parametrize(
+    "generator_cls",
+    [OwlSchemaGenerator, ShaclGenerator],
+    ids=["owl", "shacl"],
+)
+def test_deterministic_turtle_preserves_at_prefix(generator_cls):
+    """deterministic_turtle must produce standard @prefix, not SPARQL PREFIX."""
+    out = generator_cls(SCHEMA, deterministic=True).serialize()
+    assert "@prefix" in out, "Output uses non-standard prefix syntax"
+    assert "PREFIX " not in out, "Output uses SPARQL PREFIX instead of Turtle @prefix"
+
+
+def test_deterministic_turtle_performance():
+    """Deterministic OWL generation must complete within 10 seconds for personinfo.
+
+    The Weisfeiler-Lehman approach is O(n log n), so this should easily pass.
+    The previous canon=True approach was exponential and failed this test
+    for graphs above ~250 triples.
+    """
+    start = time.perf_counter()  # monotonic clock: immune to wall-clock adjustments
+    out = OwlSchemaGenerator(SCHEMA, deterministic=True).serialize()
+    elapsed = time.perf_counter() - start
+    assert elapsed < 10.0, f"Deterministic generation took {elapsed:.1f}s (limit: 10s)"
+    assert len(out) > 100, "Output suspiciously short"
+
+
+def test_shacl_closed_ignored_properties_deterministic():
+    """sh:ignoredProperties in closed shapes must be deterministic.
+
+    ``_build_ignored_properties`` collects inherited slots into a set; without
+    explicit sorting this produces different ``rdf:first``/``rdf:rest`` chains
+    on each run. With ``deterministic=True`` (and sorted Collection inputs)
+    the output must be byte-identical.
+    """
+    runs = [ShaclGenerator(SCHEMA, deterministic=True, closed=True).serialize() for _ in range(3)]
+    assert runs[0] == runs[1] == runs[2], "sh:ignoredProperties ordering differs across runs"
+    assert "sh:ignoredProperties" in runs[0], "Expected closed shapes with sh:ignoredProperties"
+
+
+def test_shacl_enum_in_deterministic():
+    """sh:in RDF lists for enums must be deterministic.
+
+    ``_build_enum_constraint`` iterates ``enum.permissible_values.items()``
+    (dict iteration order) into a ``Collection``. Without sorting, the
+    ``rdf:first``/``rdf:rest`` chain varies across runs.
+    """
+    runs = [ShaclGenerator(SCHEMA, deterministic=True).serialize() for _ in range(3)]
+    assert runs[0] == runs[1] == runs[2], "sh:in enum list ordering differs across runs"
+    assert "sh:in" in runs[0], "Expected sh:in constraints for enums"
+
+
+def test_owl_enum_one_of_deterministic():
+    """owl:oneOf RDF lists for enums must be deterministic.
+
+    ``_boolean_expression`` feeds ``pv_uris`` (from ``permissible_values``)
+    into a ``Collection``. Without sorting, ``owl:oneOf`` list ordering varies.
+    """
+    runs = [OwlSchemaGenerator(SCHEMA, deterministic=True).serialize() for _ in range(3)]
+    assert runs[0] == runs[1] == runs[2], "owl:oneOf enum list ordering differs across runs"
+
+
+KITCHEN_SINK = str(Path(__file__).parent / "input" / "kitchen_sink.yaml")
+
+
+def test_deterministic_large_schema():
+    """End-to-end idempotency on a complex schema (kitchen_sink).
+
+    Exercises many code paths simultaneously: closed shapes, enums, imports,
+    class hierarchies, and mixed ranges.
+    """
+    owl1 = OwlSchemaGenerator(KITCHEN_SINK, deterministic=True).serialize()
+    owl2 = OwlSchemaGenerator(KITCHEN_SINK, deterministic=True).serialize()
+    assert owl1 == owl2, "OWL output differs across runs for kitchen_sink"
+    assert len(owl1) > 500, "kitchen_sink output suspiciously short"
+
+    shacl1 = ShaclGenerator(KITCHEN_SINK, deterministic=True).serialize()
+    shacl2 = ShaclGenerator(KITCHEN_SINK, deterministic=True).serialize()
+    assert shacl1 == shacl2, "SHACL output differs across runs for kitchen_sink"
+    assert len(shacl1) > 500, "kitchen_sink output suspiciously short"
+
+
+def test_deterministic_context_preserves_jsonld_structure():
+    """Deterministic JSON-LD context must preserve conventional structure.
+
+    JSON-LD contexts have a conventional layout:
+    1. ``comments`` block first (metadata)
+    2. ``@context`` block second, with prefixes grouped before term entries
+
+    ``deterministic_json()`` would scramble this by sorting all keys
+    uniformly. The context generator must use JSON-LD-aware ordering.
+    """
+    out = ContextGenerator(SCHEMA, deterministic=True, metadata=True).serialize()
+    parsed = json.loads(out)
+
+    # Top-level key order: "comments" before "@context"
+    top_keys = list(parsed.keys())
+    assert "comments" in top_keys, "Expected 'comments' block with metadata=True"
+    assert top_keys.index("comments") < top_keys.index("@context"), (
+        f"'comments' should precede '@context', got: {top_keys}"
+    )
+
+    # Inside @context: @-directives, then prefixes (str values), then terms (dict values)
+    ctx = parsed["@context"]
+    ctx_keys = list(ctx.keys())
+
+    at_keys = [k for k in ctx_keys if k.startswith("@")]
+    prefix_keys = [k for k in ctx_keys if not k.startswith("@") and isinstance(ctx[k], str)]
+    term_keys = [k for k in ctx_keys if not k.startswith("@") and not isinstance(ctx[k], str)]
+
+    # Verify grouping: all @-keys before all prefix keys before all term keys
+    last_at = max(ctx_keys.index(k) for k in at_keys) if at_keys else -1
+    first_prefix = min(ctx_keys.index(k) for k in prefix_keys) if prefix_keys else len(ctx_keys)
+    last_prefix = max(ctx_keys.index(k) for k in prefix_keys) if prefix_keys else -1
+    first_term = min(ctx_keys.index(k) for k in term_keys) if term_keys else len(ctx_keys)
+
+    assert last_at < first_prefix, "@-directives must come before prefixes"
+    assert last_prefix < first_term, "Prefixes must come before term entries"
+
+    # Verify each group is sorted internally
+    assert at_keys == sorted(at_keys), f"@-directives not sorted: {at_keys}"
+    assert prefix_keys == sorted(prefix_keys), f"Prefixes not sorted: {prefix_keys}"
+    assert term_keys == sorted(term_keys), f"Term entries not sorted: {term_keys}"
+
+
+def test_non_deterministic_is_default():
+    """Verify that ``deterministic`` defaults to False."""
+    gen = OwlSchemaGenerator(SCHEMA)
+    assert gen.deterministic is False
+
+
+def test_wl_handles_structurally_similar_bnodes():
+    """Blank nodes with identical local structure but different named neighbours
+    must receive different WL signatures and thus different stable labels.
+
+    This tests the core WL property: two BNodes that differ only in their
+    connected named nodes (URIs/literals) must be distinguishable.
+    """
+    from rdflib import BNode, Graph, Namespace, URIRef
+
+    from linkml.utils.generator import deterministic_turtle
+
+    RDF_TYPE = URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")
+    OWL_RESTRICTION = URIRef("http://www.w3.org/2002/07/owl#Restriction")
+    OWL_ON_PROP = URIRef("http://www.w3.org/2002/07/owl#onProperty")
+    OWL_ALL_VALUES = URIRef("http://www.w3.org/2002/07/owl#allValuesFrom")
+
+    EX = Namespace("http://example.org/")
+    g = Graph()
+
+    # Two restrictions with same structure but different property URIs
+    r1 = BNode()
+    g.add((r1, RDF_TYPE, OWL_RESTRICTION))
+    g.add((r1, OWL_ON_PROP, EX.alpha))
+    g.add((r1, OWL_ALL_VALUES, EX.Target1))
+
+    r2 = BNode()
+    g.add((r2, RDF_TYPE, OWL_RESTRICTION))
+    g.add((r2, OWL_ON_PROP, EX.beta))
+    g.add((r2, OWL_ALL_VALUES, EX.Target2))
+
+    RDFS_SUBCLASS = URIRef("http://www.w3.org/2000/01/rdf-schema#subClassOf")
+    g.add((EX.MyClass, RDFS_SUBCLASS, r1))
+    g.add((EX.MyClass, RDFS_SUBCLASS, r2))
+
+    # Must be deterministic across runs
+    out1 = deterministic_turtle(g)
+    out2 = deterministic_turtle(g)
+    assert out1 == out2, "WL-based serializer is not deterministic for similar BNodes"
+
+    # Both restrictions must survive as distinct nodes (substring checks alone would miss a merge)
+    restrictions = set(Graph().parse(data=out1, format="turtle").subjects(RDF_TYPE, OWL_RESTRICTION))
+    assert len(restrictions) == 2 and "alpha" in out1 and "beta" in out1, f"Restrictions collapsed: {restrictions}"
+
+
+def test_deterministic_turtle_no_bnodes():
+    """Graphs with no blank nodes should still produce sorted, deterministic output."""
+    from rdflib import Graph, Literal, Namespace
+    from rdflib.namespace import RDFS
+
+    from linkml.utils.generator import deterministic_turtle
+
+    EX = Namespace("http://example.org/")
+    g = Graph()
+    g.add((EX.B, RDFS.label, Literal("B")))
+    g.add((EX.A, RDFS.label, Literal("A")))
+
+    out1 = deterministic_turtle(g)
+    out2 = deterministic_turtle(g)
+    assert out1 == out2
+
+    # A should appear before B (sorted)
+    a_pos = out1.find("example.org/A")  # find() yields -1 when absent; guarded below
+    b_pos = out1.find("example.org/B")
+    assert 0 <= a_pos < b_pos, "Triples should be sorted: A before B"
+
+
+@pytest.mark.xfail(
+    reason=(
+        "Collection sorting (owl:oneOf, sh:in) in deterministic mode intentionally "
+        "reorders RDF list triples for canonical output. The resulting graph is "
+        "semantically equivalent (OWL/SHACL interpret these as unordered sets) but "
+        "not RDF-isomorphic because rdf:first/rdf:rest chains encode ordering."
+    ),
+    strict=True,
+)
+@pytest.mark.parametrize(
+    "generator_cls",
+    [OwlSchemaGenerator, ShaclGenerator],
+    ids=["owl", "shacl"],
+)
+def test_deterministic_turtle_is_isomorphic(generator_cls):
+    """Deterministic output is NOT RDF-isomorphic to non-deterministic output.
+
+    This documents the trade-off identified in linkml/linkml#3295 review:
+    deterministic mode sorts Collection inputs (owl:oneOf, sh:in,
+    sh:ignoredProperties) to produce canonical RDF list ordering. Since RDF
+    Collections encode order via rdf:first/rdf:rest triples, the sorted graph
+    is structurally different from the insertion-order graph — even though the
+    OWL/SHACL semantics are identical (these Collections represent sets).
+
+    The test is marked xfail(strict=True) so that it:
+    - Documents the known, intentional non-isomorphism
+    - Alerts maintainers if the behaviour changes (strict xfail fails on pass)
+    """
+    out_det = generator_cls(SCHEMA, deterministic=True).serialize()
+    out_nondet = generator_cls(SCHEMA, deterministic=False).serialize()
+
+    g_det = Graph()
+    g_det.parse(data=out_det, format="turtle")
+
+    g_nondet = Graph()
+    g_nondet.parse(data=out_nondet, format="turtle")
+
+    assert len(g_det) == len(g_nondet), (
+        f"Triple count mismatch: deterministic={len(g_det)}, non-deterministic={len(g_nondet)}"
+    )
+    assert isomorphic(g_det, g_nondet), (
+        f"{generator_cls.__name__}: deterministic output is NOT isomorphic "
+        "to non-deterministic output — the serialization changed the graph"
+    )
+
+
+@pytest.mark.parametrize(
+    "generator_cls",
+    [OwlSchemaGenerator, ShaclGenerator],
+    ids=["owl", "shacl"],
+)
+def test_non_deterministic_output_unchanged(generator_cls):
+    """Non-deterministic output must still produce valid RDF.
+
+    Ensures that changes for deterministic mode don't break default behavior.
+    """
+    out = generator_cls(SCHEMA, deterministic=False).serialize()
+    assert len(out) > 100, "Output suspiciously short"
+    g = Graph()
+    g.parse(data=out, format="turtle")
+    assert len(g) > 50, f"Graph has too few triples ({len(g)})"
+
+
+@pytest.mark.parametrize(
+    "generator_cls,kwargs",
+    [
+        (OwlSchemaGenerator, {}),
+        (ShaclGenerator, {}),
+        (ContextGenerator, {}),
+        (JSONLDGenerator, {}),
+    ],
+    ids=["owl", "shacl", "context", "jsonld"],
+)
+def test_non_deterministic_produces_valid_output(generator_cls, kwargs):
+    """All generators must produce valid output in non-deterministic mode."""
+    out = generator_cls(SCHEMA, deterministic=False, **kwargs).serialize()
+    assert len(out) > 100, f"{generator_cls.__name__} output suspiciously short"
+
+
+@pytest.mark.xfail(
+    reason=(
+        "Collection sorting in deterministic mode produces non-isomorphic RDF "
+        "(different rdf:first/rdf:rest triples). See test_deterministic_turtle_is_isomorphic."
+    ),
+    strict=True,
+)
+@pytest.mark.parametrize(
+    "generator_cls",
+    [OwlSchemaGenerator, ShaclGenerator],
+    ids=["owl", "shacl"],
+)
+def test_deterministic_kitchen_sink_isomorphic(generator_cls):
+    """Isomorphism check on the complex kitchen_sink schema.
+
+    Expected to fail for the same reason as test_deterministic_turtle_is_isomorphic:
+    Collection sorting changes the RDF structure while preserving OWL/SHACL semantics.
+    """
+    out_det = generator_cls(KITCHEN_SINK, deterministic=True).serialize()
+    out_nondet = generator_cls(KITCHEN_SINK, deterministic=False).serialize()
+
+    g_det = Graph()
+    g_det.parse(data=out_det, format="turtle")
+
+    g_nondet = Graph()
+    g_nondet.parse(data=out_nondet, format="turtle")
+
+    assert isomorphic(g_det, g_nondet), (
+        f"{generator_cls.__name__}: kitchen_sink deterministic output is NOT isomorphic to non-deterministic output"
+    )
+
+
+# NOTE: pyoxigraph-free, but still gated by the module-level ``pytestmark`` (a skipif(False) cannot lift it).
+def test_expression_sort_key_is_stable():
+    """``_expression_sort_key`` must produce stable, content-based keys.
+
+    LinkML anonymous expressions inherit ``YAMLRoot.__repr__()``, which
+    formats objects using **field values** (not memory addresses).
+    The ``_expression_sort_key`` helper relies on this for deterministic
+    ordering of ``any_of`` / ``all_of`` / ``none_of`` members.
+
+    This test verifies that:
+    1. Two distinct objects with identical fields produce the same key.
+    2. Objects with different fields produce different keys.
+    3. Sorting is stable across repeated calls.
+    """
+    from linkml.generators.owlgen import _expression_sort_key
+    from linkml_runtime.linkml_model.meta import AnonymousClassExpression, AnonymousSlotExpression
+
+    # Two distinct objects with identical content → same key
+    a1 = AnonymousClassExpression(is_a="Parent")
+    a2 = AnonymousClassExpression(is_a="Parent")
+    assert a1 is not a2
+    assert _expression_sort_key(a1) == _expression_sort_key(a2)
+
+    # Different content → different keys
+    b = AnonymousClassExpression(is_a="Child")
+    assert _expression_sort_key(a1) != _expression_sort_key(b)
+
+    # Sorting stability: same order every time
+    items = [b, a1, a2]
+    for _ in range(5):
+        result = sorted(items, key=_expression_sort_key)
+        # "Child" < "Parent" alphabetically, so b comes first
+        assert _expression_sort_key(result[0]) == _expression_sort_key(b)
+        assert _expression_sort_key(result[1]) == _expression_sort_key(result[2])  # a1, a2 together
+
+    # Slot expressions work too
+    s1 = AnonymousSlotExpression(range="string")
+    s2 = AnonymousSlotExpression(range="integer")
+    assert _expression_sort_key(s1) != _expression_sort_key(s2)
+    order1 = sorted([s2, s1], key=_expression_sort_key)
+    order2 = sorted([s1, s2], key=_expression_sort_key)
+    assert [_expression_sort_key(x) for x in order1] == [_expression_sort_key(x) for x in order2]
diff --git a/tests/linkml/test_generators/test_jsonldcontextgen.py b/tests/linkml/test_generators/test_jsonldcontextgen.py
index 6de23347a..7d5378879 100644
--- a/tests/linkml/test_generators/test_jsonldcontextgen.py
+++ b/tests/linkml/test_generators/test_jsonldcontextgen.py
@@ -1,4 +1,5 @@
 import json
+import textwrap
 
 import pytest
 from click.testing import CliRunner
@@ -571,3 +572,780 @@ def test_exclude_imports(input_path):
     # Imported class and slot must NOT be present
     assert "BaseClass" not in ctx, "Imported class 'BaseClass' must not appear in exclude-imports context"
     assert "baseProperty" not in ctx, "Imported slot 'baseProperty' must not appear in exclude-imports context"
+
+
+@pytest.mark.parametrize("mergeimports", [True, False], ids=["merge", "no-merge"]) +def test_exclude_external_imports(tmp_path, mergeimports): + """With --exclude-external-imports, elements from URL-based external + vocabulary imports must not appear in the generated JSON-LD context, + while local file imports and linkml standard imports are kept. + + When a schema imports terms from an external vocabulary (e.g. W3C VC + v2), those terms already have context definitions in their own JSON-LD + context file. Re-defining them in the local context can conflict with + @protected term definitions from the external context (JSON-LD 1.1 + section 4.1.11). + """ + ext_dir = tmp_path / "ext" + ext_dir.mkdir() + (ext_dir / "external_vocab.yaml").write_text( + textwrap.dedent("""\ + id: https://example.org/external-vocab + name: external_vocab + default_prefix: ext + prefixes: + linkml: https://w3id.org/linkml/ + ext: https://example.org/external-vocab/ + imports: + - linkml:types + slots: + issuer: + slot_uri: ext:issuer + range: string + validFrom: + slot_uri: ext:validFrom + range: date + classes: + ExternalCredential: + class_uri: ext:ExternalCredential + slots: + - issuer + - validFrom + """), + encoding="utf-8", + ) + + (tmp_path / "main.yaml").write_text( + textwrap.dedent("""\ + id: https://example.org/main + name: main + default_prefix: main + prefixes: + linkml: https://w3id.org/linkml/ + main: https://example.org/main/ + ext: https://example.org/external-vocab/ + imports: + - linkml:types + - https://example.org/external-vocab + slots: + localName: + slot_uri: main:localName + range: string + classes: + LocalThing: + class_uri: main:LocalThing + slots: + - localName + """), + encoding="utf-8", + ) + + importmap = {"https://example.org/external-vocab": str(ext_dir / "external_vocab")} + + context_text = ContextGenerator( + str(tmp_path / "main.yaml"), + exclude_external_imports=True, + mergeimports=mergeimports, + importmap=importmap, + base_dir=str(tmp_path), + 
).serialize() + context = json.loads(context_text) + ctx = context["@context"] + + # Local terms must be present + assert "localName" in ctx or "local_name" in ctx, ( + f"Local slot missing with mergeimports={mergeimports}, got: {list(ctx.keys())}" + ) + assert "LocalThing" in ctx, f"Local class missing with mergeimports={mergeimports}, got: {list(ctx.keys())}" + + # External vocabulary terms must NOT be present + assert "issuer" not in ctx, f"External slot 'issuer' present with mergeimports={mergeimports}" + assert "validFrom" not in ctx and "valid_from" not in ctx, ( + f"External slot 'validFrom' present with mergeimports={mergeimports}" + ) + assert "ExternalCredential" not in ctx, ( + f"External class 'ExternalCredential' present with mergeimports={mergeimports}" + ) + + +def test_exclude_external_imports_preserves_linkml_types(tmp_path): + """linkml:types (standard library import) must NOT be treated as external. + + The ``linkml:types`` import resolves to a URL internally + (``https://w3id.org/linkml/types``), but it is a standard LinkML import, + not a user-declared external vocabulary. The ``_collect_external_elements`` + method filters by ``schema_key.startswith("http")`` — this test verifies + that linkml built-in types (string, integer, date, etc.) survive the filter. 
+ """ + (tmp_path / "schema.yaml").write_text( + textwrap.dedent("""\ + id: https://example.org/test + name: test_linkml_types + default_prefix: ex + prefixes: + linkml: https://w3id.org/linkml/ + ex: https://example.org/ + imports: + - linkml:types + slots: + name: + slot_uri: ex:name + range: string + age: + slot_uri: ex:age + range: integer + classes: + Person: + class_uri: ex:Person + slots: + - name + - age + """), + encoding="utf-8", + ) + + context_text = ContextGenerator( + str(tmp_path / "schema.yaml"), + exclude_external_imports=True, + ).serialize() + ctx = json.loads(context_text)["@context"] + + # Local classes and slots must be present + assert "Person" in ctx, f"Local class 'Person' missing, got: {list(ctx.keys())}" + assert "name" in ctx, f"Local slot 'name' missing, got: {list(ctx.keys())}" + assert "age" in ctx, f"Local slot 'age' missing, got: {list(ctx.keys())}" + + +def test_exclude_external_imports_preserves_local_file_imports(tmp_path): + """Local file imports (non-URL) must be preserved when exclude_external_imports is set. + + Only URL-based imports (http:// or https://) are considered external. + File-path imports between local schemas must remain in the context. 
+ """ + local_dir = tmp_path / "local" + local_dir.mkdir() + (local_dir / "base.yaml").write_text( + textwrap.dedent("""\ + id: https://example.org/base + name: base + default_prefix: base + prefixes: + linkml: https://w3id.org/linkml/ + base: https://example.org/base/ + imports: + - linkml:types + slots: + baseField: + slot_uri: base:baseField + range: string + classes: + BaseRecord: + class_uri: base:BaseRecord + slots: + - baseField + """), + encoding="utf-8", + ) + + (tmp_path / "main.yaml").write_text( + textwrap.dedent("""\ + id: https://example.org/main + name: main + default_prefix: main + prefixes: + linkml: https://w3id.org/linkml/ + main: https://example.org/main/ + base: https://example.org/base/ + imports: + - linkml:types + - local/base + slots: + localField: + slot_uri: main:localField + range: string + classes: + MainRecord: + class_uri: main:MainRecord + slots: + - localField + """), + encoding="utf-8", + ) + + context_text = ContextGenerator( + str(tmp_path / "main.yaml"), + exclude_external_imports=True, + mergeimports=True, + base_dir=str(tmp_path), + ).serialize() + ctx = json.loads(context_text)["@context"] + + # Local file import terms must be present + assert "MainRecord" in ctx, f"Local class 'MainRecord' missing, got: {list(ctx.keys())}" + assert "BaseRecord" in ctx, f"Local-file-imported class 'BaseRecord' missing, got: {list(ctx.keys())}" + assert "baseField" in ctx or "base_field" in ctx, ( + f"Local-file-imported slot 'baseField' missing, got: {list(ctx.keys())}" + ) + + +def test_exclude_external_imports_works_with_mergeimports_false(tmp_path): + """exclude_external_imports is effective even when mergeimports=False. + + Although mergeimports=False prevents most imported elements from appearing, + external vocabulary elements can still leak into the context via the + schema_map. The exclude_external_imports flag catches these. 
+ """ + ext_dir = tmp_path / "ext" + ext_dir.mkdir() + (ext_dir / "external_vocab.yaml").write_text( + textwrap.dedent("""\ + id: https://example.org/external-vocab + name: external_vocab + default_prefix: ext + prefixes: + linkml: https://w3id.org/linkml/ + ext: https://example.org/external-vocab/ + imports: + - linkml:types + slots: + issuer: + slot_uri: ext:issuer + range: string + classes: + ExternalCredential: + class_uri: ext:ExternalCredential + slots: + - issuer + """), + encoding="utf-8", + ) + + (tmp_path / "main.yaml").write_text( + textwrap.dedent("""\ + id: https://example.org/main + name: main + default_prefix: main + prefixes: + linkml: https://w3id.org/linkml/ + main: https://example.org/main/ + ext: https://example.org/external-vocab/ + imports: + - linkml:types + - https://example.org/external-vocab + slots: + localName: + slot_uri: main:localName + range: string + classes: + LocalThing: + class_uri: main:LocalThing + slots: + - localName + """), + encoding="utf-8", + ) + + importmap = {"https://example.org/external-vocab": str(ext_dir / "external_vocab")} + + ctx_text = ContextGenerator( + str(tmp_path / "main.yaml"), + exclude_external_imports=True, + mergeimports=False, + importmap=importmap, + base_dir=str(tmp_path), + ).serialize() + ctx = json.loads(ctx_text)["@context"] + + # Local terms must still be present + assert "LocalThing" in ctx, f"Local class missing, got: {list(ctx.keys())}" + + # External vocabulary terms must be excluded + assert "issuer" not in ctx, "External slot 'issuer' should be excluded with mergeimports=False" + assert "ExternalCredential" not in ctx, "External class should be excluded with mergeimports=False" + + +def test_xsd_anyuri_as_iri_flag(): + """Test that --xsd-anyuri-as-iri maps uri ranges to @type: @id. + + By default, ``range: uri`` (type_uri ``xsd:anyURI``) produces + ``@type: xsd:anyURI`` (typed literal). 
With ``xsd_anyuri_as_iri=True``, + it produces ``@type: @id`` (IRI node reference), aligning the JSON-LD + context with the SHACL generator which already emits ``sh:nodeKind sh:IRI`` + for the same type. + + See: + - W3C SHACL §4.8.1 sh:nodeKind (https://www.w3.org/TR/shacl/#NodeKindConstraintComponent) + - JSON-LD 1.1 §4.2.2 Type Coercion (https://www.w3.org/TR/json-ld11/#type-coercion) + - RDF 1.1 §3.3 Literals vs §3.2 IRIs (https://www.w3.org/TR/rdf11-concepts/) + """ + schema_yaml = """ +id: https://example.org/test-uri-context +name: test_uri_context + +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + +imports: + - linkml:types + +default_prefix: ex +default_range: string + +slots: + homepage: + range: uri + slot_uri: ex:homepage + node_ref: + range: nodeidentifier + slot_uri: ex:nodeRef + name: + range: string + slot_uri: ex:name + +classes: + Thing: + slots: + - homepage + - node_ref + - name +""" + # Default behaviour: uri → xsd:anyURI (backward compatible) + ctx_default = json.loads(ContextGenerator(schema_yaml).serialize())["@context"] + assert ctx_default["homepage"]["@type"] == "xsd:anyURI" + + # Opt-in: uri → @id (aligned with SHACL sh:nodeKind sh:IRI) + ctx_iri = json.loads(ContextGenerator(schema_yaml, xsd_anyuri_as_iri=True).serialize())["@context"] + assert ctx_iri["homepage"]["@type"] == "@id", ( + f"Expected @type: @id for uri range with xsd_anyuri_as_iri=True, got {ctx_iri['homepage'].get('@type')}" + ) + + # nodeidentifier is unaffected by the flag (not xsd:anyURI-typed) + # Its default @type depends on URI_RANGES matching shex:nonLiteral; + # we only verify the flag doesn't change its behaviour. 
+ assert ctx_default["node_ref"]["@type"] == ctx_iri["node_ref"]["@type"] + + # string → no @type regardless of flag + assert "@type" not in ctx_default.get("name", {}) + assert "@type" not in ctx_iri.get("name", {}) + + +def test_xsd_anyuri_as_iri_with_any_of(): + """The --xsd-anyuri-as-iri flag must also apply to ``any_of`` slots + whose type branches include ``uri`` mixed with class ranges. + + ``_literal_coercion_for_ranges`` resolves mixed any_of type branches + and must use the extended URI_RANGES when the flag is active. + """ + schema_yaml = """ +id: https://example.org/test-anyof-uri +name: test_anyof_uri + +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + +imports: + - linkml:types + +default_prefix: ex +default_range: string + +classes: + Container: + slots: + - mixed_slot + Target: + class_uri: ex:Target + +slots: + mixed_slot: + slot_uri: ex:mixed + any_of: + - range: Target + - range: uri +""" + # Default: mixed class+uri any_of — uri resolves to xsd:anyURI literal, + # which disagrees with @id from the class branch → no coercion emitted + ctx_default = json.loads(ContextGenerator(schema_yaml).serialize())["@context"] + default_type = ctx_default.get("mixed_slot", {}).get("@type") + assert default_type != "@id", f"Without flag, mixed any_of should not resolve to @id, got {default_type}" + + # With flag: uri branch now also resolves to @id, matching the class branch + # → all branches agree → @id is emitted + ctx_iri = json.loads(ContextGenerator(schema_yaml, xsd_anyuri_as_iri=True).serialize())["@context"] + assert ctx_iri["mixed_slot"]["@type"] == "@id", ( + f"Expected @id for mixed any_of with flag, got {ctx_iri.get('mixed_slot', {}).get('@type')}" + ) + + +def test_xsd_anyuri_as_iri_owl(): + """OWL generator must produce owl:ObjectProperty for uri ranges when flag is set. + + Without the flag, ``range: uri`` produces ``owl:DatatypeProperty`` with + ``rdfs:range xsd:anyURI``. 
With ``xsd_anyuri_as_iri=True``, it should + produce ``owl:ObjectProperty`` (no rdfs:range restriction), aligning + with the SHACL generator's ``sh:nodeKind sh:IRI``. + """ + from rdflib import OWL, RDF, URIRef + + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_yaml = """ +id: https://example.org/test-owl-uri +name: test_owl_uri +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ +imports: + - linkml:types +default_prefix: ex +default_range: string +slots: + homepage: + range: uri + slot_uri: ex:homepage + name: + range: string + slot_uri: ex:name +classes: + Thing: + slots: + - homepage + - name +""" + # Default: uri → DatatypeProperty (must disable type_objects which + # unconditionally returns ObjectProperty for all type-ranged slots) + gen_default = OwlSchemaGenerator(schema_yaml, type_objects=False) + g_default = gen_default.as_graph() + homepage_uri = URIRef("https://example.org/homepage") + default_rdf_type = set(g_default.objects(homepage_uri, RDF.type)) + assert OWL.DatatypeProperty in default_rdf_type, ( + f"Without flag, homepage should be DatatypeProperty, got {default_rdf_type}" + ) + + # With flag: uri → ObjectProperty + gen_iri = OwlSchemaGenerator(schema_yaml, xsd_anyuri_as_iri=True, type_objects=False) + g_iri = gen_iri.as_graph() + iri_rdf_type = set(g_iri.objects(homepage_uri, RDF.type)) + assert OWL.ObjectProperty in iri_rdf_type, f"With flag, homepage should be ObjectProperty, got {iri_rdf_type}" + assert OWL.DatatypeProperty not in iri_rdf_type, ( + f"With flag, homepage should NOT be DatatypeProperty, got {iri_rdf_type}" + ) + + # String slot must remain DatatypeProperty regardless of flag + name_uri = URIRef("https://example.org/name") + name_rdf_type = set(g_iri.objects(name_uri, RDF.type)) + assert OWL.DatatypeProperty in name_rdf_type, f"String slot should remain DatatypeProperty, got {name_rdf_type}" + + +def test_xsd_anyuri_as_iri_uriorcurie_range(): + """``uriorcurie`` also maps to 
``xsd:anyURI`` and must behave identically + to ``uri`` when the ``--xsd-anyuri-as-iri`` flag is active. + + This is a high-priority coverage gap: ``uriorcurie`` is distinct from + ``uri`` at the LinkML level but shares the same XSD type. + """ + schema_yaml = """ +id: https://example.org/test-uriorcurie +name: test_uriorcurie + +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + +imports: + - linkml:types + +default_prefix: ex +default_range: string + +slots: + reference: + range: uriorcurie + slot_uri: ex:reference + homepage: + range: uri + slot_uri: ex:homepage + +classes: + Thing: + slots: + - reference + - homepage +""" + ctx_default = json.loads(ContextGenerator(schema_yaml).serialize())["@context"] + assert ctx_default["reference"]["@type"] == "xsd:anyURI" + assert ctx_default["homepage"]["@type"] == "xsd:anyURI" + + ctx_iri = json.loads(ContextGenerator(schema_yaml, xsd_anyuri_as_iri=True).serialize())["@context"] + assert ctx_iri["reference"]["@type"] == "@id", "uriorcurie should map to @id with xsd_anyuri_as_iri=True" + assert ctx_iri["homepage"]["@type"] == "@id", "uri should map to @id with xsd_anyuri_as_iri=True" + + +def test_xsd_anyuri_as_iri_curie_range_unchanged(): + """``curie`` maps to ``xsd:string`` (not ``xsd:anyURI``), so the flag + must NOT affect its coercion. + + This documents the cross-type boundary: ``uri`` and ``uriorcurie`` + share ``xsd:anyURI``, but ``curie`` uses ``xsd:string``. 
+ """ + schema_yaml = """ +id: https://example.org/test-curie +name: test_curie + +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + +imports: + - linkml:types + +default_prefix: ex +default_range: string + +slots: + curie_slot: + range: curie + slot_uri: ex:curieSlot + uri_slot: + range: uri + slot_uri: ex:uriSlot + +classes: + Thing: + slots: + - curie_slot + - uri_slot +""" + ctx_default = json.loads(ContextGenerator(schema_yaml).serialize())["@context"] + ctx_iri = json.loads(ContextGenerator(schema_yaml, xsd_anyuri_as_iri=True).serialize())["@context"] + + # curie (xsd:string) must be unaffected by the flag + curie_default = ctx_default.get("curie_slot", {}).get("@type") + curie_iri = ctx_iri.get("curie_slot", {}).get("@type") + assert curie_default == curie_iri, f"curie coercion should not change with flag: {curie_default} vs {curie_iri}" + + # uri (xsd:anyURI) must change — sanity check + assert ctx_iri["uri_slot"]["@type"] == "@id" + + +def test_xsd_anyuri_as_iri_owl_curie_unchanged(): + """OWL generator must keep ``range: curie`` as DatatypeProperty even with flag. + + ``curie`` maps to ``xsd:string`` (not ``xsd:anyURI``), so the + ``--xsd-anyuri-as-iri`` flag must not promote it to ObjectProperty. + This verifies cross-generator consistency: the JSON-LD context generator + already correctly excludes ``curie`` via ``URI_RANGES_WITH_XSD``; the + OWL generator must match via ``is_xsd_anyuri_range()``. 
+ """ + from rdflib import OWL, RDF, URIRef + + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_yaml = """ +id: https://example.org/test-owl-curie +name: test_owl_curie +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ +imports: + - linkml:types +default_prefix: ex +default_range: string +slots: + compact_id: + range: curie + slot_uri: ex:compactId + homepage: + range: uri + slot_uri: ex:homepage +classes: + Thing: + slots: + - compact_id + - homepage +""" + compact_id_uri = URIRef("https://example.org/compact_id") + homepage_uri = URIRef("https://example.org/homepage") + + # With flag: curie must stay DatatypeProperty, uri must become ObjectProperty + gen = OwlSchemaGenerator(schema_yaml, xsd_anyuri_as_iri=True, type_objects=False) + g = gen.as_graph() + + curie_types = set(g.objects(compact_id_uri, RDF.type)) + assert OWL.DatatypeProperty in curie_types, f"curie slot must remain DatatypeProperty with flag, got {curie_types}" + assert OWL.ObjectProperty not in curie_types, ( + f"curie slot must NOT become ObjectProperty with flag, got {curie_types}" + ) + + # Sanity: uri must become ObjectProperty + uri_types = set(g.objects(homepage_uri, RDF.type)) + assert OWL.ObjectProperty in uri_types, f"uri slot should be ObjectProperty with flag, got {uri_types}" + + +def test_xsd_anyuri_as_iri_cli_flag(): + """Verify the ``--xsd-anyuri-as-iri`` flag is wired through Click.""" + import tempfile + from pathlib import Path + + from click.testing import CliRunner + + from linkml.generators.jsonldcontextgen import cli + + schema_yaml = """ +id: https://example.org/test-cli +name: test_cli + +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + +imports: + - linkml:types + +default_prefix: ex +default_range: string + +slots: + homepage: + range: uri + slot_uri: ex:homepage + +classes: + Thing: + slots: + - homepage +""" + with tempfile.TemporaryDirectory() as tmpdir: + schema_path = Path(tmpdir) / "test.yaml" + 
schema_path.write_text(schema_yaml) + + runner = CliRunner() + + # Without flag + result_default = runner.invoke(cli, [str(schema_path)]) + assert result_default.exit_code == 0, result_default.output + ctx_default = json.loads(result_default.output)["@context"] + assert ctx_default["homepage"]["@type"] == "xsd:anyURI" + + # With flag + result_iri = runner.invoke(cli, [str(schema_path), "--xsd-anyuri-as-iri"]) + assert result_iri.exit_code == 0, result_iri.output + ctx_iri = json.loads(result_iri.output)["@context"] + assert ctx_iri["homepage"]["@type"] == "@id" + + +def test_normalize_prefixes_renames_nonstandard_alias(tmp_path): + """When --normalize-prefixes is set, non-standard aliases are replaced by rdflib defaults. + + rdflib binds ``dc`` to ``http://purl.org/dc/elements/1.1/`` by default. + A schema that declares ``dce`` for the same URI should have it normalised + to ``dc`` when the flag is enabled. + + See: rdflib default namespace bindings. + """ + schema = tmp_path / "schema.yaml" + schema.write_text( + """\ +id: https://example.org/test +name: test_normalize +default_prefix: ex +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + dce: http://purl.org/dc/elements/1.1/ +imports: + - linkml:types +classes: + Record: + class_uri: ex:Record + attributes: + title: + range: string + slot_uri: dce:title +""", + encoding="utf-8", + ) + + # Flag OFF (default): non-standard alias preserved + ctx_off = json.loads(ContextGenerator(str(schema), normalize_prefixes=False).serialize())["@context"] + assert "dce" in ctx_off, "With flag off, original prefix 'dce' must be preserved" + + # Flag ON: rdflib default name used + ctx_on = json.loads(ContextGenerator(str(schema), normalize_prefixes=True).serialize())["@context"] + assert "dc" in ctx_on, "With flag on, 'dce' should be normalised to 'dc'" + assert "dce" not in ctx_on, "With flag on, original alias 'dce' should be removed" + assert ctx_on["dc"] == "http://purl.org/dc/elements/1.1/" + + +def 
test_normalize_prefixes_default_is_off(tmp_path): + """The --normalize-prefixes flag defaults to False — no prefix renaming. + + Ensures backward compatibility: existing schemas produce identical output. + """ + schema = tmp_path / "schema.yaml" + schema.write_text( + """\ +id: https://example.org/test +name: test_default +default_prefix: ex +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + sdo: https://schema.org/ +imports: + - linkml:types +classes: + Thing: + class_uri: sdo:Thing + attributes: + name: + range: string + slot_uri: sdo:name +""", + encoding="utf-8", + ) + + ctx = json.loads(ContextGenerator(str(schema)).serialize())["@context"] + # Without the flag, the schema's own prefix name must be preserved + assert "sdo" in ctx, "Default behavior must preserve schema-declared prefix 'sdo'" + + +def test_normalize_prefixes_curie_remapping(tmp_path): + """CURIEs in element @id values use the normalised prefix name. + + When ``sdo`` is normalised to ``schema``, slot URIs like ``sdo:name`` + must appear as ``schema:name`` in the generated context. 
+ """ + schema = tmp_path / "schema.yaml" + schema.write_text( + """\ +id: https://example.org/test +name: test_curie +default_prefix: ex +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + sdo: https://schema.org/ +imports: + - linkml:types +classes: + Person: + class_uri: sdo:Person + attributes: + full_name: + range: string + slot_uri: sdo:name +""", + encoding="utf-8", + ) + + ctx = json.loads(ContextGenerator(str(schema), normalize_prefixes=True).serialize())["@context"] + # The prefix declaration must use the standard name + assert "schema" in ctx, "Normalised prefix 'schema' must appear" + # Element @id must use the normalised prefix + person = ctx.get("Person", {}) + assert person.get("@id", "").startswith("schema:"), ( + f"Person @id should use normalised prefix 'schema:', got {person}" + ) diff --git a/tests/linkml/test_generators/test_normalize_prefixes.py b/tests/linkml/test_generators/test_normalize_prefixes.py new file mode 100644 index 000000000..5eb3f5b87 --- /dev/null +++ b/tests/linkml/test_generators/test_normalize_prefixes.py @@ -0,0 +1,555 @@ +"""Tests for the --normalize-prefixes flag across all generators. + +Verifies that non-standard prefix aliases (e.g. ``sdo`` for ``https://schema.org/``) +are normalised to well-known names (e.g. ``schema``) consistently in OWL, SHACL, +and JSON-LD context output. 
+ +References: +- prefix.cc — community consensus RDF prefix registry +- rdflib 7.x curated default namespace bindings +- W3C Turtle §2.4 — prefix declarations are syntactic sugar +""" + +import json +import logging +import re +import textwrap + +# ── Shared test schema ────────────────────────────────────────────── + +SCHEMA_SDO = textwrap.dedent("""\ + id: https://example.org/test + name: test_normalize + default_prefix: ex + prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + sdo: https://schema.org/ + imports: + - linkml:types + classes: + Person: + class_uri: sdo:Person + attributes: + full_name: + range: string + slot_uri: sdo:name +""") + +SCHEMA_DCE = textwrap.dedent("""\ + id: https://example.org/test + name: test_normalize_dce + default_prefix: ex + prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + dce: http://purl.org/dc/elements/1.1/ + imports: + - linkml:types + classes: + Record: + class_uri: ex:Record + attributes: + title: + range: string + slot_uri: dce:title +""") + +# HTTP variant — linkml-runtime historically binds schema: http://schema.org/ +# while rdflib (and the W3C) prefer https://schema.org/. The normalize flag +# must handle both. +SCHEMA_HTTP_SDO = textwrap.dedent("""\ + id: https://example.org/test + name: test_http_schema + default_prefix: ex + prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + sdo: http://schema.org/ + imports: + - linkml:types + classes: + Place: + class_uri: sdo:Place + attributes: + geo: + range: string + slot_uri: sdo:geo +""") + +# Collision scenario: user declares 'foaf' for a custom namespace AND 'myfoaf' +# for http://xmlns.com/foaf/0.1/. Normalisation must NOT clobber the user's 'foaf'. +# Uses 'foaf' instead of 'schema' because 'schema' is declared in linkml:types, +# which causes a SchemaLoader merge conflict before normalisation even runs. 
+SCHEMA_COLLISION = textwrap.dedent("""\ + id: https://example.org/test + name: test_collision + default_prefix: ex + prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + foaf: https://something-else.org/ + myfoaf: http://xmlns.com/foaf/0.1/ + imports: + - linkml:types + classes: + Agent: + class_uri: myfoaf:Agent + attributes: + label: + range: string + slot_uri: myfoaf:name +""") + + +def _write_schema(tmp_path, content: str, name: str = "schema.yaml") -> str: + """Write schema content to a temporary file and return its path as string.""" + p = tmp_path / name + p.write_text(content, encoding="utf-8") + return str(p) + + +def _turtle_prefixes(ttl: str) -> dict[str, str]: + """Extract @prefix declarations from Turtle output → {prefix: namespace}.""" + result = {} + for m in re.finditer(r"@prefix\s+(\w+):\s+<([^>]+)>", ttl): + result[m.group(1)] = m.group(2) + return result + + +# ── OWL Generator Tests ───────────────────────────────────────────── + + +class TestOwlNormalizePrefixes: + """OWL generator prefix normalisation tests.""" + + def test_sdo_normalised_to_schema(self, tmp_path): + """sdo → schema when --normalize-prefixes is active.""" + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + ttl = OwlSchemaGenerator(schema_path, normalize_prefixes=True).serialize() + pfx = _turtle_prefixes(ttl) + assert "schema" in pfx, f"Expected 'schema' prefix in OWL output, got: {sorted(pfx)}" + assert pfx["schema"] == "https://schema.org/" + assert "sdo" not in pfx, "Non-standard 'sdo' prefix should be removed" + + def test_flag_off_preserves_original(self, tmp_path): + """Without the flag, schema-declared prefix names are preserved.""" + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + ttl = OwlSchemaGenerator(schema_path, normalize_prefixes=False).serialize() + pfx = _turtle_prefixes(ttl) + assert "sdo" in pfx, "With flag off, 
original prefix 'sdo' must be preserved" + + def test_dce_normalised_to_dc(self, tmp_path): + """dce → dc for http://purl.org/dc/elements/1.1/ in graph bindings. + + Note: rdflib's Turtle serializer only emits @prefix declarations for + namespaces actually used in triples. Since the OWL generator may not + produce triples using dc:elements URIs for simple attribute schemas, + we verify the graph's namespace bindings directly. + """ + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_DCE) + gen = OwlSchemaGenerator(schema_path, normalize_prefixes=True) + graph = gen.as_graph() + bound = {str(p): str(n) for p, n in graph.namespaces()} + assert "dc" in bound, f"Expected 'dc' in graph bindings, got: {sorted(bound)}" + assert bound["dc"] == "http://purl.org/dc/elements/1.1/" + + def test_custom_prefix_not_affected(self, tmp_path): + """Domain-specific prefixes (e.g. 'ex') are not touched by normalisation.""" + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + ttl = OwlSchemaGenerator(schema_path, normalize_prefixes=True).serialize() + pfx = _turtle_prefixes(ttl) + assert "ex" in pfx, "Custom prefix 'ex' must survive normalisation" + assert pfx["ex"] == "https://example.org/" + + def test_http_schema_org_normalised(self, tmp_path): + """http://schema.org/ (HTTP variant) also normalises to 'schema'. + + The linkml-runtime historically binds ``schema: http://schema.org/`` + while the W3C and rdflib prefer ``https://schema.org/``. Both + variants must be recognised by the static well-known prefix map. 
+ """ + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_HTTP_SDO) + ttl = OwlSchemaGenerator(schema_path, normalize_prefixes=True).serialize() + pfx = _turtle_prefixes(ttl) + assert "schema" in pfx, f"Expected 'schema' prefix for http://schema.org/, got: {sorted(pfx)}" + assert "sdo" not in pfx + + def test_no_schema1_from_runtime_http_binding(self, tmp_path): + """Runtime-injected ``schema: http://schema.org/`` must not create ``schema1``. + + The linkml metamodel (types.yaml) declares ``schema: http://schema.org/`` + (HTTP). When a user schema declares ``sdo: https://schema.org/`` (HTTPS), + normalisation must clean up *both* variants so the output never contains + auto-generated suffixed prefixes like ``schema1``. + """ + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + ttl = OwlSchemaGenerator(schema_path, normalize_prefixes=True).serialize() + pfx = _turtle_prefixes(ttl) + suffixed = [p for p in pfx if re.match(r"schema\d+", p)] + assert not suffixed, ( + f"Auto-generated suffixed prefix(es) {suffixed} found — " + "runtime http://schema.org/ binding was not cleaned up" + ) + + +# ── SHACL Generator Tests ─────────────────────────────────────────── + + +class TestShaclNormalizePrefixes: + """SHACL generator prefix normalisation tests.""" + + def test_sdo_normalised_to_schema(self, tmp_path): + """sdo → schema when --normalize-prefixes is active.""" + from linkml.generators.shaclgen import ShaclGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + ttl = ShaclGenerator(schema_path, normalize_prefixes=True).serialize() + pfx = _turtle_prefixes(ttl) + assert "schema" in pfx, f"Expected 'schema' prefix in SHACL output, got: {sorted(pfx)}" + assert pfx["schema"] == "https://schema.org/" + assert "sdo" not in pfx, "Non-standard 'sdo' prefix should be removed" + + def test_flag_off_preserves_original(self, tmp_path): + """Without the flag, 
schema-declared prefix names are preserved.""" + from linkml.generators.shaclgen import ShaclGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + ttl = ShaclGenerator(schema_path, normalize_prefixes=False).serialize() + pfx = _turtle_prefixes(ttl) + assert "sdo" in pfx, "With flag off, original prefix 'sdo' must be preserved" + + def test_dce_normalised_to_dc(self, tmp_path): + """dce → dc for http://purl.org/dc/elements/1.1/.""" + from linkml.generators.shaclgen import ShaclGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_DCE) + ttl = ShaclGenerator(schema_path, normalize_prefixes=True).serialize() + pfx = _turtle_prefixes(ttl) + assert "dc" in pfx, f"Expected 'dc' prefix in SHACL output, got: {sorted(pfx)}" + assert pfx["dc"] == "http://purl.org/dc/elements/1.1/" + assert "dce" not in pfx, "Non-standard 'dce' prefix should be removed" + + def test_custom_prefix_not_affected(self, tmp_path): + """Domain-specific prefixes (e.g. 'ex') are not touched by normalisation. + + Note: rdflib only emits @prefix for namespaces used in triples. + We verify graph bindings directly. 
+ """ + from linkml.generators.shaclgen import ShaclGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + gen = ShaclGenerator(schema_path, normalize_prefixes=True) + graph = gen.as_graph() + bound = {str(p): str(n) for p, n in graph.namespaces()} + assert "ex" in bound, f"Custom prefix 'ex' must survive in graph bindings, got: {sorted(bound)}" + assert bound["ex"] == "https://example.org/" + + def test_http_schema_org_normalised(self, tmp_path): + """http://schema.org/ (HTTP variant) also normalises to 'schema'.""" + from linkml.generators.shaclgen import ShaclGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_HTTP_SDO) + ttl = ShaclGenerator(schema_path, normalize_prefixes=True).serialize() + pfx = _turtle_prefixes(ttl) + assert "schema" in pfx, f"Expected 'schema' prefix for http://schema.org/, got: {sorted(pfx)}" + assert "sdo" not in pfx + + def test_no_schema1_from_runtime_http_binding(self, tmp_path): + """Runtime-injected ``schema: http://schema.org/`` must not create ``schema1``. + + Same scenario as the OWL test: linkml:types imports bring in + ``schema: http://schema.org/`` while the user schema has + ``sdo: https://schema.org/``. Phase 2 of normalisation must + clean up the orphaned HTTP binding. 
+ """ + from linkml.generators.shaclgen import ShaclGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + ttl = ShaclGenerator(schema_path, normalize_prefixes=True).serialize() + pfx = _turtle_prefixes(ttl) + suffixed = [p for p in pfx if re.match(r"schema\d+", p)] + assert not suffixed, ( + f"Auto-generated suffixed prefix(es) {suffixed} found — " + "runtime http://schema.org/ binding was not cleaned up" + ) + + +# ── JSON-LD Context Generator Tests ───────────────────────────────── + + +class TestContextNormalizePrefixes: + """JSON-LD context generator prefix normalisation tests (supplements existing tests).""" + + def test_http_schema_org_normalised(self, tmp_path): + """http://schema.org/ (HTTP variant) normalises to 'schema' in JSON-LD context. + + This covers the edge case where linkml-runtime's ``schema: http://schema.org/`` + conflicts with rdflib's ``schema: https://schema.org/``. The stale binding + must be removed and replaced with the correct one. + """ + from linkml.generators.jsonldcontextgen import ContextGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_HTTP_SDO) + ctx = json.loads(ContextGenerator(schema_path, normalize_prefixes=True).serialize())["@context"] + assert "schema" in ctx, "HTTP schema.org should normalise to 'schema'" + assert "sdo" not in ctx, "Non-standard 'sdo' should be removed" + # The namespace URI must match the schema-declared one (http, not https) + schema_val = ctx["schema"] + if isinstance(schema_val, dict): + schema_val = schema_val.get("@id", "") + assert schema_val == "http://schema.org/", f"Namespace URI must be preserved: got {schema_val}" + + +# ── Static Prefix Map Tests ───────────────────────────────────────── + + +class TestWellKnownPrefixMap: + """Tests for the frozen static prefix map.""" + + def test_returns_dict(self): + from linkml.utils.generator import well_known_prefix_map + + wk = well_known_prefix_map() + assert isinstance(wk, dict) + assert len(wk) >= 29, f"Expected ≥29 entries, got 
{len(wk)}" + + def test_schema_https(self): + from linkml.utils.generator import well_known_prefix_map + + wk = well_known_prefix_map() + assert wk["https://schema.org/"] == "schema" + + def test_schema_http_variant(self): + """Both http and https schema.org must map to 'schema'.""" + from linkml.utils.generator import well_known_prefix_map + + wk = well_known_prefix_map() + assert wk["http://schema.org/"] == "schema" + + def test_dc_elements(self): + from linkml.utils.generator import well_known_prefix_map + + wk = well_known_prefix_map() + assert wk["http://purl.org/dc/elements/1.1/"] == "dc" + + def test_returns_copy(self): + """Callers should not be able to mutate the internal map.""" + from linkml.utils.generator import well_known_prefix_map + + wk1 = well_known_prefix_map() + wk1["http://example.org/"] = "test" + wk2 = well_known_prefix_map() + assert "http://example.org/" not in wk2 + + def test_matches_rdflib_defaults(self): + """The static map must be a superset of rdflib's current defaults. + + This test documents the relationship: if rdflib adds new defaults in + a future version, this test will flag them for inclusion. 
+ """ + from rdflib import Graph as RdfGraph + + from linkml.utils.generator import well_known_prefix_map + + wk = well_known_prefix_map() + rdflib_map = {str(ns): str(pfx) for pfx, ns in RdfGraph().namespaces() if str(pfx)} + missing = {ns: pfx for ns, pfx in rdflib_map.items() if ns not in wk} + assert not missing, f"Static map missing rdflib defaults: {missing}" + + +# ── Cross-Generator Consistency Tests ──────────────────────────────── + + +class TestCrossGeneratorConsistency: + """Ensure all generators agree on prefix normalisation.""" + + def test_all_generators_normalise_sdo_to_schema(self, tmp_path): + """OWL, SHACL, and JSON-LD context must all use 'schema' for schema.org.""" + from linkml.generators.jsonldcontextgen import ContextGenerator + from linkml.generators.owlgen import OwlSchemaGenerator + from linkml.generators.shaclgen import ShaclGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + + owl_ttl = OwlSchemaGenerator(schema_path, normalize_prefixes=True).serialize() + shacl_ttl = ShaclGenerator(schema_path, normalize_prefixes=True).serialize() + ctx = json.loads(ContextGenerator(schema_path, normalize_prefixes=True).serialize())["@context"] + + owl_pfx = _turtle_prefixes(owl_ttl) + shacl_pfx = _turtle_prefixes(shacl_ttl) + + assert "schema" in owl_pfx, "OWL must use 'schema'" + assert "schema" in shacl_pfx, "SHACL must use 'schema'" + assert "schema" in ctx, "JSON-LD context must use 'schema'" + + assert "sdo" not in owl_pfx, "OWL must not have 'sdo'" + assert "sdo" not in shacl_pfx, "SHACL must not have 'sdo'" + assert "sdo" not in ctx, "JSON-LD context must not have 'sdo'" + + +# ── Prefix Collision Tests ──────────────────────────────────────────── + + +class TestPrefixCollision: + """Collision: user claims the standard prefix name for a different namespace.""" + + def test_owl_collision_skips_rename(self, tmp_path, caplog): + """OWL: myfoaf must NOT be renamed to 'foaf' when user claims that name.""" + from 
linkml.generators.owlgen import OwlSchemaGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_COLLISION) + with caplog.at_level(logging.WARNING): + gen = OwlSchemaGenerator(schema_path, normalize_prefixes=True) + graph = gen.as_graph() + bound = {str(p): str(n) for p, n in graph.namespaces()} + # myfoaf must NOT have been renamed to 'foaf' + assert "myfoaf" in bound, "Non-standard 'myfoaf' must remain when collision prevents renaming" + assert bound["myfoaf"] == "http://xmlns.com/foaf/0.1/" + # Warning emitted + assert "collision" in caplog.text.lower(), f"Expected collision warning, got: {caplog.text}" + + def test_shacl_collision_skips_rename(self, tmp_path, caplog): + """SHACL: myfoaf must NOT be renamed to 'foaf' when user claims that name.""" + from linkml.generators.shaclgen import ShaclGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_COLLISION) + with caplog.at_level(logging.WARNING): + gen = ShaclGenerator(schema_path, normalize_prefixes=True) + graph = gen.as_graph() + bound = {str(p): str(n) for p, n in graph.namespaces()} + assert "myfoaf" in bound, "Non-standard 'myfoaf' must remain when collision prevents renaming" + assert bound["myfoaf"] == "http://xmlns.com/foaf/0.1/" + assert "collision" in caplog.text.lower(), f"Expected collision warning, got: {caplog.text}" + + def test_context_collision_preserves_user_prefix(self, tmp_path, caplog): + """JSON-LD: user's 'foaf: https://something-else.org/' must survive.""" + from linkml.generators.jsonldcontextgen import ContextGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_COLLISION) + with caplog.at_level(logging.WARNING): + ctx = json.loads(ContextGenerator(schema_path, normalize_prefixes=True).serialize())["@context"] + # User's 'foaf' binding preserved + foaf_val = ctx.get("foaf") + if isinstance(foaf_val, dict): + foaf_val = foaf_val.get("@id", "") + assert foaf_val == "https://something-else.org/", f"User's 'foaf' binding must be preserved, got: {foaf_val}" + # myfoaf must 
remain (not renamed to foaf) + assert "myfoaf" in ctx, "Non-standard 'myfoaf' must remain when collision prevents renaming" + # Warning emitted + assert "collision" in caplog.text.lower(), f"Expected collision warning, got: {caplog.text}" + + +# ── JSONLDGenerator Flag Forwarding Tests ───────────────────────────── + + +class TestJSONLDGeneratorForwarding: + """Verify JSONLDGenerator propagates flags to its embedded ContextGenerator.""" + + def test_normalize_prefixes_forwarded(self, tmp_path): + """JSONLDGenerator must pass normalize_prefixes to embedded ContextGenerator. + + Without forwarding, the inline @context in JSON-LD output would keep + non-standard prefix aliases even when --normalize-prefixes is set. + """ + from linkml.generators.jsonldgen import JSONLDGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + out = JSONLDGenerator(schema_path, normalize_prefixes=True).serialize() + parsed = json.loads(out) + # The @context may be a list; find the dict entry + ctx = parsed.get("@context", {}) + if isinstance(ctx, list): + for item in ctx: + if isinstance(item, dict): + ctx = item + break + assert "sdo" not in ctx, "normalize_prefixes not forwarded: 'sdo' still in embedded @context" + + +# ── Phase 2 HTTP/HTTPS Overwrite Bug Tests ──────────────────────────── + + +class TestPhase2HttpsPreservation: + """Phase 2 must not overwrite Phase 1 HTTPS bindings with HTTP variants.""" + + def test_phase2_does_not_overwrite_https_with_http(self, tmp_path): + """When Phase 1 binds schema → https://schema.org/, Phase 2 must not + overwrite it with http://schema.org/ from the runtime metamodel. + + Reproduction: linkml:types imports bring schema: http://schema.org/ + (HTTP) while the user schema has sdo: https://schema.org/ (HTTPS). + Phase 1 normalises sdo → schema (HTTPS). Phase 2 must not then + rebind schema → http://schema.org/ when it encounters the runtime + HTTP binding. 
+ """ + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + gen = OwlSchemaGenerator(schema_path, normalize_prefixes=True) + graph = gen.as_graph() + bound = {str(p): str(n) for p, n in graph.namespaces()} + assert "schema" in bound, f"Expected 'schema' in bindings, got: {sorted(bound)}" + # MUST be HTTPS (from the user's schema), not HTTP (from runtime) + assert bound["schema"] == "https://schema.org/", ( + f"Phase 2 overwrote HTTPS with HTTP: schema bound to {bound['schema']}" + ) + + def test_normalize_graph_prefixes_phase2_guard(self): + """Direct unit test for the Phase 2 guard in normalize_graph_prefixes. + + Simulates the exact scenario: Phase 1 binds schema → https://schema.org/, + then Phase 2 encounters schema1 → http://schema.org/ and must NOT rebind. + """ + from rdflib import Graph, Namespace, URIRef + + from linkml.utils.generator import normalize_graph_prefixes + + g = Graph(bind_namespaces="none") + # Simulate Phase 1 result + g.bind("schema", Namespace("https://schema.org/")) + # Simulate runtime-injected HTTP variant (would appear as schema1) + g.bind("schema1", Namespace("http://schema.org/")) + # Add a triple so the graph isn't empty + g.add((URIRef("https://example.org/s"), URIRef("https://schema.org/name"), URIRef("https://example.org/o"))) + + normalize_graph_prefixes(g, {"sdo": "https://schema.org/"}) + + bound = {str(p): str(n) for p, n in g.namespaces()} + assert bound.get("schema") == "https://schema.org/", ( + f"Phase 2 guard failed: schema bound to {bound.get('schema')}" + ) + + def test_empty_schema_no_crash(self, tmp_path): + """A schema with no custom prefixes must not crash normalize_graph_prefixes.""" + from linkml.generators.owlgen import OwlSchemaGenerator + + (tmp_path / "empty.yaml").write_text( + textwrap.dedent("""\ + id: https://example.org/empty + name: empty + default_prefix: ex + prefixes: + linkml: https://w3id.org/linkml/ + ex: https://example.org/ + imports: + 
- linkml:types + """), + encoding="utf-8", + ) + # Should not raise + gen = OwlSchemaGenerator(str(tmp_path / "empty.yaml"), normalize_prefixes=True) + ttl = gen.serialize() + assert len(ttl) > 0 diff --git a/tests/linkml/test_generators/test_owlgen.py b/tests/linkml/test_generators/test_owlgen.py index 062864721..74c96c3e6 100644 --- a/tests/linkml/test_generators/test_owlgen.py +++ b/tests/linkml/test_generators/test_owlgen.py @@ -1,3 +1,4 @@ +import logging from enum import Enum import pytest @@ -460,6 +461,172 @@ def test_abstract_class_without_subclasses_gets_no_union_of_axiom(): assert _union_members(g, EX.Orphan) is None +def test_abstract_class_with_no_children_emits_warning(caplog): + """An abstract class with no children emits a warning about missing coverage. + + When an abstract class has zero subclasses, no covering axiom can be + generated. The warning alerts users that the class hierarchy is incomplete. + + See: mgskjaeveland's review on linkml/linkml#3309. + """ + sb = SchemaBuilder() + sb.add_class("Orphan", abstract=True) + sb.add_defaults() + + with caplog.at_level(logging.WARNING, logger="linkml.generators.owlgen"): + g = _owl_graph(sb) + + # No covering axiom emitted + assert _union_members(g, EX.Orphan) is None + + # But a warning must be logged + assert any("has no children" in msg for msg in caplog.messages), ( + "Expected a warning about abstract class with no children" + ) + assert any("No covering axiom" in msg for msg in caplog.messages), ( + "Warning should mention that no covering axiom will be generated" + ) + + +def test_no_children_warning_suppressed_by_skip_flag(caplog): + """When --skip-abstract-class-as-unionof-subclasses is set, no warning for zero children.""" + sb = SchemaBuilder() + sb.add_class("Orphan", abstract=True) + sb.add_defaults() + + with caplog.at_level(logging.WARNING, logger="linkml.generators.owlgen"): + _owl_graph(sb, skip_abstract_class_as_unionof_subclasses=True) + + assert not any("has no children" in msg 
for msg in caplog.messages) + + +def test_abstract_class_with_single_child_emits_warning(caplog): + """An abstract class with one child still gets a covering axiom but emits a warning. + + Per OWL 2 semantics, the covering axiom with a single child creates an + equivalence (Parent ≡ Child). This is logically correct but may surprise + users who plan to extend the ontology later. The generator should warn + and recommend ``--skip-abstract-class-as-unionof-subclasses``. + + See: W3C OWL 2 Primer §4.2 — bidirectional rdfs:subClassOf = equivalence. + See: mgskjaeveland's review on linkml/linkml#3309. + """ + sb = SchemaBuilder() + sb.add_class("GrandParent") + sb.add_class("Parent", is_a="GrandParent", abstract=True) + sb.add_class("Child", is_a="Parent") + sb.add_defaults() + + with caplog.at_level(logging.WARNING, logger="linkml.generators.owlgen"): + g = _owl_graph(sb) + + # Covering axiom IS still emitted (single child → equivalence is OWL-correct). + # With one child, _union_of returns the child URI directly (no owl:unionOf wrapper), + # so the covering axiom materialises as Parent rdfs:subClassOf Child. + # Combined with Child rdfs:subClassOf Parent (from is_a), this is the equivalence. + assert (EX.Parent, RDFS.subClassOf, EX.Child) in g, ( + "Covering axiom should produce Parent rdfs:subClassOf Child for single-child case" + ) + assert (EX.Child, RDFS.subClassOf, EX.Parent) in g + assert (EX.Parent, RDFS.subClassOf, EX.GrandParent) in g + + # But a warning must be logged + assert any("only 1 direct child" in msg for msg in caplog.messages), ( + "Expected a warning about single-child covering axiom creating equivalence" + ) + assert any("--skip-abstract-class-as-unionof-subclasses" in msg for msg in caplog.messages), ( + "Warning should recommend the skip flag" + ) + + +def test_single_child_warning_suppressed_by_skip_flag(caplog): + """When --skip-abstract-class-as-unionof-subclasses is set, no warning is emitted. 
+ + The skip flag suppresses covering axioms entirely, so the single-child + equivalence case never arises. + """ + sb = SchemaBuilder() + sb.add_class("Parent", abstract=True) + sb.add_class("Child", is_a="Parent") + sb.add_defaults() + + with caplog.at_level(logging.WARNING, logger="linkml.generators.owlgen"): + g = _owl_graph(sb, skip_abstract_class_as_unionof_subclasses=True) + + # No covering axiom emitted + assert (EX.Parent, RDFS.subClassOf, EX.Child) not in g + # No warning either + assert not any("only 1 direct child" in msg for msg in caplog.messages) + + +def test_multiple_children_no_warning(caplog): + """An abstract class with 2+ children must NOT emit a warning. + + The covering axiom is a proper union (not a degenerate equivalence), + so no warning is needed. + """ + sb = SchemaBuilder() + sb.add_class("Animal", abstract=True) + sb.add_class("Dog", is_a="Animal") + sb.add_class("Cat", is_a="Animal") + sb.add_defaults() + + with caplog.at_level(logging.WARNING, logger="linkml.generators.owlgen"): + g = _owl_graph(sb) + + # Covering axiom emitted (proper union) + members = _union_members(g, EX.Animal) + assert members == {EX.Dog, EX.Cat} + + # No warning about children count + assert not any("has no children" in msg for msg in caplog.messages) + assert not any("only 1 direct child" in msg for msg in caplog.messages) + + +def test_non_abstract_class_no_warning(caplog): + """A non-abstract class must NOT emit covering axiom warnings. + + Covering axioms only apply to abstract classes. Concrete classes + should be silently skipped regardless of child count. 
+ """ + sb = SchemaBuilder() + sb.add_class("Parent") # not abstract + sb.add_class("Child", is_a="Parent") + sb.add_defaults() + + with caplog.at_level(logging.WARNING, logger="linkml.generators.owlgen"): + g = _owl_graph(sb) + + # No covering axiom for non-abstract class + assert _union_members(g, EX.Parent) is None + assert (EX.Parent, RDFS.subClassOf, EX.Child) not in g + + # No warning either + assert not any("has no children" in msg for msg in caplog.messages) + assert not any("only 1 direct child" in msg for msg in caplog.messages) + + +def test_abstract_class_with_only_mixin_children_emits_warning(caplog): + """An abstract class whose only children are via mixins (not is_a) gets the no-children warning. + + The covering axiom only considers direct is_a children (not mixins). + If an abstract class has mixin children but no is_a children, it should + warn about having no children for covering axiom purposes. + """ + sb = SchemaBuilder() + sb.add_class("Base", abstract=True) + sb.add_class("MixinChild", mixins=["Base"]) + sb.add_defaults() + + with caplog.at_level(logging.WARNING, logger="linkml.generators.owlgen"): + g = _owl_graph(sb) + + assert _union_members(g, EX.Base) is None + assert any("has no children" in msg for msg in caplog.messages), ( + "Abstract class with only mixin children should warn about no is_a children" + ) + + @pytest.mark.parametrize("skip", [False, True]) def test_union_of_axiom_only_covers_direct_children(skip: bool): """Union-of axiom lists only direct is_a children, not grandchildren.