From dd0ce0c4567138e11010ba3087d42d4cd7b0c978 Mon Sep 17 00:00:00 2001 From: jdsika Date: Thu, 2 Apr 2026 17:21:36 +0200 Subject: [PATCH] feat(generators): add --normalize-prefixes flag for well-known prefix names Add an opt-in --normalize-prefixes flag to OWL, SHACL, and JSON-LD Context generators that normalises non-standard prefix aliases to well-known names from a static prefix map (derived from rdflib 7.x defaults, cross-checked against prefix.cc consensus). Key design decisions: - Static frozen map (MappingProxyType) instead of runtime Graph().namespaces() lookup eliminates rdflib version dependency - Both http://schema.org/ and https://schema.org/ map to 'schema' - Shared normalize_graph_prefixes() helper used by OWL and SHACL - Two-phase graph normalisation: Phase 1 normalises schema-declared prefixes, Phase 2 cleans up runtime-injected bindings - Collision detection: skip with warning when standard prefix name is already user-declared for a different namespace - Phase 2 guard prevents overwriting HTTPS bindings with HTTP variants The flag defaults to off, preserving existing behaviour. Tests cover OWL, SHACL, and context generators with sdo->schema, dce->dc, http/https edge case, custom prefix preservation, flag-off backward compatibility, cross-generator consistency, prefix collision detection, schema1 regression prevention, Phase 2 HTTPS guard, empty schema edge case, and static map integrity. 
Signed-off-by: jdsika --- .../src/linkml/generators/jsonldcontextgen.py | 82 ++- .../linkml/src/linkml/generators/jsonldgen.py | 5 + .../linkml/src/linkml/generators/owlgen.py | 6 +- .../linkml/src/linkml/generators/shaclgen.py | 6 +- packages/linkml/src/linkml/utils/generator.py | 154 ++++- .../test_generators/test_jsonldcontextgen.py | 115 ++++ .../test_normalize_prefixes.py | 555 ++++++++++++++++++ 7 files changed, 918 insertions(+), 5 deletions(-) create mode 100644 tests/linkml/test_generators/test_normalize_prefixes.py diff --git a/packages/linkml/src/linkml/generators/jsonldcontextgen.py b/packages/linkml/src/linkml/generators/jsonldcontextgen.py index 60eaa9ffd..872e92ea7 100644 --- a/packages/linkml/src/linkml/generators/jsonldcontextgen.py +++ b/packages/linkml/src/linkml/generators/jsonldcontextgen.py @@ -15,7 +15,7 @@ from linkml._version import __version__ from linkml.utils.deprecation import deprecated_fields -from linkml.utils.generator import Generator, shared_arguments +from linkml.utils.generator import Generator, shared_arguments, well_known_prefix_map from linkml_runtime.linkml_model.meta import ClassDefinition, SlotDefinition from linkml_runtime.linkml_model.types import SHEX from linkml_runtime.utils.formatutils import camelcase, underscore @@ -66,6 +66,9 @@ class ContextGenerator(Generator): frame_root: str | None = None def __post_init__(self) -> None: + # Must be set before super().__post_init__() because the parent triggers + # the visitor pattern (visit_schema), which accesses _prefix_remap. + self._prefix_remap: dict[str, str] = {} super().__post_init__() if self.namespaces is None: raise TypeError("Schema text must be supplied to context generator. 
Preparsed schema will not work") @@ -80,8 +83,14 @@ def __post_init__(self) -> None: self._local_classes = set(sv.all_classes(imports=False).keys()) self._local_slots = set(sv.all_slots(imports=False).keys()) + def add_prefix(self, ncname: str) -> None: + """Add a prefix, applying well-known prefix normalisation when enabled.""" + super().add_prefix(self._prefix_remap.get(ncname, ncname)) + def visit_schema(self, base: str | Namespace | None = None, output: str | None = None, **_): - # Add any explicitly declared prefixes + # Add any explicitly declared prefixes. + # Direct .add() is safe here: the normalisation block below explicitly + # rewrites emit_prefixes entries for any renamed prefixes (Cases 1-3). for prefix in self.schema.prefixes.values(): self.emit_prefixes.add(prefix.prefix_prefix) @@ -89,6 +98,68 @@ def visit_schema(self, base: str | Namespace | None = None, output: str | None = for pfx in self.schema.emit_prefixes: self.add_prefix(pfx) + # Normalise well-known prefix names when --normalize-prefixes is set. + # If the schema declares a non-standard alias for a namespace that has + # a well-known standard name (e.g. ``sdo`` for + # ``https://schema.org/``), replace the alias with the standard name + # so that generated JSON-LD contexts use the conventional prefix. + # + # Three cases are handled: + # 1. Standard prefix is not yet bound → just rebind from old to new. + # 2. Standard prefix is bound to a *different* URI: + # a. User-declared (in schema.prefixes) → collision, skip with warning. + # b. Runtime default (e.g. linkml-runtime's ``schema: http://…``) + # → remove stale binding, then rebind. + # 3. Standard prefix is already bound to the *same* URI (duplicate) + # → just drop the non-standard alias. + # + # A remap dict is stored for ``_build_element_id`` because + # ``prefix_suffix()`` splits CURIEs on ``:`` without looking up the + # namespace dict. 
+ self._prefix_remap.clear() + if self.normalize_prefixes: + wk = well_known_prefix_map() + for old_pfx in list(self.namespaces): + url = str(self.namespaces[old_pfx]) + std_pfx = wk.get(url) + if not std_pfx or std_pfx == old_pfx: + continue + if std_pfx in self.namespaces: + if str(self.namespaces[std_pfx]) != url: + # Case 2: std_pfx is bound to a different URI. + # If the user explicitly declared std_pfx in the schema, + # it is intentional — skip to avoid data loss. + if std_pfx in self.schema.prefixes: + self.logger.warning( + "Prefix collision: cannot rename '%s' to '%s' because '%s' is " + "already declared for <%s>; skipping normalisation for <%s>", + old_pfx, + std_pfx, + std_pfx, + str(self.namespaces[std_pfx]), + url, + ) + continue + # Not user-declared (e.g. linkml-runtime default) — safe to remove + self.emit_prefixes.discard(std_pfx) + del self.namespaces[std_pfx] + else: + # Case 3: standard prefix already bound to same URI + # — just drop the non-standard alias + del self.namespaces[old_pfx] + if old_pfx in self.emit_prefixes: + self.emit_prefixes.discard(old_pfx) + self.emit_prefixes.add(std_pfx) + self._prefix_remap[old_pfx] = std_pfx + continue + # Case 1 (or Case 2 after stale removal): bind standard name + self.namespaces[std_pfx] = self.namespaces[old_pfx] + del self.namespaces[old_pfx] + if old_pfx in self.emit_prefixes: + self.emit_prefixes.discard(old_pfx) + self.emit_prefixes.add(std_pfx) + self._prefix_remap[old_pfx] = std_pfx + # Add the default prefix if self.schema.default_prefix: dflt = self.namespaces.prefix_for(self.schema.default_prefix) @@ -96,6 +167,8 @@ def visit_schema(self, base: str | Namespace | None = None, output: str | None = self.default_ns = dflt if self.default_ns: default_uri = self.namespaces[self.default_ns] + # Direct .add() is safe: default_ns is already resolved from + # the (possibly normalised) namespace bindings above. 
self.emit_prefixes.add(self.default_ns) else: default_uri = self.schema.default_prefix @@ -310,6 +383,11 @@ def _build_element_id(self, definition: Any, uri: str) -> None: @return: None """ uri_prefix, uri_suffix = self.namespaces.prefix_suffix(uri) + # Apply well-known prefix normalisation (e.g. sdo → schema). + # prefix_suffix() splits CURIEs on ':' without checking the + # namespace dict, so it may return a stale alias. + if uri_prefix and uri_prefix in self._prefix_remap: + uri_prefix = self._prefix_remap[uri_prefix] is_default_namespace = uri_prefix == self.context_body["@vocab"] or uri_prefix == self.namespaces.prefix_for( self.context_body["@vocab"] ) diff --git a/packages/linkml/src/linkml/generators/jsonldgen.py b/packages/linkml/src/linkml/generators/jsonldgen.py index c974e762d..67dd942e2 100644 --- a/packages/linkml/src/linkml/generators/jsonldgen.py +++ b/packages/linkml/src/linkml/generators/jsonldgen.py @@ -178,6 +178,11 @@ def end_schema(self, context: str | Sequence[str] | None = None, context_kwargs: # TODO: The _visit function above alters the schema in situ # force some context_kwargs context_kwargs["metadata"] = False + # Forward generator flags so prefix normalisation and deterministic + # output propagate into the inline @context produced for JSON-LD. 
+ for flag in ("normalize_prefixes", "deterministic"): + if hasattr(self, flag): + context_kwargs.setdefault(flag, getattr(self, flag)) add_prefixes = ContextGenerator(self.original_schema, **context_kwargs).serialize() add_prefixes_json = loads(add_prefixes) metamodel_ctx = self.metamodel_context or METAMODEL_CONTEXT_URI diff --git a/packages/linkml/src/linkml/generators/owlgen.py b/packages/linkml/src/linkml/generators/owlgen.py index 33c58b0ec..2e0bc17b6 100644 --- a/packages/linkml/src/linkml/generators/owlgen.py +++ b/packages/linkml/src/linkml/generators/owlgen.py @@ -20,7 +20,7 @@ from linkml import METAMODEL_NAMESPACE_NAME from linkml._version import __version__ from linkml.utils.deprecation import deprecation_warning -from linkml.utils.generator import Generator, shared_arguments +from linkml.utils.generator import Generator, normalize_graph_prefixes, shared_arguments from linkml_runtime import SchemaView from linkml_runtime.linkml_model.meta import ( AnonymousClassExpression, @@ -233,6 +233,10 @@ def as_graph(self) -> Graph: self.graph.bind(prefix, self.metamodel.namespaces[prefix]) for pfx in schema.prefixes.values(): self.graph.namespace_manager.bind(pfx.prefix_prefix, URIRef(pfx.prefix_reference)) + if self.normalize_prefixes: + normalize_graph_prefixes( + graph, {str(v.prefix_prefix): str(v.prefix_reference) for v in schema.prefixes.values()} + ) graph.add((base, RDF.type, OWL.Ontology)) # Add main schema elements diff --git a/packages/linkml/src/linkml/generators/shaclgen.py b/packages/linkml/src/linkml/generators/shaclgen.py index 5425051e3..97d09bfe6 100644 --- a/packages/linkml/src/linkml/generators/shaclgen.py +++ b/packages/linkml/src/linkml/generators/shaclgen.py @@ -13,7 +13,7 @@ from linkml.generators.common.subproperty import get_subproperty_values, is_uri_range from linkml.generators.shacl.shacl_data_type import ShaclDataType from linkml.generators.shacl.shacl_ifabsent_processor import ShaclIfAbsentProcessor -from linkml.utils.generator 
import Generator, shared_arguments +from linkml.utils.generator import Generator, normalize_graph_prefixes, shared_arguments from linkml_runtime.linkml_model.meta import ClassDefinition, ElementName from linkml_runtime.utils.formatutils import underscore from linkml_runtime.utils.yamlutils import TypedNode, extended_float, extended_int, extended_str @@ -105,6 +105,10 @@ def as_graph(self) -> Graph: for pfx in self.schema.prefixes.values(): g.bind(str(pfx.prefix_prefix), pfx.prefix_reference) + if self.normalize_prefixes: + normalize_graph_prefixes( + g, {str(v.prefix_prefix): str(v.prefix_reference) for v in self.schema.prefixes.values()} + ) for c in sv.all_classes(imports=not self.exclude_imports).values(): diff --git a/packages/linkml/src/linkml/utils/generator.py b/packages/linkml/src/linkml/utils/generator.py index 88fc48585..47b509449 100644 --- a/packages/linkml/src/linkml/utils/generator.py +++ b/packages/linkml/src/linkml/utils/generator.py @@ -20,11 +20,12 @@ import os import re import sys +import types from collections.abc import Callable, Mapping from dataclasses import dataclass, field from functools import lru_cache from pathlib import Path -from typing import ClassVar, TextIO, Union, cast +from typing import TYPE_CHECKING, ClassVar, TextIO, Union, cast import click from click import Argument, Command, Option @@ -58,6 +59,9 @@ from linkml_runtime.utils.formatutils import camelcase, underscore from linkml_runtime.utils.namespaces import Namespaces +if TYPE_CHECKING: + from rdflib import Graph + logger = logging.getLogger(__name__) @@ -78,6 +82,138 @@ def _resolved_metamodel(mergeimports): return metamodel +def well_known_prefix_map() -> dict[str, str]: + """Return a mapping from namespace URI to standard prefix name. + + Uses a frozen, version-independent map derived from rdflib 7.x curated + defaults (which align with the `prefix.cc <https://prefix.cc>`_ community + consensus registry). 
The map is **not** computed at runtime from + ``Graph().namespaces()`` because those defaults can change across rdflib + releases (they differ between 6.x and 7.x), which would silently alter + generator output. + + This allows generators to normalise non-standard prefix aliases + (e.g. ``sdo`` for ``https://schema.org/``) to their conventional names. + + Both ``http`` and ``https`` variants of schema.org are included because + the linkml-runtime historically binds ``schema: http://schema.org/`` + while rdflib (and the W3C) prefer ``https://schema.org/``. + """ + return dict(_WELL_KNOWN_PREFIX_MAP) + + +# Frozen, version-independent map: namespace URI → canonical prefix name. +# Source: rdflib 7.x defaults, cross-checked against https://prefix.cc +_WELL_KNOWN_PREFIX_MAP: types.MappingProxyType[str, str] = types.MappingProxyType( + { + "https://brickschema.org/schema/Brick#": "brick", + "http://www.w3.org/ns/csvw#": "csvw", + "http://purl.org/dc/elements/1.1/": "dc", + "http://purl.org/dc/dcam/": "dcam", + "http://www.w3.org/ns/dcat#": "dcat", + "http://purl.org/dc/dcmitype/": "dcmitype", + "http://purl.org/dc/terms/": "dcterms", + "http://usefulinc.com/ns/doap#": "doap", + "http://xmlns.com/foaf/0.1/": "foaf", + "http://www.opengis.net/ont/geosparql#": "geo", + "http://www.w3.org/ns/odrl/2/": "odrl", + "http://www.w3.org/ns/org#": "org", + "http://www.w3.org/2002/07/owl#": "owl", + "http://www.w3.org/ns/dx/prof/": "prof", + "http://www.w3.org/ns/prov#": "prov", + "http://purl.org/linked-data/cube#": "qb", + "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf", + "http://www.w3.org/2000/01/rdf-schema#": "rdfs", + "https://schema.org/": "schema", + "http://schema.org/": "schema", # HTTP variant (linkml-runtime uses this) + "http://www.w3.org/ns/shacl#": "sh", + "http://www.w3.org/2004/02/skos/core#": "skos", + "http://www.w3.org/ns/sosa/": "sosa", + "http://www.w3.org/ns/ssn/": "ssn", + "http://www.w3.org/2006/time#": "time", + "http://purl.org/vocab/vann/": "vann", 
+ "http://rdfs.org/ns/void#": "void", + "https://www.w3.org/2003/01/geo/wgs84_pos#": "wgs", + "http://www.w3.org/2003/01/geo/wgs84_pos#": "wgs", # HTTP variant (W3C canonical) + "http://www.w3.org/XML/1998/namespace": "xml", + "http://www.w3.org/2001/XMLSchema#": "xsd", + } +) + + +def normalize_graph_prefixes(graph: "Graph", schema_prefixes: dict[str, str]) -> None: + """Normalise non-standard prefix aliases in an rdflib Graph. + + For each prefix bound in *schema_prefixes* (mapping prefix name → + namespace URI), check whether ``well_known_prefix_map()`` knows a + standard name for that URI. If the standard name differs from the + schema-declared name, rebind the namespace to the standard name. + + This is the **shared implementation** used by OWL, SHACL, and (via a + different code-path) JSON-LD context generators so that all serialisation + formats agree on prefix names when ``--normalize-prefixes`` is active. + + :param graph: rdflib Graph whose namespace bindings should be adjusted. + :param schema_prefixes: mapping of prefix name → namespace URI string, + typically from ``schema.prefixes``. + """ + from rdflib import Namespace + + wk = well_known_prefix_map() + + # Phase 1: normalise schema-declared prefixes. + for old_pfx, ns_uri in schema_prefixes.items(): + ns_str = str(ns_uri) + std_pfx = wk.get(ns_str) + if not std_pfx or std_pfx == old_pfx: + continue + # Collision: the user explicitly declared std_pfx for a different + # namespace — do not clobber their binding. + if std_pfx in schema_prefixes and schema_prefixes[std_pfx] != ns_str: + logger.warning( + "Prefix collision: cannot rename '%s' to '%s' because '%s' is already " + "declared for <%s>; skipping normalisation for <%s>", + old_pfx, + std_pfx, + std_pfx, + schema_prefixes[std_pfx], + ns_str, + ) + continue + # Rebind: remove old prefix, add standard prefix. + # ``replace=True`` forces the new prefix even if the prefix name + # is already bound to a different namespace. 
+ graph.bind(std_pfx, Namespace(ns_str), override=True, replace=True) + + # Phase 2: normalise runtime-injected bindings (e.g. metamodel defaults). + # The linkml-runtime / rdflib may inject well-known namespaces under + # non-standard prefix names. After Phase 1 rebinds schema-declared + # prefixes, orphaned runtime bindings can appear as ``schema1``, ``dc0``, + # etc. Scan the graph's current bindings and fix any that map to a + # well-known namespace under a non-standard name, provided the standard + # name isn't already claimed by the user for a different namespace. + # + # Guard: if Phase 1 already bound std_pfx to a different URI (e.g. + # ``schema`` → ``https://schema.org/``), do not clobber it with the + # HTTP variant (``http://schema.org/``). Build a snapshot of the + # current bindings after Phase 1 to detect this. + current_bindings = {str(p): str(n) for p, n in graph.namespaces()} + for pfx, ns in list(graph.namespaces()): + pfx_str, ns_str = str(pfx), str(ns) + std_pfx = wk.get(ns_str) + if not std_pfx or std_pfx == pfx_str: + continue + # Same collision check as Phase 1: respect user-declared prefixes. + if std_pfx in schema_prefixes and schema_prefixes[std_pfx] != ns_str: + continue + # Guard: if std_pfx is already bound to a different (correct) URI + # by Phase 1, do not overwrite it. This prevents the HTTP variant + # of schema.org from clobbering the HTTPS binding. + if std_pfx in current_bindings and current_bindings[std_pfx] != ns_str: + continue + graph.bind(std_pfx, Namespace(ns_str), override=True, replace=True) + + @dataclass class Generator(metaclass=abc.ABCMeta): """ @@ -180,6 +316,12 @@ class Generator(metaclass=abc.ABCMeta): stacktrace: bool = False """True means print stack trace, false just error message""" + normalize_prefixes: bool = False + """True means normalise non-standard prefix aliases to well-known names + from the static ``_WELL_KNOWN_PREFIX_MAP`` (derived from rdflib 7.x + defaults / prefix.cc consensus). E.g. 
``sdo`` → ``schema`` for + ``https://schema.org/``.""" + include: str | Path | SchemaDefinition | None = None """If set, include extra schema outside of the imports mechanism""" @@ -986,6 +1128,16 @@ def decorator(f: Command) -> Command: callback=stacktrace_callback, ) ) + f.params.append( + Option( + ("--normalize-prefixes/--no-normalize-prefixes",), + default=False, + show_default=True, + help="Normalise non-standard prefix aliases to rdflib's curated default names " + "(e.g. sdo → schema for https://schema.org/). " + "Supported by OWL, SHACL, and JSON-LD Context generators.", + ) + ) return f diff --git a/tests/linkml/test_generators/test_jsonldcontextgen.py b/tests/linkml/test_generators/test_jsonldcontextgen.py index 6de23347a..e4579ffb0 100644 --- a/tests/linkml/test_generators/test_jsonldcontextgen.py +++ b/tests/linkml/test_generators/test_jsonldcontextgen.py @@ -571,3 +571,118 @@ def test_exclude_imports(input_path): # Imported class and slot must NOT be present assert "BaseClass" not in ctx, "Imported class 'BaseClass' must not appear in exclude-imports context" assert "baseProperty" not in ctx, "Imported slot 'baseProperty' must not appear in exclude-imports context" + + +def test_normalize_prefixes_renames_nonstandard_alias(tmp_path): + """When --normalize-prefixes is set, non-standard aliases are replaced by rdflib defaults. + + rdflib binds ``dc`` to ``http://purl.org/dc/elements/1.1/`` by default. + A schema that declares ``dce`` for the same URI should have it normalised + to ``dc`` when the flag is enabled. + + See: rdflib default namespace bindings. 
+ """ + schema = tmp_path / "schema.yaml" + schema.write_text( + """\ +id: https://example.org/test +name: test_normalize +default_prefix: ex +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + dce: http://purl.org/dc/elements/1.1/ +imports: + - linkml:types +classes: + Record: + class_uri: ex:Record + attributes: + title: + range: string + slot_uri: dce:title +""", + encoding="utf-8", + ) + + # Flag OFF (default): non-standard alias preserved + ctx_off = json.loads(ContextGenerator(str(schema), normalize_prefixes=False).serialize())["@context"] + assert "dce" in ctx_off, "With flag off, original prefix 'dce' must be preserved" + + # Flag ON: rdflib default name used + ctx_on = json.loads(ContextGenerator(str(schema), normalize_prefixes=True).serialize())["@context"] + assert "dc" in ctx_on, "With flag on, 'dce' should be normalised to 'dc'" + assert "dce" not in ctx_on, "With flag on, original alias 'dce' should be removed" + assert ctx_on["dc"] == "http://purl.org/dc/elements/1.1/" + + +def test_normalize_prefixes_default_is_off(tmp_path): + """The --normalize-prefixes flag defaults to False — no prefix renaming. + + Ensures backward compatibility: existing schemas produce identical output. + """ + schema = tmp_path / "schema.yaml" + schema.write_text( + """\ +id: https://example.org/test +name: test_default +default_prefix: ex +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + sdo: https://schema.org/ +imports: + - linkml:types +classes: + Thing: + class_uri: sdo:Thing + attributes: + name: + range: string + slot_uri: sdo:name +""", + encoding="utf-8", + ) + + ctx = json.loads(ContextGenerator(str(schema)).serialize())["@context"] + # Without the flag, the schema's own prefix name must be preserved + assert "sdo" in ctx, "Default behavior must preserve schema-declared prefix 'sdo'" + + +def test_normalize_prefixes_curie_remapping(tmp_path): + """CURIEs in element @id values use the normalised prefix name. 
+ + When ``sdo`` is normalised to ``schema``, slot URIs like ``sdo:name`` + must appear as ``schema:name`` in the generated context. + """ + schema = tmp_path / "schema.yaml" + schema.write_text( + """\ +id: https://example.org/test +name: test_curie +default_prefix: ex +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + sdo: https://schema.org/ +imports: + - linkml:types +classes: + Person: + class_uri: sdo:Person + attributes: + full_name: + range: string + slot_uri: sdo:name +""", + encoding="utf-8", + ) + + ctx = json.loads(ContextGenerator(str(schema), normalize_prefixes=True).serialize())["@context"] + # The prefix declaration must use the standard name + assert "schema" in ctx, "Normalised prefix 'schema' must appear" + # Element @id must use the normalised prefix + person = ctx.get("Person", {}) + assert person.get("@id", "").startswith("schema:"), ( + f"Person @id should use normalised prefix 'schema:', got {person}" + ) diff --git a/tests/linkml/test_generators/test_normalize_prefixes.py b/tests/linkml/test_generators/test_normalize_prefixes.py new file mode 100644 index 000000000..5eb3f5b87 --- /dev/null +++ b/tests/linkml/test_generators/test_normalize_prefixes.py @@ -0,0 +1,555 @@ +"""Tests for the --normalize-prefixes flag across all generators. + +Verifies that non-standard prefix aliases (e.g. ``sdo`` for ``https://schema.org/``) +are normalised to well-known names (e.g. ``schema``) consistently in OWL, SHACL, +and JSON-LD context output. 
+ +References: +- prefix.cc — community consensus RDF prefix registry +- rdflib 7.x curated default namespace bindings +- W3C Turtle §2.4 — prefix declarations are syntactic sugar +""" + +import json +import logging +import re +import textwrap + +# ── Shared test schema ────────────────────────────────────────────── + +SCHEMA_SDO = textwrap.dedent("""\ + id: https://example.org/test + name: test_normalize + default_prefix: ex + prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + sdo: https://schema.org/ + imports: + - linkml:types + classes: + Person: + class_uri: sdo:Person + attributes: + full_name: + range: string + slot_uri: sdo:name +""") + +SCHEMA_DCE = textwrap.dedent("""\ + id: https://example.org/test + name: test_normalize_dce + default_prefix: ex + prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + dce: http://purl.org/dc/elements/1.1/ + imports: + - linkml:types + classes: + Record: + class_uri: ex:Record + attributes: + title: + range: string + slot_uri: dce:title +""") + +# HTTP variant — linkml-runtime historically binds schema: http://schema.org/ +# while rdflib (and the W3C) prefer https://schema.org/. The normalize flag +# must handle both. +SCHEMA_HTTP_SDO = textwrap.dedent("""\ + id: https://example.org/test + name: test_http_schema + default_prefix: ex + prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + sdo: http://schema.org/ + imports: + - linkml:types + classes: + Place: + class_uri: sdo:Place + attributes: + geo: + range: string + slot_uri: sdo:geo +""") + +# Collision scenario: user declares 'foaf' for a custom namespace AND 'myfoaf' +# for http://xmlns.com/foaf/0.1/. Normalisation must NOT clobber the user's 'foaf'. +# Uses 'foaf' instead of 'schema' because 'schema' is declared in linkml:types, +# which causes a SchemaLoader merge conflict before normalisation even runs. 
+SCHEMA_COLLISION = textwrap.dedent("""\ + id: https://example.org/test + name: test_collision + default_prefix: ex + prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + foaf: https://something-else.org/ + myfoaf: http://xmlns.com/foaf/0.1/ + imports: + - linkml:types + classes: + Agent: + class_uri: myfoaf:Agent + attributes: + label: + range: string + slot_uri: myfoaf:name +""") + + +def _write_schema(tmp_path, content: str, name: str = "schema.yaml") -> str: + """Write schema content to a temporary file and return its path as string.""" + p = tmp_path / name + p.write_text(content, encoding="utf-8") + return str(p) + + +def _turtle_prefixes(ttl: str) -> dict[str, str]: + """Extract @prefix declarations from Turtle output → {prefix: namespace}.""" + result = {} + for m in re.finditer(r"@prefix\s+(\w+):\s+<([^>]+)>", ttl): + result[m.group(1)] = m.group(2) + return result + + +# ── OWL Generator Tests ───────────────────────────────────────────── + + +class TestOwlNormalizePrefixes: + """OWL generator prefix normalisation tests.""" + + def test_sdo_normalised_to_schema(self, tmp_path): + """sdo → schema when --normalize-prefixes is active.""" + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + ttl = OwlSchemaGenerator(schema_path, normalize_prefixes=True).serialize() + pfx = _turtle_prefixes(ttl) + assert "schema" in pfx, f"Expected 'schema' prefix in OWL output, got: {sorted(pfx)}" + assert pfx["schema"] == "https://schema.org/" + assert "sdo" not in pfx, "Non-standard 'sdo' prefix should be removed" + + def test_flag_off_preserves_original(self, tmp_path): + """Without the flag, schema-declared prefix names are preserved.""" + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + ttl = OwlSchemaGenerator(schema_path, normalize_prefixes=False).serialize() + pfx = _turtle_prefixes(ttl) + assert "sdo" in pfx, "With flag off, 
original prefix 'sdo' must be preserved" + + def test_dce_normalised_to_dc(self, tmp_path): + """dce → dc for http://purl.org/dc/elements/1.1/ in graph bindings. + + Note: rdflib's Turtle serializer only emits @prefix declarations for + namespaces actually used in triples. Since the OWL generator may not + produce triples using dc:elements URIs for simple attribute schemas, + we verify the graph's namespace bindings directly. + """ + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_DCE) + gen = OwlSchemaGenerator(schema_path, normalize_prefixes=True) + graph = gen.as_graph() + bound = {str(p): str(n) for p, n in graph.namespaces()} + assert "dc" in bound, f"Expected 'dc' in graph bindings, got: {sorted(bound)}" + assert bound["dc"] == "http://purl.org/dc/elements/1.1/" + + def test_custom_prefix_not_affected(self, tmp_path): + """Domain-specific prefixes (e.g. 'ex') are not touched by normalisation.""" + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + ttl = OwlSchemaGenerator(schema_path, normalize_prefixes=True).serialize() + pfx = _turtle_prefixes(ttl) + assert "ex" in pfx, "Custom prefix 'ex' must survive normalisation" + assert pfx["ex"] == "https://example.org/" + + def test_http_schema_org_normalised(self, tmp_path): + """http://schema.org/ (HTTP variant) also normalises to 'schema'. + + The linkml-runtime historically binds ``schema: http://schema.org/`` + while the W3C and rdflib prefer ``https://schema.org/``. Both + variants must be recognised by the static well-known prefix map. 
+ """ + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_HTTP_SDO) + ttl = OwlSchemaGenerator(schema_path, normalize_prefixes=True).serialize() + pfx = _turtle_prefixes(ttl) + assert "schema" in pfx, f"Expected 'schema' prefix for http://schema.org/, got: {sorted(pfx)}" + assert "sdo" not in pfx + + def test_no_schema1_from_runtime_http_binding(self, tmp_path): + """Runtime-injected ``schema: http://schema.org/`` must not create ``schema1``. + + The linkml metamodel (types.yaml) declares ``schema: http://schema.org/`` + (HTTP). When a user schema declares ``sdo: https://schema.org/`` (HTTPS), + normalisation must clean up *both* variants so the output never contains + auto-generated suffixed prefixes like ``schema1``. + """ + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + ttl = OwlSchemaGenerator(schema_path, normalize_prefixes=True).serialize() + pfx = _turtle_prefixes(ttl) + suffixed = [p for p in pfx if re.match(r"schema\d+", p)] + assert not suffixed, ( + f"Auto-generated suffixed prefix(es) {suffixed} found — " + "runtime http://schema.org/ binding was not cleaned up" + ) + + +# ── SHACL Generator Tests ─────────────────────────────────────────── + + +class TestShaclNormalizePrefixes: + """SHACL generator prefix normalisation tests.""" + + def test_sdo_normalised_to_schema(self, tmp_path): + """sdo → schema when --normalize-prefixes is active.""" + from linkml.generators.shaclgen import ShaclGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + ttl = ShaclGenerator(schema_path, normalize_prefixes=True).serialize() + pfx = _turtle_prefixes(ttl) + assert "schema" in pfx, f"Expected 'schema' prefix in SHACL output, got: {sorted(pfx)}" + assert pfx["schema"] == "https://schema.org/" + assert "sdo" not in pfx, "Non-standard 'sdo' prefix should be removed" + + def test_flag_off_preserves_original(self, tmp_path): + """Without the flag, 
schema-declared prefix names are preserved.""" + from linkml.generators.shaclgen import ShaclGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + ttl = ShaclGenerator(schema_path, normalize_prefixes=False).serialize() + pfx = _turtle_prefixes(ttl) + assert "sdo" in pfx, "With flag off, original prefix 'sdo' must be preserved" + + def test_dce_normalised_to_dc(self, tmp_path): + """dce → dc for http://purl.org/dc/elements/1.1/.""" + from linkml.generators.shaclgen import ShaclGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_DCE) + ttl = ShaclGenerator(schema_path, normalize_prefixes=True).serialize() + pfx = _turtle_prefixes(ttl) + assert "dc" in pfx, f"Expected 'dc' prefix in SHACL output, got: {sorted(pfx)}" + assert pfx["dc"] == "http://purl.org/dc/elements/1.1/" + assert "dce" not in pfx, "Non-standard 'dce' prefix should be removed" + + def test_custom_prefix_not_affected(self, tmp_path): + """Domain-specific prefixes (e.g. 'ex') are not touched by normalisation. + + Note: rdflib only emits @prefix for namespaces used in triples. + We verify graph bindings directly. 
+ """ + from linkml.generators.shaclgen import ShaclGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + gen = ShaclGenerator(schema_path, normalize_prefixes=True) + graph = gen.as_graph() + bound = {str(p): str(n) for p, n in graph.namespaces()} + assert "ex" in bound, f"Custom prefix 'ex' must survive in graph bindings, got: {sorted(bound)}" + assert bound["ex"] == "https://example.org/" + + def test_http_schema_org_normalised(self, tmp_path): + """http://schema.org/ (HTTP variant) also normalises to 'schema'.""" + from linkml.generators.shaclgen import ShaclGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_HTTP_SDO) + ttl = ShaclGenerator(schema_path, normalize_prefixes=True).serialize() + pfx = _turtle_prefixes(ttl) + assert "schema" in pfx, f"Expected 'schema' prefix for http://schema.org/, got: {sorted(pfx)}" + assert "sdo" not in pfx + + def test_no_schema1_from_runtime_http_binding(self, tmp_path): + """Runtime-injected ``schema: http://schema.org/`` must not create ``schema1``. + + Same scenario as the OWL test: linkml:types imports bring in + ``schema: http://schema.org/`` while the user schema has + ``sdo: https://schema.org/``. Phase 2 of normalisation must + clean up the orphaned HTTP binding. 
+ """ + from linkml.generators.shaclgen import ShaclGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + ttl = ShaclGenerator(schema_path, normalize_prefixes=True).serialize() + pfx = _turtle_prefixes(ttl) + suffixed = [p for p in pfx if re.match(r"schema\d+", p)] + assert not suffixed, ( + f"Auto-generated suffixed prefix(es) {suffixed} found — " + "runtime http://schema.org/ binding was not cleaned up" + ) + + +# ── JSON-LD Context Generator Tests ───────────────────────────────── + + +class TestContextNormalizePrefixes: + """JSON-LD context generator prefix normalisation tests (supplements existing tests).""" + + def test_http_schema_org_normalised(self, tmp_path): + """http://schema.org/ (HTTP variant) normalises to 'schema' in JSON-LD context. + + This covers the edge case where linkml-runtime's ``schema: http://schema.org/`` + conflicts with rdflib's ``schema: https://schema.org/``. The stale binding + must be removed and replaced with the correct one. + """ + from linkml.generators.jsonldcontextgen import ContextGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_HTTP_SDO) + ctx = json.loads(ContextGenerator(schema_path, normalize_prefixes=True).serialize())["@context"] + assert "schema" in ctx, "HTTP schema.org should normalise to 'schema'" + assert "sdo" not in ctx, "Non-standard 'sdo' should be removed" + # The namespace URI must match the schema-declared one (http, not https) + schema_val = ctx["schema"] + if isinstance(schema_val, dict): + schema_val = schema_val.get("@id", "") + assert schema_val == "http://schema.org/", f"Namespace URI must be preserved: got {schema_val}" + + +# ── Static Prefix Map Tests ───────────────────────────────────────── + + +class TestWellKnownPrefixMap: + """Tests for the frozen static prefix map.""" + + def test_returns_dict(self): + from linkml.utils.generator import well_known_prefix_map + + wk = well_known_prefix_map() + assert isinstance(wk, dict) + assert len(wk) >= 29, f"Expected ≥29 entries, got 
{len(wk)}" + + def test_schema_https(self): + from linkml.utils.generator import well_known_prefix_map + + wk = well_known_prefix_map() + assert wk["https://schema.org/"] == "schema" + + def test_schema_http_variant(self): + """Both http and https schema.org must map to 'schema'.""" + from linkml.utils.generator import well_known_prefix_map + + wk = well_known_prefix_map() + assert wk["http://schema.org/"] == "schema" + + def test_dc_elements(self): + from linkml.utils.generator import well_known_prefix_map + + wk = well_known_prefix_map() + assert wk["http://purl.org/dc/elements/1.1/"] == "dc" + + def test_returns_copy(self): + """Callers should not be able to mutate the internal map.""" + from linkml.utils.generator import well_known_prefix_map + + wk1 = well_known_prefix_map() + wk1["http://example.org/"] = "test" + wk2 = well_known_prefix_map() + assert "http://example.org/" not in wk2 + + def test_matches_rdflib_defaults(self): + """The static map must be a superset of rdflib's current defaults. + + This test documents the relationship: if rdflib adds new defaults in + a future version, this test will flag them for inclusion. 
+ """ + from rdflib import Graph as RdfGraph + + from linkml.utils.generator import well_known_prefix_map + + wk = well_known_prefix_map() + rdflib_map = {str(ns): str(pfx) for pfx, ns in RdfGraph().namespaces() if str(pfx)} + missing = {ns: pfx for ns, pfx in rdflib_map.items() if ns not in wk} + assert not missing, f"Static map missing rdflib defaults: {missing}" + + +# ── Cross-Generator Consistency Tests ──────────────────────────────── + + +class TestCrossGeneratorConsistency: + """Ensure all generators agree on prefix normalisation.""" + + def test_all_generators_normalise_sdo_to_schema(self, tmp_path): + """OWL, SHACL, and JSON-LD context must all use 'schema' for schema.org.""" + from linkml.generators.jsonldcontextgen import ContextGenerator + from linkml.generators.owlgen import OwlSchemaGenerator + from linkml.generators.shaclgen import ShaclGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + + owl_ttl = OwlSchemaGenerator(schema_path, normalize_prefixes=True).serialize() + shacl_ttl = ShaclGenerator(schema_path, normalize_prefixes=True).serialize() + ctx = json.loads(ContextGenerator(schema_path, normalize_prefixes=True).serialize())["@context"] + + owl_pfx = _turtle_prefixes(owl_ttl) + shacl_pfx = _turtle_prefixes(shacl_ttl) + + assert "schema" in owl_pfx, "OWL must use 'schema'" + assert "schema" in shacl_pfx, "SHACL must use 'schema'" + assert "schema" in ctx, "JSON-LD context must use 'schema'" + + assert "sdo" not in owl_pfx, "OWL must not have 'sdo'" + assert "sdo" not in shacl_pfx, "SHACL must not have 'sdo'" + assert "sdo" not in ctx, "JSON-LD context must not have 'sdo'" + + +# ── Prefix Collision Tests ──────────────────────────────────────────── + + +class TestPrefixCollision: + """Collision: user claims the standard prefix name for a different namespace.""" + + def test_owl_collision_skips_rename(self, tmp_path, caplog): + """OWL: myfoaf must NOT be renamed to 'foaf' when user claims that name.""" + from 
linkml.generators.owlgen import OwlSchemaGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_COLLISION) + with caplog.at_level(logging.WARNING): + gen = OwlSchemaGenerator(schema_path, normalize_prefixes=True) + graph = gen.as_graph() + bound = {str(p): str(n) for p, n in graph.namespaces()} + # myfoaf must NOT have been renamed to 'foaf' + assert "myfoaf" in bound, "Non-standard 'myfoaf' must remain when collision prevents renaming" + assert bound["myfoaf"] == "http://xmlns.com/foaf/0.1/" + # Warning emitted + assert "collision" in caplog.text.lower(), f"Expected collision warning, got: {caplog.text}" + + def test_shacl_collision_skips_rename(self, tmp_path, caplog): + """SHACL: myfoaf must NOT be renamed to 'foaf' when user claims that name.""" + from linkml.generators.shaclgen import ShaclGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_COLLISION) + with caplog.at_level(logging.WARNING): + gen = ShaclGenerator(schema_path, normalize_prefixes=True) + graph = gen.as_graph() + bound = {str(p): str(n) for p, n in graph.namespaces()} + assert "myfoaf" in bound, "Non-standard 'myfoaf' must remain when collision prevents renaming" + assert bound["myfoaf"] == "http://xmlns.com/foaf/0.1/" + assert "collision" in caplog.text.lower(), f"Expected collision warning, got: {caplog.text}" + + def test_context_collision_preserves_user_prefix(self, tmp_path, caplog): + """JSON-LD: user's 'foaf: https://something-else.org/' must survive.""" + from linkml.generators.jsonldcontextgen import ContextGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_COLLISION) + with caplog.at_level(logging.WARNING): + ctx = json.loads(ContextGenerator(schema_path, normalize_prefixes=True).serialize())["@context"] + # User's 'foaf' binding preserved + foaf_val = ctx.get("foaf") + if isinstance(foaf_val, dict): + foaf_val = foaf_val.get("@id", "") + assert foaf_val == "https://something-else.org/", f"User's 'foaf' binding must be preserved, got: {foaf_val}" + # myfoaf must 
remain (not renamed to foaf) + assert "myfoaf" in ctx, "Non-standard 'myfoaf' must remain when collision prevents renaming" + # Warning emitted + assert "collision" in caplog.text.lower(), f"Expected collision warning, got: {caplog.text}" + + +# ── JSONLDGenerator Flag Forwarding Tests ───────────────────────────── + + +class TestJSONLDGeneratorForwarding: + """Verify JSONLDGenerator propagates flags to its embedded ContextGenerator.""" + + def test_normalize_prefixes_forwarded(self, tmp_path): + """JSONLDGenerator must pass normalize_prefixes to embedded ContextGenerator. + + Without forwarding, the inline @context in JSON-LD output would keep + non-standard prefix aliases even when --normalize-prefixes is set. + """ + from linkml.generators.jsonldgen import JSONLDGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + out = JSONLDGenerator(schema_path, normalize_prefixes=True).serialize() + parsed = json.loads(out) + # The @context may be a list; find the dict entry + ctx = parsed.get("@context", {}) + if isinstance(ctx, list): + for item in ctx: + if isinstance(item, dict): + ctx = item + break + assert "sdo" not in ctx, "normalize_prefixes not forwarded: 'sdo' still in embedded @context" + + +# ── Phase 2 HTTP/HTTPS Overwrite Bug Tests ──────────────────────────── + + +class TestPhase2HttpsPreservation: + """Phase 2 must not overwrite Phase 1 HTTPS bindings with HTTP variants.""" + + def test_phase2_does_not_overwrite_https_with_http(self, tmp_path): + """When Phase 1 binds schema → https://schema.org/, Phase 2 must not + overwrite it with http://schema.org/ from the runtime metamodel. + + Reproduction: linkml:types imports bring schema: http://schema.org/ + (HTTP) while the user schema has sdo: https://schema.org/ (HTTPS). + Phase 1 normalises sdo → schema (HTTPS). Phase 2 must not then + rebind schema → http://schema.org/ when it encounters the runtime + HTTP binding. 
+ """ + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + gen = OwlSchemaGenerator(schema_path, normalize_prefixes=True) + graph = gen.as_graph() + bound = {str(p): str(n) for p, n in graph.namespaces()} + assert "schema" in bound, f"Expected 'schema' in bindings, got: {sorted(bound)}" + # MUST be HTTPS (from the user's schema), not HTTP (from runtime) + assert bound["schema"] == "https://schema.org/", ( + f"Phase 2 overwrote HTTPS with HTTP: schema bound to {bound['schema']}" + ) + + def test_normalize_graph_prefixes_phase2_guard(self): + """Direct unit test for the Phase 2 guard in normalize_graph_prefixes. + + Simulates the exact scenario: Phase 1 binds schema → https://schema.org/, + then Phase 2 encounters schema1 → http://schema.org/ and must NOT rebind. + """ + from rdflib import Graph, Namespace, URIRef + + from linkml.utils.generator import normalize_graph_prefixes + + g = Graph(bind_namespaces="none") + # Simulate Phase 1 result + g.bind("schema", Namespace("https://schema.org/")) + # Simulate runtime-injected HTTP variant (would appear as schema1) + g.bind("schema1", Namespace("http://schema.org/")) + # Add a triple so the graph isn't empty + g.add((URIRef("https://example.org/s"), URIRef("https://schema.org/name"), URIRef("https://example.org/o"))) + + normalize_graph_prefixes(g, {"sdo": "https://schema.org/"}) + + bound = {str(p): str(n) for p, n in g.namespaces()} + assert bound.get("schema") == "https://schema.org/", ( + f"Phase 2 guard failed: schema bound to {bound.get('schema')}" + ) + + def test_empty_schema_no_crash(self, tmp_path): + """A schema with no custom prefixes must not crash normalize_graph_prefixes.""" + from linkml.generators.owlgen import OwlSchemaGenerator + + (tmp_path / "empty.yaml").write_text( + textwrap.dedent("""\ + id: https://example.org/empty + name: empty + default_prefix: ex + prefixes: + linkml: https://w3id.org/linkml/ + ex: https://example.org/ + imports: + 
- linkml:types + """), + encoding="utf-8", + ) + # Should not raise + gen = OwlSchemaGenerator(str(tmp_path / "empty.yaml"), normalize_prefixes=True) + ttl = gen.serialize() + assert len(ttl) > 0