Skip to content

Commit d82a791

Browse files
committed
feat(generators): add --normalize-prefixes flag for well-known prefix names
Add an opt-in --normalize-prefixes flag to the OWL, SHACL, and JSON-LD Context generators that normalises non-standard prefix aliases to well-known names from a static prefix map (derived from rdflib 7.x defaults, cross-checked against prefix.cc consensus).

Key design decisions:
- Static frozen map (MappingProxyType) instead of a runtime Graph().namespaces() lookup, which eliminates the rdflib version dependency
- Both http://schema.org/ and https://schema.org/ map to 'schema'
- Shared normalize_graph_prefixes() helper used by OWL and SHACL
- Two-phase graph normalisation: Phase 1 normalises schema-declared prefixes, Phase 2 cleans up runtime-injected bindings
- Collision detection: skip with a warning when the standard prefix name is already user-declared for a different namespace
- Phase 2 guard prevents overwriting HTTPS bindings with HTTP variants

The flag defaults to off, preserving existing behaviour. Tests cover the OWL, SHACL, and context generators with sdo->schema, dce->dc, the http/https edge case, custom prefix preservation, flag-off backward compatibility, cross-generator consistency, prefix collision detection, schema1 regression prevention, the Phase 2 HTTPS guard, the empty schema edge case, and static map integrity.

Signed-off-by: jdsika <carlo.van-driesten@bmw.de>
1 parent c6f7c77 commit d82a791

7 files changed

Lines changed: 895 additions & 10 deletions

File tree

packages/linkml/src/linkml/generators/jsonldcontextgen.py

Lines changed: 80 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
from linkml._version import __version__
1717
from linkml.utils.deprecation import deprecated_fields
18-
from linkml.utils.generator import Generator, shared_arguments
18+
from linkml.utils.generator import Generator, shared_arguments, well_known_prefix_map
1919
from linkml_runtime.linkml_model.meta import ClassDefinition, SlotDefinition
2020
from linkml_runtime.linkml_model.types import SHEX
2121
from linkml_runtime.utils.formatutils import camelcase, underscore
@@ -90,6 +90,9 @@ class ContextGenerator(Generator):
9090
frame_root: str | None = None
9191

9292
def __post_init__(self) -> None:
93+
# Must be set before super().__post_init__() because the parent triggers
94+
# the visitor pattern (visit_schema), which accesses _prefix_remap.
95+
self._prefix_remap: dict[str, str] = {}
9396
super().__post_init__()
9497
if self.namespaces is None:
9598
raise TypeError("Schema text must be supplied to context generator. Preparsed schema will not work")
@@ -127,22 +130,92 @@ def _collect_external_elements(sv: SchemaView) -> tuple[set[str], set[str]]:
127130
external_slots.update(schema_def.slots.keys())
128131
return external_classes, external_slots
129132

133+
def add_prefix(self, ncname: str) -> None:
    """Register *ncname* via the parent class, substituting a remapped name if one exists.

    ``self._prefix_remap`` is consulted first: when it holds an entry for
    *ncname* the mapped name is forwarded to the parent ``add_prefix``;
    otherwise *ncname* is forwarded unchanged.

    :param ncname: prefix name as written in the schema.
    """
    effective_name = self._prefix_remap.get(ncname, ncname)
    super().add_prefix(effective_name)
136+
130137
def visit_schema(self, base: str | Namespace | None = None, output: str | None = None, **_):
131-
# Add any explicitly declared prefixes
138+
# Add any explicitly declared prefixes.
139+
# Direct .add() is safe here: the normalisation block below explicitly
140+
# rewrites emit_prefixes entries for any renamed prefixes (Cases 1-3).
132141
for prefix in self.schema.prefixes.values():
133142
self.emit_prefixes.add(prefix.prefix_prefix)
134143

135144
# Add any prefixes explicitly declared
136145
for pfx in self.schema.emit_prefixes:
137146
self.add_prefix(pfx)
138147

148+
# Normalise well-known prefix names when --normalize-prefixes is set.
149+
# If the schema declares a non-standard alias for a namespace that has
150+
# a well-known standard name (e.g. ``sdo`` for
151+
# ``https://schema.org/``), replace the alias with the standard name
152+
# so that generated JSON-LD contexts use the conventional prefix.
153+
#
154+
# Three cases are handled:
155+
# 1. Standard prefix is not yet bound → just rebind from old to new.
156+
# 2. Standard prefix is bound to a *different* URI:
157+
# a. User-declared (in schema.prefixes) → collision, skip with warning.
158+
# b. Runtime default (e.g. linkml-runtime's ``schema: http://…``)
159+
# → remove stale binding, then rebind.
160+
# 3. Standard prefix is already bound to the *same* URI (duplicate)
161+
# → just drop the non-standard alias.
162+
#
163+
# A remap dict is stored for ``_build_element_id`` because
164+
# ``prefix_suffix()`` splits CURIEs on ``:`` without looking up the
165+
# namespace dict.
166+
self._prefix_remap.clear()
167+
if self.normalize_prefixes:
168+
wk = well_known_prefix_map()
169+
for old_pfx in list(self.namespaces):
170+
url = str(self.namespaces[old_pfx])
171+
std_pfx = wk.get(url)
172+
if not std_pfx or std_pfx == old_pfx:
173+
continue
174+
if std_pfx in self.namespaces:
175+
if str(self.namespaces[std_pfx]) != url:
176+
# Case 2: std_pfx is bound to a different URI.
177+
# If the user explicitly declared std_pfx in the schema,
178+
# it is intentional — skip to avoid data loss.
179+
if std_pfx in self.schema.prefixes:
180+
self.logger.warning(
181+
"Prefix collision: cannot rename '%s' to '%s' because '%s' is "
182+
"already declared for <%s>; skipping normalisation for <%s>",
183+
old_pfx,
184+
std_pfx,
185+
std_pfx,
186+
str(self.namespaces[std_pfx]),
187+
url,
188+
)
189+
continue
190+
# Not user-declared (e.g. linkml-runtime default) — safe to remove
191+
self.emit_prefixes.discard(std_pfx)
192+
del self.namespaces[std_pfx]
193+
else:
194+
# Case 3: standard prefix already bound to same URI
195+
# — just drop the non-standard alias
196+
del self.namespaces[old_pfx]
197+
if old_pfx in self.emit_prefixes:
198+
self.emit_prefixes.discard(old_pfx)
199+
self.emit_prefixes.add(std_pfx)
200+
self._prefix_remap[old_pfx] = std_pfx
201+
continue
202+
# Case 1 (or Case 2 after stale removal): bind standard name
203+
self.namespaces[std_pfx] = self.namespaces[old_pfx]
204+
del self.namespaces[old_pfx]
205+
if old_pfx in self.emit_prefixes:
206+
self.emit_prefixes.discard(old_pfx)
207+
self.emit_prefixes.add(std_pfx)
208+
self._prefix_remap[old_pfx] = std_pfx
209+
139210
# Add the default prefix
140211
if self.schema.default_prefix:
141212
dflt = self.namespaces.prefix_for(self.schema.default_prefix)
142213
if dflt:
143214
self.default_ns = dflt
144215
if self.default_ns:
145216
default_uri = self.namespaces[self.default_ns]
217+
# Direct .add() is safe: default_ns is already resolved from
218+
# the (possibly normalised) namespace bindings above.
146219
self.emit_prefixes.add(self.default_ns)
147220
else:
148221
default_uri = self.schema.default_prefix
@@ -417,6 +490,11 @@ def _build_element_id(self, definition: Any, uri: str) -> None:
417490
@return: None
418491
"""
419492
uri_prefix, uri_suffix = self.namespaces.prefix_suffix(uri)
493+
# Apply well-known prefix normalisation (e.g. sdo → schema).
494+
# prefix_suffix() splits CURIEs on ':' without checking the
495+
# namespace dict, so it may return a stale alias.
496+
if uri_prefix and uri_prefix in self._prefix_remap:
497+
uri_prefix = self._prefix_remap[uri_prefix]
420498
is_default_namespace = uri_prefix == self.context_body["@vocab"] or uri_prefix == self.namespaces.prefix_for(
421499
self.context_body["@vocab"]
422500
)

packages/linkml/src/linkml/generators/jsonldgen.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,11 @@ def end_schema(self, context: str | Sequence[str] | None = None, context_kwargs:
179179
# TODO: The _visit function above alters the schema in situ
180180
# force some context_kwargs
181181
context_kwargs["metadata"] = False
182+
# Forward generator flags so prefix normalisation and deterministic
183+
# output propagate into the inline @context produced for JSON-LD.
184+
for flag in ("normalize_prefixes", "deterministic"):
185+
if hasattr(self, flag):
186+
context_kwargs.setdefault(flag, getattr(self, flag))
182187
add_prefixes = ContextGenerator(self.original_schema, **context_kwargs).serialize()
183188
add_prefixes_json = loads(add_prefixes)
184189
metamodel_ctx = self.metamodel_context or METAMODEL_CONTEXT_URI

packages/linkml/src/linkml/generators/owlgen.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from linkml._version import __version__
2222
from linkml.generators.common.subproperty import is_xsd_anyuri_range
2323
from linkml.utils.deprecation import deprecation_warning
24-
from linkml.utils.generator import Generator, shared_arguments
24+
from linkml.utils.generator import Generator, normalize_graph_prefixes, shared_arguments
2525
from linkml_runtime import SchemaView
2626
from linkml_runtime.linkml_model.meta import (
2727
AnonymousClassExpression,
@@ -272,6 +272,10 @@ def as_graph(self) -> Graph:
272272
self.graph.bind(prefix, self.metamodel.namespaces[prefix])
273273
for pfx in schema.prefixes.values():
274274
self.graph.namespace_manager.bind(pfx.prefix_prefix, URIRef(pfx.prefix_reference))
275+
if self.normalize_prefixes:
276+
normalize_graph_prefixes(
277+
graph, {str(v.prefix_prefix): str(v.prefix_reference) for v in schema.prefixes.values()}
278+
)
275279
graph.add((base, RDF.type, OWL.Ontology))
276280

277281
# Add main schema elements

packages/linkml/src/linkml/generators/shaclgen.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from linkml.generators.common.subproperty import get_subproperty_values, is_uri_range
1414
from linkml.generators.shacl.shacl_data_type import ShaclDataType
1515
from linkml.generators.shacl.shacl_ifabsent_processor import ShaclIfAbsentProcessor
16-
from linkml.utils.generator import Generator, shared_arguments
16+
from linkml.utils.generator import Generator, normalize_graph_prefixes, shared_arguments
1717
from linkml_runtime.linkml_model.meta import ClassDefinition, ElementName
1818
from linkml_runtime.utils.formatutils import underscore
1919
from linkml_runtime.utils.yamlutils import TypedNode, extended_float, extended_int, extended_str
@@ -111,6 +111,10 @@ def as_graph(self) -> Graph:
111111

112112
for pfx in self.schema.prefixes.values():
113113
g.bind(str(pfx.prefix_prefix), pfx.prefix_reference)
114+
if self.normalize_prefixes:
115+
normalize_graph_prefixes(
116+
g, {str(v.prefix_prefix): str(v.prefix_reference) for v in self.schema.prefixes.values()}
117+
)
114118

115119
for c in sv.all_classes(imports=not self.exclude_imports).values():
116120

packages/linkml/src/linkml/utils/generator.py

Lines changed: 130 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import os
2121
import re
2222
import sys
23+
import types
2324
from collections.abc import Callable, Mapping
2425
from dataclasses import dataclass, field
2526
from functools import lru_cache
@@ -62,6 +63,9 @@
6263
from linkml_runtime.utils.formatutils import camelcase, underscore
6364
from linkml_runtime.utils.namespaces import Namespaces
6465

66+
if TYPE_CHECKING:
67+
from rdflib import Graph
68+
6569
logger = logging.getLogger(__name__)
6670

6771

@@ -357,15 +361,133 @@ def _deep_sort(value: object, parent_key: str = "") -> object:
357361
def well_known_prefix_map() -> dict[str, str]:
358362
"""Return a mapping from namespace URI to standard prefix name.
359363
360-
Uses rdflib's curated default namespace bindings as the source of truth.
361-
For example, ``https://schema.org/`` maps to ``schema``.
364+
Uses a frozen, version-independent map derived from rdflib 7.x curated
365+
defaults (which align with the `prefix.cc <https://prefix.cc>`_ community
366+
consensus registry). The map is **not** computed at runtime from
367+
``Graph().namespaces()`` because those defaults can change across rdflib
368+
releases (they differ between 6.x and 7.x), which would silently alter
369+
generator output.
362370
363371
This allows generators to normalise non-standard prefix aliases
364372
(e.g. ``sdo`` for ``https://schema.org/``) to their conventional names.
373+
374+
Both ``http`` and ``https`` variants of schema.org are included because
375+
the linkml-runtime historically binds ``schema: http://schema.org/``
376+
while rdflib (and the W3C) prefer ``https://schema.org/``.
365377
"""
366-
from rdflib import Graph as RdfGraph
378+
return dict(_WELL_KNOWN_PREFIX_MAP)
379+
380+
381+
# Frozen, version-independent map: namespace URI → canonical prefix name.
382+
# Source: rdflib 7.x defaults, cross-checked against https://prefix.cc
383+
_WELL_KNOWN_PREFIX_MAP: types.MappingProxyType[str, str] = types.MappingProxyType(
384+
{
385+
"https://brickschema.org/schema/Brick#": "brick",
386+
"http://www.w3.org/ns/csvw#": "csvw",
387+
"http://purl.org/dc/elements/1.1/": "dc",
388+
"http://purl.org/dc/dcam/": "dcam",
389+
"http://www.w3.org/ns/dcat#": "dcat",
390+
"http://purl.org/dc/dcmitype/": "dcmitype",
391+
"http://purl.org/dc/terms/": "dcterms",
392+
"http://usefulinc.com/ns/doap#": "doap",
393+
"http://xmlns.com/foaf/0.1/": "foaf",
394+
"http://www.opengis.net/ont/geosparql#": "geo",
395+
"http://www.w3.org/ns/odrl/2/": "odrl",
396+
"http://www.w3.org/ns/org#": "org",
397+
"http://www.w3.org/2002/07/owl#": "owl",
398+
"http://www.w3.org/ns/dx/prof/": "prof",
399+
"http://www.w3.org/ns/prov#": "prov",
400+
"http://purl.org/linked-data/cube#": "qb",
401+
"http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
402+
"http://www.w3.org/2000/01/rdf-schema#": "rdfs",
403+
"https://schema.org/": "schema",
404+
"http://schema.org/": "schema", # HTTP variant (linkml-runtime uses this)
405+
"http://www.w3.org/ns/shacl#": "sh",
406+
"http://www.w3.org/2004/02/skos/core#": "skos",
407+
"http://www.w3.org/ns/sosa/": "sosa",
408+
"http://www.w3.org/ns/ssn/": "ssn",
409+
"http://www.w3.org/2006/time#": "time",
410+
"http://purl.org/vocab/vann/": "vann",
411+
"http://rdfs.org/ns/void#": "void",
412+
"https://www.w3.org/2003/01/geo/wgs84_pos#": "wgs",
413+
"http://www.w3.org/2003/01/geo/wgs84_pos#": "wgs", # HTTP variant (W3C canonical)
414+
"http://www.w3.org/XML/1998/namespace": "xml",
415+
"http://www.w3.org/2001/XMLSchema#": "xsd",
416+
}
417+
)
418+
367419

368-
return {str(ns): str(pfx) for pfx, ns in RdfGraph().namespaces() if str(pfx)}
420+
def normalize_graph_prefixes(graph: "Graph", schema_prefixes: dict[str, str]) -> None:
    """Normalise non-standard prefix aliases in an rdflib Graph.

    For each prefix in *schema_prefixes* (mapping prefix name → namespace
    URI), look the URI up in ``well_known_prefix_map()``. If a standard
    name exists and differs from the schema-declared name, rebind the
    namespace to the standard name. Afterwards the same rule is applied to
    every other binding currently present on *graph*.

    A standard name is assigned to at most one namespace per call:

    * If the schema itself declares the standard name for a *different*
      namespace, the alias is left untouched and a warning is logged.
    * If two schema-declared aliases compete for the same standard name
      with different URIs (e.g. ``http://schema.org/`` and
      ``https://schema.org/`` both map to ``schema``), the first one
      processed keeps the standard name; the later one keeps its original
      alias and a warning is logged. Without this check the later
      ``bind(..., replace=True)`` would evict the earlier namespace and
      leave it without any prefix.

    This helper is called by the OWL and SHACL generators. The JSON-LD
    context generator does not call it; it applies the well-known map to
    its own namespace table instead.

    :param graph: rdflib Graph whose namespace bindings are adjusted in place.
    :param schema_prefixes: mapping of prefix name → namespace URI string,
        typically built from ``schema.prefixes``.
    :return: None
    """
    from rdflib import Namespace

    wk = well_known_prefix_map()

    # Standard names assigned during Phase 1, mapped to the namespace URI
    # that received them. Used to stop a later alias from stealing the name.
    claimed: dict[str, str] = {}

    # Phase 1: normalise schema-declared prefixes.
    for old_pfx, ns_uri in schema_prefixes.items():
        ns_str = str(ns_uri)
        std_pfx = wk.get(ns_str)
        if not std_pfx or std_pfx == old_pfx:
            continue
        # Collision: the user explicitly declared std_pfx for a different
        # namespace — do not clobber their binding.
        if std_pfx in schema_prefixes and schema_prefixes[std_pfx] != ns_str:
            logger.warning(
                "Prefix collision: cannot rename '%s' to '%s' because '%s' is already "
                "declared for <%s>; skipping normalisation for <%s>",
                old_pfx,
                std_pfx,
                std_pfx,
                schema_prefixes[std_pfx],
                ns_str,
            )
            continue
        # Collision: an earlier schema-declared alias already received
        # std_pfx for a different namespace (e.g. the HTTP and HTTPS variants
        # of schema.org). Rebinding with replace=True would drop that
        # namespace's prefix entirely, so keep the first assignment.
        if std_pfx in claimed and claimed[std_pfx] != ns_str:
            logger.warning(
                "Prefix collision: cannot rename '%s' to '%s' because '%s' was already "
                "assigned to <%s>; skipping normalisation for <%s>",
                old_pfx,
                std_pfx,
                std_pfx,
                claimed[std_pfx],
                ns_str,
            )
            continue
        # Rebind: ``override=True`` moves the namespace off its old prefix;
        # ``replace=True`` forces the new prefix even if that prefix name is
        # currently bound to a different (non-user-declared) namespace.
        graph.bind(std_pfx, Namespace(ns_str), override=True, replace=True)
        claimed[std_pfx] = ns_str

    # Phase 2: normalise bindings that did not come from the schema (e.g.
    # metamodel defaults). After Phase 1 rebinds schema-declared prefixes,
    # leftover bindings can show up under names such as ``schema1`` or
    # ``dc0``. Scan the graph's current bindings and fix any that map to a
    # well-known namespace under a non-standard name, provided the standard
    # name is not already held by a different namespace.
    #
    # ``current_bindings`` starts as a snapshot taken after Phase 1 and is
    # kept up to date as Phase 2 binds names, so that neither a Phase 1
    # binding nor an earlier Phase 2 binding is overwritten (e.g. ``schema``
    # → ``https://schema.org/`` must not be clobbered by the HTTP variant).
    current_bindings = {str(p): str(n) for p, n in graph.namespaces()}
    for pfx, ns in list(graph.namespaces()):
        pfx_str, ns_str = str(pfx), str(ns)
        std_pfx = wk.get(ns_str)
        if not std_pfx or std_pfx == pfx_str:
            continue
        # Same collision check as Phase 1: respect user-declared prefixes.
        if std_pfx in schema_prefixes and schema_prefixes[std_pfx] != ns_str:
            continue
        # Guard: std_pfx is already bound to a different URI — leave it.
        if std_pfx in current_bindings and current_bindings[std_pfx] != ns_str:
            continue
        graph.bind(std_pfx, Namespace(ns_str), override=True, replace=True)
        # Record the assignment so a later alias for a sibling URI is skipped.
        current_bindings[std_pfx] = ns_str
369491

370492

371493
@dataclass
@@ -474,8 +596,10 @@ class Generator(metaclass=abc.ABCMeta):
474596
"""True means print stack trace, false just error message"""
475597

476598
normalize_prefixes: bool = False
477-
"""True means normalise non-standard prefix aliases to rdflib's curated default names
478-
(e.g. ``sdo`` → ``schema`` for ``https://schema.org/``)."""
599+
"""True means normalise non-standard prefix aliases to well-known names
600+
from the static ``_WELL_KNOWN_PREFIX_MAP`` (derived from rdflib 7.x
601+
defaults / prefix.cc consensus). E.g. ``sdo`` → ``schema`` for
602+
``https://schema.org/``."""
479603

480604
include: str | Path | SchemaDefinition | None = None
481605
"""If set, include extra schema outside of the imports mechanism"""

0 commit comments

Comments
 (0)