Skip to content

Commit 37cafc8

Browse files
committed
feat(generators): add --deterministic flag for reproducible output
Add a --deterministic flag to the OWL, SHACL, and JSON-LD generators that produces byte-identical output across invocations, eliminating spurious diffs in version-controlled artifacts.

Deterministic Turtle serialization (deterministic_turtle):
- W3C RDFC-1.0 canonicalization via pyoxigraph (standard-compliant)
- Weisfeiler-Lehman structural hashing for diff-stable blank node IDs (_:b<hash>) instead of sequential ones (_:c14nN)
- Sorted prefix declarations and triple blocks

Collection sorting (gated behind --deterministic):
- owl:oneOf, sh:in, and sh:ignoredProperties items are sorted when the flag is set
- Existing behaviour is preserved by default

deterministic_json:
- Recursive deep-sort for JSON-LD context output

pyoxigraph >= 0.4.0 is imported lazily and only when --deterministic is used. It is not a core dependency, which avoids a conflict with morph-kgc. Tests skip gracefully when pyoxigraph >= 0.4.0 is unavailable.

Signed-off-by: jdsika <carlo.van-driesten@bmw.de>
1 parent 8b7bfa4 commit 37cafc8

7 files changed

Lines changed: 1124 additions & 9 deletions

File tree

packages/linkml/src/linkml/generators/jsonldcontextgen.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,8 +189,62 @@ def end_schema(
189189
with open(frame_path, "w", encoding="UTF-8") as f:
190190
json.dump(frame, f, indent=2, ensure_ascii=False)
191191

192+
if self.deterministic:
193+
return self._deterministic_context_json(json.loads(str(as_json(context))), indent=3) + "\n"
192194
return str(as_json(context)) + "\n"
193195

196+
@staticmethod
def _deterministic_context_json(data: dict, indent: int = 3) -> str:
    """Serialize a JSON-LD context document with a stable, readable key order.

    The top-level layout is:

    1. ``comments`` (metadata), if present.
    2. ``@context``, if present. When it is a mapping, its entries are
       emitted in three groups, each sorted alphabetically by key:

       a. ``@``-prefixed directives (``@vocab``, ``@base``, ...)
       b. prefix declarations (entries whose value is a string)
       c. term definitions (every other entry), passed through
          :func:`deterministic_json` so nested content is ordered too

    3. Any remaining top-level keys, sorted alphabetically.

    Unlike a plain call to :func:`deterministic_json`, this keeps the
    conventional JSON-LD grouping so the output stays human-readable.

    :param data: parsed JSON-LD context document
    :param indent: indentation width passed to :func:`json.dumps`
    :return: the serialized JSON text (without a trailing newline)
    """
    # Deferred import, kept function-local as in the original implementation.
    from linkml.utils.generator import deterministic_json

    ordered = {}

    # 1. "comments" first (if present)
    if "comments" in data:
        ordered["comments"] = data["comments"]

    # 2. "@context" second
    if "@context" in data:
        ctx = data["@context"]
        if isinstance(ctx, dict):
            # Split the entries into the three groups in a single pass.
            directives, prefixes, terms = {}, {}, {}
            for key, value in ctx.items():
                if key.startswith("@"):
                    directives[key] = value
                elif isinstance(value, str):
                    prefixes[key] = value
                else:
                    terms[key] = value

            # Round-trip through deterministic_json so nested term content
            # is ordered as well, not just the top-level term keys.
            sorted_terms = json.loads(deterministic_json(terms))

            ordered_ctx = {}
            for group in (directives, prefixes, sorted_terms):
                for key in sorted(group):
                    ordered_ctx[key] = group[key]
            ordered["@context"] = ordered_ctx
        else:
            # JSON-LD also allows "@context" to be an IRI string or an array
            # of contexts. Array order is significant, so such values are
            # emitted unchanged instead of failing on mapping-style access.
            ordered["@context"] = ctx

    # 3. Any remaining top-level keys
    for key in sorted(data):
        if key not in ordered:
            ordered[key] = data[key]

    return json.dumps(ordered, indent=indent, ensure_ascii=False)
247+
194248
def visit_class(self, cls: ClassDefinition) -> bool:
195249
if self.exclude_imports and cls.name not in self._local_classes:
196250
return False

packages/linkml/src/linkml/generators/jsonldgen.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Generate JSONld from a LinkML schema."""
22

3+
import json
34
import os
45
from collections.abc import Sequence
56
from copy import deepcopy
@@ -202,6 +203,10 @@ def end_schema(self, context: str | Sequence[str] | None = None, context_kwargs:
202203
self.schema["@context"].append({"@base": base_prefix})
203204
# json_obj["@id"] = self.schema.id
204205
out = str(as_json(self.schema, indent=" ")) + "\n"
206+
if self.deterministic:
207+
from linkml.utils.generator import deterministic_json
208+
209+
out = deterministic_json(json.loads(out), indent=2) + "\n"
205210
self.schema = self.original_schema
206211
return out
207212

packages/linkml/src/linkml/generators/owlgen.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,14 @@ def serialize(self, **kwargs) -> str:
267267
:return:
268268
"""
269269
self.as_graph()
270-
data = self.graph.serialize(format="turtle" if self.format in ["owl", "ttl"] else self.format)
270+
fmt = "turtle" if self.format in ["owl", "ttl"] else self.format
271+
if self.deterministic and fmt == "turtle":
272+
# Deferred to avoid circular import (generator.py imports from this package)
273+
from linkml.utils.generator import deterministic_turtle
274+
275+
data = deterministic_turtle(self.graph)
276+
else:
277+
data = self.graph.serialize(format=fmt)
271278
return data
272279

273280
def add_metadata(self, e: Definition | PermissibleValue, uri: URIRef) -> None:
@@ -998,7 +1005,10 @@ def add_enum(self, e: EnumDefinition) -> None:
9981005
owl_types = []
9991006
enum_owl_type = self._get_metatype(e, self.default_permissible_value_type)
10001007

1001-
for pv in e.permissible_values.values():
1008+
pvs = e.permissible_values.values()
1009+
if self.deterministic:
1010+
pvs = sorted(pvs, key=lambda x: x.text)
1011+
for pv in pvs:
10021012
pv_owl_type = self._get_metatype(pv, enum_owl_type)
10031013
owl_types.append(pv_owl_type)
10041014
if pv_owl_type == RDFS.Literal:

packages/linkml/src/linkml/generators/shaclgen.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,13 @@ def generate_header(self) -> str:
9393

9494
def serialize(self, **args) -> str:
9595
g = self.as_graph()
96-
data = g.serialize(format="turtle" if self.format in ["owl", "ttl"] else self.format)
96+
fmt = "turtle" if self.format in ["owl", "ttl"] else self.format
97+
if self.deterministic and fmt == "turtle":
98+
from linkml.utils.generator import deterministic_turtle
99+
100+
data = deterministic_turtle(g)
101+
else:
102+
data = g.serialize(format=fmt)
97103
return data
98104

99105
def as_graph(self) -> Graph:
@@ -309,13 +315,13 @@ def _add_enum(self, g: Graph, func: Callable, r: ElementName) -> None:
309315
sv = self.schemaview
310316
enum = sv.get_enum(r)
311317
pv_node = BNode()
318+
pv_items = list(enum.permissible_values.items())
319+
if self.deterministic:
320+
pv_items = sorted(pv_items)
312321
Collection(
313322
g,
314323
pv_node,
315-
[
316-
URIRef(sv.expand_curie(pv.meaning)) if pv.meaning else Literal(pv_name)
317-
for pv_name, pv in enum.permissible_values.items()
318-
],
324+
[URIRef(sv.expand_curie(pv.meaning)) if pv.meaning else Literal(pv_name) for pv_name, pv in pv_items],
319325
)
320326
func(SH["in"], pv_node)
321327

@@ -469,7 +475,10 @@ def collect_child_properties(class_name: str, output: set) -> None:
469475

470476
list_node = BNode()
471477
ignored_properties.add(RDF.type)
472-
Collection(g, list_node, list(ignored_properties))
478+
props = list(ignored_properties)
479+
if self.deterministic:
480+
props = sorted(props, key=str)
481+
Collection(g, list_node, props)
473482

474483
return list_node
475484

0 commit comments

Comments
 (0)