From 70cf3f6ebae5aa292665514ce19231c51505d795 Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Tue, 5 May 2026 17:19:55 -0700 Subject: [PATCH 01/17] Add gfql_validate() preflight API for GFQL/Cypher --- CHANGELOG.md | 3 + docs/source/gfql/cypher.rst | 16 ++ docs/source/gfql/validation/fundamentals.rst | 20 +- graphistry/compute/ComputeMixin.py | 5 + graphistry/compute/gfql_validate.py | 263 ++++++++++++++++++ .../tests/compute/test_gfql_validate_only.py | 75 +++++ 6 files changed, 381 insertions(+), 1 deletion(-) create mode 100644 graphistry/compute/gfql_validate.py create mode 100644 graphistry/tests/compute/test_gfql_validate_only.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 1cbb90981a..66660197c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,9 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - **CI / docs preflight guard**: Added `bin/check_docs_latex_unicode.sh` and a fast `docs-latex-unicode-guard` CI job to fail early on non-BMP Unicode in docs-fed text sources before the slower Dockerized `test-docs` LaTeX build. - **Release process / deploy gate reminder**: Documented that tag-triggered PyPI publishes can pause in `waiting` on environment approval, and explicitly call out approving `Review deployments` for `pypi-release` before expecting the final PyPI job to complete. +### Added +- **GFQL/Cypher validate-only preflight API (#1320)**: Added `g.gfql_validate(...)` on `ComputeMixin` as a public no-execution validation entrypoint for GFQL chains/JSON-style queries and Cypher strings. The API returns structured diagnostics (`ok`, `diagnostics`, query/language metadata) instead of executing query operators. Cypher preflight runs parser+compiler checks and supports optional strict binder/schema mode (`strict=True`) using the bound graph schema catalog; chain/JSON preflight reuses existing `validate_chain_schema()` semantics (including `collect_all=True`). 
Added focused regression tests in `graphistry/tests/compute/test_gfql_validate_only.py` and docs updates in `docs/source/gfql/cypher.rst` + `docs/source/gfql/validation/fundamentals.rst`. + ### Internal - **GFQL / Cypher reentry follow-through cleanup (#989, post-#1260 extraction)**: In `graphistry/compute/gfql/cypher/reentry/runtime.py`, free-form intermediate MATCH plan construction now routes through the whole-row/free-form `ReentryPlan` contract instead of scalar-only fallback tagging. This makes the dedicated runtime `plan.free_form` lane reachable again and removes incidental scalar-only-path dependence for free-form reentry dispatch. - **GFQL native types T4 — Arrow/type bridge contracts and coercion semantics (#1312, #1262, #1046)**: Added `graphistry/compute/gfql/ir/arrow_bridge.py` with stable schema-level interchange helpers `to_arrow()` and `from_arrow()` for `RowSchema` + schema-confidence metadata. The bridge records per-field logical-type metadata (`gfql.logical_type`) and confidence (`gfql.schema_confidence`) for deterministic round-trips, supports strict vs widening coercion (`coercion='strict'|'widen'`) at export/import boundaries, preserves scalar nullability exactly, and defines structural-type fallback behavior (`NodeRef`/`EdgeRef`/`PathType` as widened string bridge fields in widen mode). Added focused regression coverage in `graphistry/tests/compute/gfql/test_ir_arrow_bridge.py` for round-trip fidelity, nullability behavior, confidence handling, and strict/widen coercion boundaries. diff --git a/docs/source/gfql/cypher.rst b/docs/source/gfql/cypher.rst index 220565caa4..2357e30ea8 100644 --- a/docs/source/gfql/cypher.rst +++ b/docs/source/gfql/cypher.rst @@ -457,9 +457,25 @@ execution, preflight it with the helper APIs: except GFQLValidationError as exc: print("Valid Cypher, but outside the current GFQL Cypher surface:", exc) +For a first-class preflight API on bound graphs, use ``g.gfql_validate(...)``: + +.. 
code-block:: python + + report = g.gfql_validate( + "MATCH (p:Person) RETURN p.name AS name ORDER BY name DESC LIMIT $top_n", + params={"top_n": 5}, + strict=True, # optional strict binder/schema mode + ) + + if not report["ok"]: + for diag in report["diagnostics"]: + print(diag["code"], diag["message"], diag.get("field")) + - Use ``parse_cypher()`` when you only want syntax and AST validation. - Use ``compile_cypher()`` for the strongest compiler preflight, because it also catches unsupported-but-valid query shapes in lowering. +- Use ``g.gfql_validate(...)`` when you want a stable validate-only entrypoint + that returns structured diagnostics and never executes query operators. - Use ``cypher_to_gfql()`` only when you specifically need a single GFQL ``Chain``. It is intentionally stricter than direct execution through ``g.gfql("...")``. diff --git a/docs/source/gfql/validation/fundamentals.rst b/docs/source/gfql/validation/fundamentals.rst index 64ab964fb0..2f582aab44 100644 --- a/docs/source/gfql/validation/fundamentals.rst +++ b/docs/source/gfql/validation/fundamentals.rst @@ -169,6 +169,24 @@ Use ``validate_chain_schema()`` to check compatibility without running the query result = g.gfql(chain.chain) print(f"Query executed: {len(result._nodes)} nodes") +For mixed GFQL + Cypher preflight on a bound graph, use ``g.gfql_validate(...)``: + +.. 
code-block:: python + + # Chain / JSON-style GFQL + report = g.gfql_validate([n({'type': 'customer'})], collect_all=True) + if not report["ok"]: + print(report["diagnostics"]) + + # Cypher + cypher_report = g.gfql_validate( + "MATCH (c:Customer) RETURN c.id AS id LIMIT $n", + params={"n": 10}, + strict=True, + ) + if not cypher_report["ok"]: + print(cypher_report["diagnostics"]) + Error Collection ^^^^^^^^^^^^^^^^ @@ -197,4 +215,4 @@ See Also -------- * :doc:`../spec/language` - Complete language specification -* :doc:`../overview` - GFQL overview \ No newline at end of file +* :doc:`../overview` - GFQL overview diff --git a/graphistry/compute/ComputeMixin.py b/graphistry/compute/ComputeMixin.py index 814ddec2c1..7134539821 100644 --- a/graphistry/compute/ComputeMixin.py +++ b/graphistry/compute/ComputeMixin.py @@ -10,6 +10,7 @@ from .chain import Chain, chain as chain_base from .chain_let import chain_let as chain_let_base from .gfql_unified import gfql as gfql_base +from .gfql_validate import gfql_validate as gfql_validate_base from .chain_remote import ( chain_remote as chain_remote_base, chain_remote_shape as chain_remote_shape_base @@ -508,6 +509,10 @@ def gfql(self, *args, **kwargs): return gfql_base(self, *args, **kwargs) gfql.__doc__ = gfql_base.__doc__ + def gfql_validate(self, *args, **kwargs): + return gfql_validate_base(self, *args, **kwargs) + gfql_validate.__doc__ = gfql_validate_base.__doc__ + def chain_remote(self, *args, **kwargs) -> Plottable: """ .. 
deprecated:: 2.XX.X diff --git a/graphistry/compute/gfql_validate.py b/graphistry/compute/gfql_validate.py new file mode 100644 index 0000000000..8ec0a6d7d2 --- /dev/null +++ b/graphistry/compute/gfql_validate.py @@ -0,0 +1,263 @@ +"""Validate-only GFQL/Cypher preflight helpers (no query execution).""" + +from __future__ import annotations + +import re +from typing import Any, Dict, List, Literal, Mapping, Optional, Sequence, Tuple, Union, cast + +from graphistry.Plottable import Plottable +from graphistry.compute.ast import ASTLet, ASTObject, ASTNode, ASTEdge, from_json +from graphistry.compute.chain import Chain +from graphistry.compute.exceptions import ErrorCode, GFQLValidationError +from graphistry.compute.gfql.cypher.lowering import ( + CompiledCypherGraphQuery, + CompiledCypherQuery, + CompiledCypherUnionQuery, + compile_cypher_query, +) +from graphistry.compute.gfql.cypher.parser import parse_cypher +from graphistry.compute.gfql.frontends.cypher.binder import FrontendBinder +from graphistry.compute.gfql.ir.compilation import GraphSchemaCatalog, PlanContext +from graphistry.compute.gfql.same_path_types import ( + WhereComparison, + normalize_where_entries, + parse_where_json, +) +from graphistry.compute.validate.validate_schema import validate_chain_schema + + +GFQLValidationQuery = Union[ASTObject, List[ASTObject], ASTLet, Chain, dict, str] + +_CYPHER_LEAD_RE = re.compile( + r"^\s*(?:MATCH|OPTIONAL\s+MATCH|WITH|RETURN|UNWIND|CALL|CREATE|MERGE|DELETE|DETACH\s+DELETE|SET|REMOVE|FOREACH|GRAPH|USE)\b", + re.IGNORECASE, +) + + +def _looks_like_cypher_query(query: str) -> bool: + return _CYPHER_LEAD_RE.match(query) is not None + + +def _serialize_error(exc: Exception, *, stage: str) -> Dict[str, Any]: + if hasattr(exc, "to_dict") and callable(getattr(exc, "to_dict")): + out = cast(Dict[str, Any], exc.to_dict()) # GFQLValidationError surface + elif hasattr(exc, "code") and hasattr(exc, "message"): + out = { + "code": cast(Any, getattr(exc, "code")), + "message": 
cast(Any, getattr(exc, "message")), + } + context = cast(Any, getattr(exc, "context", None)) + if isinstance(context, dict): + out.update(context) + else: + out = { + "code": ErrorCode.E108, + "message": str(exc), + } + out["stage"] = stage + return out + + +def _build_schema_catalog(g: Plottable, *, strict: bool) -> GraphSchemaCatalog: + node_columns = () + edge_columns = () + if getattr(g, "_nodes", None) is not None: + node_columns = tuple(str(c) for c in cast(Any, g)._nodes.columns) + if getattr(g, "_edges", None) is not None: + edge_columns = tuple(str(c) for c in cast(Any, g)._edges.columns) + return GraphSchemaCatalog.from_schema_parts( + node_columns=node_columns, + edge_columns=edge_columns, + node_id_column=getattr(g, "_node", None), + edge_source_column=getattr(g, "_source", None), + edge_destination_column=getattr(g, "_destination", None), + metadata={"strict": strict}, + ) + + +def _validate_cypher( + g: Plottable, + query: str, + *, + params: Optional[Mapping[str, Any]], + strict: bool, +) -> Dict[str, Any]: + parsed = parse_cypher(query) + if strict: + strict_ctx = PlanContext(catalog=_build_schema_catalog(g, strict=True)) + FrontendBinder().bind(parsed, strict_ctx, strict_name_resolution=True) + compiled = compile_cypher_query(parsed, params=params) + compiled_kind: Literal["query", "union", "graph"] = "query" + if isinstance(compiled, CompiledCypherUnionQuery): + compiled_kind = "union" + elif isinstance(compiled, CompiledCypherGraphQuery): + compiled_kind = "graph" + else: + compiled = cast(CompiledCypherQuery, compiled) + return { + "ok": True, + "query_type": "chain", + "language": "cypher", + "diagnostics": [], + "compiled_kind": compiled_kind, + } + + +def _coerce_non_string_query( + query: GFQLValidationQuery, + *, + where: Optional[Sequence[WhereComparison]], +) -> Union[ASTObject, ASTLet, Chain]: + where_param: Optional[List[WhereComparison]] = None + if where is not None: + if isinstance(where, (list, tuple)): + where_param = 
normalize_where_entries(where) + else: + raise ValueError(f"where must be a list of comparisons, got {type(where).__name__}") + + out: Union[ASTObject, ASTLet, Chain, dict, List[ASTObject], str] = query + if isinstance(out, dict) and out.get("type") == "Let": + out = ASTLet.from_json(out) + elif isinstance(out, dict) and "chain" in out: + chain_items: List[ASTObject] = [] + for item in cast(List[Any], out["chain"]): + if isinstance(item, dict): + chain_items.append(from_json(item)) + elif isinstance(item, ASTObject): + chain_items.append(item) + else: + raise TypeError(f"Unsupported chain entry type: {type(item)}") + dict_where = parse_where_json(cast(Any, out).get("where")) + if where_param is not None and dict_where: + raise ValueError("where cannot be combined with dict chain that already includes where") + effective_where = where_param if where_param is not None else dict_where + if not chain_items and effective_where: + raise ValueError("where requires at least one named node/edge step; empty chains have no aliases") + out = Chain(chain_items, where=effective_where) + elif isinstance(out, dict): + wrapped_dict: Dict[str, Any] = {} + for key, value in out.items(): + if isinstance(value, (ASTNode, ASTEdge)): + wrapped_dict[key] = Chain([value]) + else: + wrapped_dict[key] = value + out = ASTLet(wrapped_dict) # type: ignore[arg-type] + elif isinstance(out, Chain): + if where_param: + if out.where: + raise ValueError("where provided for Chain that already includes where") + out = Chain(out.chain, where=where_param) + elif isinstance(out, ASTObject): + out = Chain([out], where=where_param) + elif isinstance(out, list): + converted_query: List[ASTObject] = [] + for item in out: + if isinstance(item, dict): + converted_query.append(from_json(item)) + else: + converted_query.append(item) + if not converted_query and where_param: + raise ValueError("where requires at least one named node/edge step; empty chains have no aliases") + out = Chain(converted_query, 
where=where_param) + else: + raise TypeError( + f"Query must be ASTObject, List[ASTObject], Chain, ASTLet, dict, or string. " + f"Got {type(out).__name__}" + ) + + if isinstance(out, (Chain, ASTLet, ASTObject)): + return out + raise TypeError( + f"Query must be ASTObject, List[ASTObject], Chain, ASTLet, dict, or string. Got {type(out).__name__}" + ) + + +def _validate_non_string_query( + g: Plottable, + query: GFQLValidationQuery, + *, + where: Optional[Sequence[WhereComparison]], + collect_all: bool, +) -> Dict[str, Any]: + coerced = _coerce_non_string_query(query, where=where) + if isinstance(coerced, Chain): + if collect_all: + errors = validate_chain_schema(g, coerced.chain, collect_all=True) or [] + return { + "ok": len(errors) == 0, + "query_type": "chain", + "language": "gfql", + "diagnostics": [cast(Any, e).to_dict() for e in errors], + } + validate_chain_schema(g, coerced.chain, collect_all=False) + return { + "ok": True, + "query_type": "chain", + "language": "gfql", + "diagnostics": [], + } + + # For DAG/non-chain AST forms, preserve existing AST structural validation + # surface without introducing a new schema simulator for chain-let graphs. + if collect_all: + errors = cast(Any, coerced).validate(collect_all=True) or [] + return { + "ok": len(errors) == 0, + "query_type": "dag" if isinstance(coerced, ASTLet) else "single", + "language": "gfql", + "diagnostics": [cast(Any, e).to_dict() for e in errors], + } + cast(Any, coerced).validate(collect_all=False) + return { + "ok": True, + "query_type": "dag" if isinstance(coerced, ASTLet) else "single", + "language": "gfql", + "diagnostics": [], + } + + +def gfql_validate( + g: Plottable, + query: GFQLValidationQuery, + *, + where: Optional[Sequence[WhereComparison]] = None, + language: Optional[Literal["cypher", "gremlin"]] = None, + params: Optional[Mapping[str, Any]] = None, + strict: bool = False, + collect_all: bool = False, +) -> Dict[str, Any]: + """Validate a GFQL/Cypher query without executing it. 
+ + Returns structured diagnostics and never dispatches query execution operators. + """ + try: + if isinstance(query, str): + if where is not None: + raise ValueError("where cannot be combined with string queries; embed Cypher predicates in the query itself") + query_language = language or "cypher" + if query_language != "cypher": + raise GFQLValidationError( + ErrorCode.E108, + f"Unsupported GFQL string language '{query_language}'", + field="language", + value=query_language, + suggestion="Use language='cypher' for now; Gremlin string compilation is not implemented yet.", + language="gfql", + ) + if language is None and not _looks_like_cypher_query(query): + raise TypeError("Query must be ASTObject, List[ASTObject], Chain, ASTLet, or dict. Got str") + return _validate_cypher(g, query, params=params, strict=strict) + + if language is not None: + raise ValueError("language is only supported when query is a string") + if params is not None: + raise ValueError("params is only supported when query is a string") + return _validate_non_string_query(g, query, where=where, collect_all=collect_all) + except Exception as exc: + return { + "ok": False, + "query_type": "chain" if isinstance(query, str) else "single", + "language": "cypher" if isinstance(query, str) else "gfql", + "diagnostics": [_serialize_error(exc, stage="validate")], + } + diff --git a/graphistry/tests/compute/test_gfql_validate_only.py b/graphistry/tests/compute/test_gfql_validate_only.py new file mode 100644 index 0000000000..68933d422a --- /dev/null +++ b/graphistry/tests/compute/test_gfql_validate_only.py @@ -0,0 +1,75 @@ +import pandas as pd + +from graphistry.compute.ast import n +from graphistry.tests.test_compute import CGFull + + +def _mk_graph(): + nodes_df = pd.DataFrame( + { + "id": ["a", "b", "c"], + "label__Person": [True, True, False], + "name": ["Alice", "Bob", "Corp"], + "score": [3, 1, 2], + } + ) + edges_df = pd.DataFrame({"s": ["a", "b"], "d": ["b", "c"], "type": ["KNOWS", "WORKS_AT"]}) 
+ return CGFull().nodes(nodes_df, "id").edges(edges_df, "s", "d") + + +def test_gfql_validate_exists_on_public_api(): + g = CGFull() + assert hasattr(g, "gfql_validate") + assert callable(g.gfql_validate) + + +def test_gfql_validate_chain_success(): + g = _mk_graph() + report = g.gfql_validate([n({"name": "Alice"})]) + assert report["ok"] is True + assert report["language"] == "gfql" + assert report["query_type"] == "chain" + assert report["diagnostics"] == [] + + +def test_gfql_validate_chain_failure_collect_all(): + g = _mk_graph() + report = g.gfql_validate([n({"missing_col": "x"})], collect_all=True) + assert report["ok"] is False + assert report["language"] == "gfql" + assert report["diagnostics"] + assert report["diagnostics"][0]["code"] == "column-not-found" + + +def test_gfql_validate_cypher_success(): + g = _mk_graph() + report = g.gfql_validate( + "MATCH (p:Person) RETURN p.name AS name ORDER BY name DESC LIMIT $top_n", + params={"top_n": 2}, + ) + assert report["ok"] is True + assert report["language"] == "cypher" + assert report["query_type"] == "chain" + assert report["compiled_kind"] == "query" + assert report["diagnostics"] == [] + + +def test_gfql_validate_cypher_strict_reports_schema_errors(): + g = _mk_graph() + report = g.gfql_validate("MATCH (p:Employee) RETURN p.name AS name", strict=True) + assert report["ok"] is False + assert report["language"] == "cypher" + assert report["diagnostics"] + assert report["diagnostics"][0]["code"] == "column-not-found" + + +def test_gfql_validate_does_not_execute_query_operators(monkeypatch): + g = _mk_graph() + + def _should_not_run(*args, **kwargs): + raise AssertionError("execution path should not be called by gfql_validate") + + monkeypatch.setattr("graphistry.compute.chain.chain", _should_not_run) + report = g.gfql_validate([n({"name": "Alice"})]) + assert report["ok"] is True + From 768c4c7cfdeab47f5799a26dad6ea0087314d078 Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Tue, 5 May 2026 17:23:19 -0700 
Subject: [PATCH 02/17] Fix mypy tuple typing in gfql_validate schema catalog --- graphistry/compute/gfql_validate.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/graphistry/compute/gfql_validate.py b/graphistry/compute/gfql_validate.py index 8ec0a6d7d2..f9f02fc890 100644 --- a/graphistry/compute/gfql_validate.py +++ b/graphistry/compute/gfql_validate.py @@ -59,8 +59,8 @@ def _serialize_error(exc: Exception, *, stage: str) -> Dict[str, Any]: def _build_schema_catalog(g: Plottable, *, strict: bool) -> GraphSchemaCatalog: - node_columns = () - edge_columns = () + node_columns: Tuple[str, ...] = tuple() + edge_columns: Tuple[str, ...] = tuple() if getattr(g, "_nodes", None) is not None: node_columns = tuple(str(c) for c in cast(Any, g)._nodes.columns) if getattr(g, "_edges", None) is not None: @@ -260,4 +260,3 @@ def gfql_validate( "language": "cypher" if isinstance(query, str) else "gfql", "diagnostics": [_serialize_error(exc, stage="validate")], } - From d5ce7ee9f523151f928a3490b18593ad5fbee3c2 Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Tue, 5 May 2026 17:40:49 -0700 Subject: [PATCH 03/17] Extend GFQL prevalidation semantics across local and remote entrypoints --- CHANGELOG.md | 5 +- docs/source/gfql/cypher.rst | 2 + docs/source/gfql/validation/fundamentals.rst | 22 +++ graphistry/compute/chain_remote.py | 33 +++-- graphistry/compute/gfql_unified.py | 27 +++- graphistry/compute/gfql_validate.py | 131 +++++++++++++++++- .../tests/compute/test_chain_remote_v2.py | 19 ++- graphistry/tests/compute/test_gfql.py | 30 ++++ .../tests/compute/test_gfql_validate_only.py | 23 ++- 9 files changed, 266 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 66660197c7..dbdbb596a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,10 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - **Release process / deploy gate reminder**: Documented that tag-triggered PyPI publishes can pause in 
`waiting` on environment approval, and explicitly call out approving `Review deployments` for `pypi-release` before expecting the final PyPI job to complete. ### Added -- **GFQL/Cypher validate-only preflight API (#1320)**: Added `g.gfql_validate(...)` on `ComputeMixin` as a public no-execution validation entrypoint for GFQL chains/JSON-style queries and Cypher strings. The API returns structured diagnostics (`ok`, `diagnostics`, query/language metadata) instead of executing query operators. Cypher preflight runs parser+compiler checks and supports optional strict binder/schema mode (`strict=True`) using the bound graph schema catalog; chain/JSON preflight reuses existing `validate_chain_schema()` semantics (including `collect_all=True`). Added focused regression tests in `graphistry/tests/compute/test_gfql_validate_only.py` and docs updates in `docs/source/gfql/cypher.rst` + `docs/source/gfql/validation/fundamentals.rst`. +- **GFQL/Cypher validate-only preflight API (#1320)**: Added `g.gfql_validate(...)` on `ComputeMixin` as a public no-execution validation entrypoint for GFQL chains/JSON-style queries, Let/DAG queries, and Cypher strings. The API returns structured diagnostics (`ok`, `diagnostics`, query/language metadata) instead of executing query operators. Cypher preflight runs parser+compiler checks and supports optional strict binder/schema mode (`strict=True`) using the bound graph schema catalog; chain/JSON preflight reuses existing `validate_chain_schema()` semantics (including `collect_all=True`), and Let/DAG preflight now includes best-effort schema checks for direct chain-like bindings. + +### Changed +- **GFQL execution prevalidation semantics (#1320)**: `g.gfql(..., validate=True)` now runs local preflight validation before execution. `g.gfql_remote(..., validate=True)` now validates query payloads before implicit upload/network dispatch, so invalid queries fail locally prior to upload when possible. 
### Internal - **GFQL / Cypher reentry follow-through cleanup (#989, post-#1260 extraction)**: In `graphistry/compute/gfql/cypher/reentry/runtime.py`, free-form intermediate MATCH plan construction now routes through the whole-row/free-form `ReentryPlan` contract instead of scalar-only fallback tagging. This makes the dedicated runtime `plan.free_form` lane reachable again and removes incidental scalar-only-path dependence for free-form reentry dispatch. diff --git a/docs/source/gfql/cypher.rst b/docs/source/gfql/cypher.rst index 2357e30ea8..06c3f137d0 100644 --- a/docs/source/gfql/cypher.rst +++ b/docs/source/gfql/cypher.rst @@ -476,6 +476,8 @@ For a first-class preflight API on bound graphs, use ``g.gfql_validate(...)``: catches unsupported-but-valid query shapes in lowering. - Use ``g.gfql_validate(...)`` when you want a stable validate-only entrypoint that returns structured diagnostics and never executes query operators. +- Use ``g.gfql(..., validate=True)`` when you want execution guarded by a + local preflight check. - Use ``cypher_to_gfql()`` only when you specifically need a single GFQL ``Chain``. It is intentionally stricter than direct execution through ``g.gfql("...")``. diff --git a/docs/source/gfql/validation/fundamentals.rst b/docs/source/gfql/validation/fundamentals.rst index 2f582aab44..cea48a48e6 100644 --- a/docs/source/gfql/validation/fundamentals.rst +++ b/docs/source/gfql/validation/fundamentals.rst @@ -187,6 +187,28 @@ For mixed GFQL + Cypher preflight on a bound graph, use ``g.gfql_validate(...)`` if not cypher_report["ok"]: print(cypher_report["diagnostics"]) +``g.gfql_validate(...)`` also supports Let/DAG payloads. It performs AST structural +validation for the DAG and schema checks for direct chain-like binding steps +(``Chain``, ``Node``, ``Edge``, ``Call``) without executing operators. 
+ +Execution-time Preflight Toggles +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If you want execution plus a preflight gate, ``g.gfql`` exposes an explicit flag: + +.. code-block:: python + + # Run preflight first; execute only if preflight passes + result = g.gfql( + "MATCH (c:Customer) RETURN c.id AS id LIMIT $n", + params={"n": 10}, + validate=True, + ) + +For remote execution, ``g.gfql_remote(..., validate=True)`` runs local query +prevalidation before implicit upload/network execution, so invalid queries fail +before data upload when possible. + Error Collection ^^^^^^^^^^^^^^^^ diff --git a/graphistry/compute/chain_remote.py b/graphistry/compute/chain_remote.py index 1689eb7cf2..c24ce63da3 100644 --- a/graphistry/compute/chain_remote.py +++ b/graphistry/compute/chain_remote.py @@ -16,6 +16,7 @@ from graphistry.compute.chain import Chain from graphistry.compute.gfql.cypher.lowering import compile_cypher_query from graphistry.compute.gfql.cypher.parser import parse_cypher +from graphistry.compute.gfql_validate import gfql_validate as gfql_preflight_validate, raise_first_diagnostic from graphistry.io.metadata import deserialize_plottable_metadata from graphistry.models.compute.chain_remote import OutputTypeGraph, FormatType, output_types_graph from graphistry.utils.json import JSONVal @@ -136,18 +137,8 @@ def chain_remote_generic( self._pygraphistry.refresh() api_token = self.session.api_token - if not dataset_id: - dataset_id = self._dataset_id - - if not dataset_id: - self = self.upload(validate=validate) - dataset_id = self._dataset_id - if output_type not in output_types_graph: raise ValueError(f"Unknown output_type, expected one of {output_types_graph}, got: {output_type}") - - if not dataset_id: - raise ValueError("Missing dataset_id; either pass in, or call on g2=g1.plot(render='g') in api=3 mode ahead of time") # Resolve engine: auto -> pandas/cudf based on graph DataFrame type engine_resolved = resolve_engine(engine, self) @@ -201,8 +192,26 @@ def 
chain_remote_generic( else: raise TypeError(f"gfql_remote() query must be Chain, List, ASTLet, Dict, or str. Got {type(chain)}") - if validate and not is_let: - Chain.from_json(chain_json) + if validate: + report = gfql_preflight_validate( + self, + chain, + params=params, + collect_all=False, + schema=False, + ) + if not bool(report.get("ok", False)): + raise_first_diagnostic(report) + + if not dataset_id: + dataset_id = self._dataset_id + + if not dataset_id: + self = self.upload(validate=validate) + dataset_id = self._dataset_id + + if not dataset_id: + raise ValueError("Missing dataset_id; either pass in, or call on g2=g1.plot(render='g') in api=3 mode ahead of time") # --- Build request body (dual-field for backward compat) --- if is_let: diff --git a/graphistry/compute/gfql_unified.py b/graphistry/compute/gfql_unified.py index 310be0cda2..79cd9ca7ca 100644 --- a/graphistry/compute/gfql_unified.py +++ b/graphistry/compute/gfql_unified.py @@ -55,6 +55,7 @@ from graphistry.compute.typing import DataFrameT, SeriesT from graphistry.compute.util.generate_safe_column_name import generate_safe_column_name from graphistry.compute.validate.validate_schema import validate_chain_schema +from graphistry.compute.gfql_validate import gfql_validate as gfql_preflight_validate, raise_first_diagnostic from graphistry.otel import otel_traced, otel_detail_enabled logger = setup_logger(__name__) @@ -1589,6 +1590,7 @@ def gfql(self: Plottable, where: Optional[Sequence[WhereComparison]] = None, language: Optional[Literal["cypher", "gremlin"]] = None, params: Optional[Mapping[str, Any]] = None, + validate: bool = False, shortest_path_backend: str = "auto") -> Plottable: """ Execute a GFQL query - either a chain or a DAG @@ -1603,6 +1605,7 @@ def gfql(self: Plottable, :param where: Optional same-path constraints for list/Chain queries :param language: Optional string-query language selector. Defaults to ``"cypher"`` when ``query`` is a string. 
:param params: Optional parameter dictionary for string-query compilation + :param validate: When ``True``, run local preflight validation before execution via ``g.gfql_validate(...)``. :param shortest_path_backend: Backend for shortestPath execution: ``"auto"`` (default), ``"igraph"`` (require igraph, raise if missing), ``"cugraph"`` (require cugraph, raise if missing), or ``"bfs"`` (always use DataFrame BFS). ``"auto"`` tries @@ -1800,11 +1803,30 @@ def policy(context: PolicyContext) -> None: if where_param and isinstance(query, (dict, ASTLet)): raise ValueError("where must be provided inside dict chain under the 'where' key") + if not isinstance(query, str): + if language is not None: + raise ValueError("language is only supported when query is a string") + if params is not None: + raise ValueError("params is only supported when query is a string") if isinstance(query, str): if where_param: raise ValueError("where cannot be combined with string queries; embed Cypher predicates in the query itself") if language is None and not _looks_like_cypher_query(query): raise TypeError("Query must be ASTObject, List[ASTObject], Chain, ASTLet, or dict. 
Got str") + + if validate: + report = gfql_preflight_validate( + dispatch_self, + query, + where=where_param, + language=language, + params=params, + collect_all=False, + ) + if not bool(report.get("ok", False)): + raise_first_diagnostic(report) + + if isinstance(query, str): compiled_query = _compile_string_query(query, language=language, params=params) if isinstance(compiled_query, CompiledCypherGraphQuery): return _execute_graph_query(self, compiled_query, engine=engine, policy=expanded_policy, context=context) @@ -1812,11 +1834,6 @@ def policy(context: PolicyContext) -> None: if compiled_query.graph_bindings or compiled_query.use_ref: return _execute_query_with_graph_context(self, compiled_query, engine=engine, policy=expanded_policy, context=context) query = compiled_query.chain - else: - if language is not None: - raise ValueError("language is only supported when query is a string") - if params is not None: - raise ValueError("params is only supported when query is a string") if isinstance(query, dict) and query.get("type") == "Let": from .ast import ASTLet as _ASTLet diff --git a/graphistry/compute/gfql_validate.py b/graphistry/compute/gfql_validate.py index f9f02fc890..20507e21dc 100644 --- a/graphistry/compute/gfql_validate.py +++ b/graphistry/compute/gfql_validate.py @@ -6,9 +6,9 @@ from typing import Any, Dict, List, Literal, Mapping, Optional, Sequence, Tuple, Union, cast from graphistry.Plottable import Plottable -from graphistry.compute.ast import ASTLet, ASTObject, ASTNode, ASTEdge, from_json +from graphistry.compute.ast import ASTLet, ASTObject, ASTNode, ASTEdge, ASTCall, ASTRef, from_json from graphistry.compute.chain import Chain -from graphistry.compute.exceptions import ErrorCode, GFQLValidationError +from graphistry.compute.exceptions import ErrorCode, GFQLSyntaxError, GFQLValidationError from graphistry.compute.gfql.cypher.lowering import ( CompiledCypherGraphQuery, CompiledCypherQuery, @@ -147,6 +147,8 @@ def _coerce_non_string_query( if 
out.where: raise ValueError("where provided for Chain that already includes where") out = Chain(out.chain, where=where_param) + elif isinstance(out, ASTLet): + pass elif isinstance(out, ASTObject): out = Chain([out], where=where_param) elif isinstance(out, list): @@ -178,9 +180,26 @@ def _validate_non_string_query( *, where: Optional[Sequence[WhereComparison]], collect_all: bool, + schema: bool, ) -> Dict[str, Any]: coerced = _coerce_non_string_query(query, where=where) if isinstance(coerced, Chain): + if not schema: + if collect_all: + errors = cast(Any, coerced).validate(collect_all=True) or [] + return { + "ok": len(errors) == 0, + "query_type": "chain", + "language": "gfql", + "diagnostics": [cast(Any, e).to_dict() for e in errors], + } + cast(Any, coerced).validate(collect_all=False) + return { + "ok": True, + "query_type": "chain", + "language": "gfql", + "diagnostics": [], + } if collect_all: errors = validate_chain_schema(g, coerced.chain, collect_all=True) or [] return { @@ -197,8 +216,11 @@ def _validate_non_string_query( "diagnostics": [], } - # For DAG/non-chain AST forms, preserve existing AST structural validation - # surface without introducing a new schema simulator for chain-let graphs. + if isinstance(coerced, ASTLet): + return _validate_let_query(g, coerced, collect_all=collect_all, schema=schema) + + # For non-chain/non-let AST forms, preserve existing AST structural validation + # surface without introducing a new schema simulator. if collect_all: errors = cast(Any, coerced).validate(collect_all=True) or [] return { @@ -216,6 +238,65 @@ def _validate_non_string_query( } +def _validate_let_binding_schema_errors(g: Plottable, value: Any) -> List[Any]: + # Structural validation for AST forms is handled by ASTSerializable.validate(); + # this helper adds best-effort schema validation for bindings that execute + # directly against dataframe-like tables. 
+ errors: List[Any] = [] + + if isinstance(value, ASTLet): + for nested in value.bindings.values(): + errors.extend(_validate_let_binding_schema_errors(g, nested)) + return errors + + if isinstance(value, Chain): + return validate_chain_schema(g, value.chain, collect_all=True) or [] + + if isinstance(value, (ASTNode, ASTEdge, ASTCall)): + return validate_chain_schema(g, [value], collect_all=True) or [] + + # ASTRef bindings execute against prior DAG bindings and may have schema + # transformations not visible from root graph statically; keep structural + # checks only to avoid false positives. + if isinstance(value, ASTRef): + return [] + + return [] + + +def _validate_let_query( + g: Plottable, + let_query: ASTLet, + *, + collect_all: bool, + schema: bool, +) -> Dict[str, Any]: + if collect_all: + errors = cast(Any, let_query).validate(collect_all=True) or [] + if schema: + for value in let_query.bindings.values(): + errors.extend(_validate_let_binding_schema_errors(g, value)) + return { + "ok": len(errors) == 0, + "query_type": "dag", + "language": "gfql", + "diagnostics": [cast(Any, e).to_dict() for e in errors], + } + + cast(Any, let_query).validate(collect_all=False) + if schema: + for value in let_query.bindings.values(): + binding_errors = _validate_let_binding_schema_errors(g, value) + if binding_errors: + raise cast(Any, binding_errors[0]) + return { + "ok": True, + "query_type": "dag", + "language": "gfql", + "diagnostics": [], + } + + def gfql_validate( g: Plottable, query: GFQLValidationQuery, @@ -225,6 +306,7 @@ def gfql_validate( params: Optional[Mapping[str, Any]] = None, strict: bool = False, collect_all: bool = False, + schema: bool = True, ) -> Dict[str, Any]: """Validate a GFQL/Cypher query without executing it. 
@@ -252,7 +334,7 @@ def gfql_validate( raise ValueError("language is only supported when query is a string") if params is not None: raise ValueError("params is only supported when query is a string") - return _validate_non_string_query(g, query, where=where, collect_all=collect_all) + return _validate_non_string_query(g, query, where=where, collect_all=collect_all, schema=schema) except Exception as exc: return { "ok": False, @@ -260,3 +342,42 @@ def gfql_validate( "language": "cypher" if isinstance(query, str) else "gfql", "diagnostics": [_serialize_error(exc, stage="validate")], } + + +def raise_first_diagnostic(report: Mapping[str, Any]) -> None: + diagnostics = report.get("diagnostics") + if not isinstance(diagnostics, list) or len(diagnostics) == 0: + raise GFQLValidationError( + ErrorCode.E108, + "GFQL validation failed without diagnostic details", + language=cast(Any, report.get("language")), + ) + + first = diagnostics[0] + if not isinstance(first, dict): + raise GFQLValidationError( + ErrorCode.E108, + "GFQL validation failed with invalid diagnostic payload", + value=first, + language=cast(Any, report.get("language")), + ) + + code = cast(Any, first.get("code")) or ErrorCode.E108 + message = cast(Any, first.get("message")) or "GFQL validation failed" + + # Keep core structured keys explicit and pass the rest through as context. 
+ extra = { + key: value + for key, value in first.items() + if key not in {"code", "message", "field", "value", "suggestion", "operation_index"} + } + exc_cls = GFQLSyntaxError if code == ErrorCode.E107 else GFQLValidationError + raise exc_cls( + code, + message, + field=cast(Optional[str], first.get("field")), + value=first.get("value"), + suggestion=cast(Optional[str], first.get("suggestion")), + operation_index=cast(Optional[int], first.get("operation_index")), + **extra, + ) diff --git a/graphistry/tests/compute/test_chain_remote_v2.py b/graphistry/tests/compute/test_chain_remote_v2.py index 1566e332d2..f0e9b4c0fe 100644 --- a/graphistry/tests/compute/test_chain_remote_v2.py +++ b/graphistry/tests/compute/test_chain_remote_v2.py @@ -53,9 +53,9 @@ def __init__(self): self.branches = () -def _mock_plottable() -> MagicMock: +def _mock_plottable(dataset_id: str | None = "test-dataset-123") -> MagicMock: mock = MagicMock() - mock._dataset_id = "test-dataset-123" + mock._dataset_id = dataset_id mock._edges = pd.DataFrame({"s": [0], "d": [1]}) mock._nodes = pd.DataFrame({"id": [0, 1]}) mock._privacy = None @@ -271,3 +271,18 @@ def test_let_emits_warning(self) -> None: finally: _cr.warnings.warn = _orig # type: ignore assert any("Let/DAG" in str(a[0]) for a in captured) + + def test_validate_true_rejects_before_implicit_upload(self) -> None: + g = _mock_plottable(dataset_id=None) + + with patch("graphistry.compute.chain_remote.requests.post") as mock_post: + with pytest.raises(Exception): + chain_remote_generic( + g, + "MATCH (n RETURN n", + format="json", + validate=True, + ) + + g.upload.assert_not_called() + mock_post.assert_not_called() diff --git a/graphistry/tests/compute/test_gfql.py b/graphistry/tests/compute/test_gfql.py index 9e79e8bcbe..fd4f145bcf 100644 --- a/graphistry/tests/compute/test_gfql.py +++ b/graphistry/tests/compute/test_gfql.py @@ -1,6 +1,7 @@ import pandas as pd import pytest from typing import Any, Dict, List +from unittest.mock import patch 
from graphistry.compute.ast import ASTLet, ASTRef, n, e from graphistry.compute.chain import Chain from graphistry.compute.exceptions import ErrorCode, GFQLSyntaxError, GFQLValidationError @@ -258,6 +259,35 @@ def test_gfql_non_string_rejects_language_and_params(self): with pytest.raises(ValueError): g.gfql([n()], params={"x": 1}) + def test_gfql_validate_true_runs_preflight_before_compile(self): + g = _mk_people_company_graph3() + fake_report = { + "ok": False, + "query_type": "chain", + "language": "cypher", + "diagnostics": [ + {"code": ErrorCode.E108, "message": "synthetic preflight failure", "stage": "validate"} + ], + } + + with patch("graphistry.compute.gfql_unified.gfql_preflight_validate", return_value=fake_report): + with patch( + "graphistry.compute.gfql_unified._compile_string_query", + side_effect=AssertionError("compile should not run when preflight fails"), + ): + with pytest.raises(GFQLValidationError, match="synthetic preflight failure"): + g.gfql("MATCH (p) RETURN p", validate=True) + + def test_gfql_validate_false_skips_preflight(self): + g = _mk_people_company_graph3() + + with patch( + "graphistry.compute.gfql_unified.gfql_preflight_validate", + side_effect=AssertionError("preflight should not run when validate=False"), + ): + result = g.gfql([n()]) + assert result is not None + @pytest.mark.parametrize( ("direction", "expected"), [ diff --git a/graphistry/tests/compute/test_gfql_validate_only.py b/graphistry/tests/compute/test_gfql_validate_only.py index 68933d422a..633cfb0ce0 100644 --- a/graphistry/tests/compute/test_gfql_validate_only.py +++ b/graphistry/tests/compute/test_gfql_validate_only.py @@ -1,6 +1,7 @@ import pandas as pd -from graphistry.compute.ast import n +from graphistry.compute.ast import ASTLet, n +from graphistry.compute.chain import Chain from graphistry.tests.test_compute import CGFull @@ -73,3 +74,23 @@ def _should_not_run(*args, **kwargs): report = g.gfql_validate([n({"name": "Alice"})]) assert report["ok"] is True + 
+def test_gfql_validate_let_success(): + g = _mk_graph() + query = ASTLet({"people": Chain([n({"name": "Alice"})])}) + report = g.gfql_validate(query) + assert report["ok"] is True + assert report["language"] == "gfql" + assert report["query_type"] == "dag" + assert report["diagnostics"] == [] + + +def test_gfql_validate_let_schema_failure(): + g = _mk_graph() + query = ASTLet({"people": Chain([n({"missing_col": "x"})])}) + report = g.gfql_validate(query, collect_all=True) + assert report["ok"] is False + assert report["language"] == "gfql" + assert report["query_type"] == "dag" + assert report["diagnostics"] + assert report["diagnostics"][0]["code"] == "column-not-found" From ff4fa0c2fe0d98858912b7b85e94866986262275 Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Tue, 5 May 2026 17:46:47 -0700 Subject: [PATCH 04/17] Docs: clarify GFQL pre-exec validation entrypoints and coverage --- docs/source/gfql/validation/fundamentals.rst | 61 ++++++++++++-------- 1 file changed, 38 insertions(+), 23 deletions(-) diff --git a/docs/source/gfql/validation/fundamentals.rst b/docs/source/gfql/validation/fundamentals.rst index cea48a48e6..2b997d74dd 100644 --- a/docs/source/gfql/validation/fundamentals.rst +++ b/docs/source/gfql/validation/fundamentals.rst @@ -152,24 +152,23 @@ GFQL validates automatically - just write your queries and run them: Pre-Execution Validation Options ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Use ``validate_chain_schema()`` to check compatibility without running the query, then execute separately: +Use the inline GFQL entrypoints first: -.. code-block:: python - - from graphistry.compute.validate_schema import validate_chain_schema +1. ``g.gfql_validate(...)`` for validate-only preflight (no execution) +2. ``g.gfql(..., validate=True)`` for preflight + execution +3. 
``validate_chain_schema()`` for low-level chain-schema checks only - # Step 1: Validate (no execution) - try: - validate_chain_schema(g, chain) # Only validates, doesn't execute - print("Chain is valid for this graph schema") - except GFQLSchemaError as e: - print(f"Schema incompatibility: {e}") +``g.gfql_validate(...)`` (validate-only, no execution) supports: - # Step 2: Execute (after validation passes) - result = g.gfql(chain.chain) - print(f"Query executed: {len(result._nodes)} nodes") +* **Query protocols**: Cypher strings, GFQL JSON/AST chain forms, and Let/DAG payloads +* **Predicate + structural validation**: yes (for all supported protocols) +* **Schema validation**: -For mixed GFQL + Cypher preflight on a bound graph, use ``g.gfql_validate(...)``: + * Chain/list/AST chain-like forms: yes (default ``schema=True``) + * Let/DAG: structural validation for the DAG + schema checks for direct chain-like bindings + (``Chain``, ``Node``, ``Edge``, ``Call``); ``Ref`` bindings stay structural-only + * Cypher strings: syntax/compile validation by default; use ``strict=True`` for schema-aware + name-resolution checks against the bound graph schema .. code-block:: python @@ -182,19 +181,15 @@ For mixed GFQL + Cypher preflight on a bound graph, use ``g.gfql_validate(...)`` cypher_report = g.gfql_validate( "MATCH (c:Customer) RETURN c.id AS id LIMIT $n", params={"n": 10}, - strict=True, + strict=True, # enable schema-aware Cypher name checks ) if not cypher_report["ok"]: print(cypher_report["diagnostics"]) -``g.gfql_validate(...)`` also supports Let/DAG payloads. It performs AST structural -validation for the DAG and schema checks for direct chain-like binding steps -(``Chain``, ``Node``, ``Edge``, ``Call``) without executing operators. 
- -Execution-time Preflight Toggles -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -If you want execution plus a preflight gate, ``g.gfql`` exposes an explicit flag: +``g.gfql(..., validate=True)`` supports the same Cypher + GFQL JSON/AST + Let query +inputs as ``g.gfql(...)``, runs local preflight first, and executes only when preflight +passes. Its preflight uses ``g.gfql_validate(...)`` defaults, so chain/JSON/AST/Let paths +include schema checks, while Cypher uses syntax/compile preflight (not strict schema binding). .. code-block:: python @@ -205,6 +200,26 @@ If you want execution plus a preflight gate, ``g.gfql`` exposes an explicit flag validate=True, ) +Use ``validate_chain_schema()`` when you specifically want the low-level chain-schema helper: + +.. code-block:: python + + from graphistry.compute.validate_schema import validate_chain_schema + + # Step 1: Validate (no execution) + try: + validate_chain_schema(g, chain) # Only validates, doesn't execute + print("Chain is valid for this graph schema") + except GFQLSchemaError as e: + print(f"Schema incompatibility: {e}") + + # Step 2: Execute (after validation passes) + result = g.gfql(chain.chain) + print(f"Query executed: {len(result._nodes)} nodes") + +Execution-time Preflight Toggles +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + For remote execution, ``g.gfql_remote(..., validate=True)`` runs local query prevalidation before implicit upload/network execution, so invalid queries fail before data upload when possible. 
From a2e1ff463d1ae04207abc6ac55c22e838a6229f6 Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Tue, 5 May 2026 17:47:42 -0700 Subject: [PATCH 05/17] Docs: prioritize inline Cypher preflight and document remote validate --- docs/source/gfql/cypher.rst | 48 ++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/docs/source/gfql/cypher.rst b/docs/source/gfql/cypher.rst index 06c3f137d0..da596007e1 100644 --- a/docs/source/gfql/cypher.rst +++ b/docs/source/gfql/cypher.rst @@ -440,24 +440,7 @@ Static Validation / Preflight Check ----------------------------------- If you want to know whether a query fits the current Cypher-in-GFQL subset before -execution, preflight it with the helper APIs: - -.. code-block:: python - - from graphistry.compute.exceptions import GFQLSyntaxError, GFQLValidationError - from graphistry.compute.gfql.cypher import parse_cypher, compile_cypher - - query = "MATCH (p:Person) RETURN p.name AS name" - - try: - parse_cypher(query) # grammar + AST checks - compile_cypher(query) # GFQL Cypher compiler / lowering checks - except GFQLSyntaxError as exc: - print("Invalid Cypher syntax for g.gfql(\"MATCH ...\"):", exc) - except GFQLValidationError as exc: - print("Valid Cypher, but outside the current GFQL Cypher surface:", exc) - -For a first-class preflight API on bound graphs, use ``g.gfql_validate(...)``: +execution, start with the bound-graph inline preflight APIs: .. code-block:: python @@ -471,22 +454,43 @@ For a first-class preflight API on bound graphs, use ``g.gfql_validate(...)``: for diag in report["diagnostics"]: print(diag["code"], diag["message"], diag.get("field")) -- Use ``parse_cypher()`` when you only want syntax and AST validation. -- Use ``compile_cypher()`` for the strongest compiler preflight, because it also - catches unsupported-but-valid query shapes in lowering. 
- Use ``g.gfql_validate(...)`` when you want a stable validate-only entrypoint that returns structured diagnostics and never executes query operators. - Use ``g.gfql(..., validate=True)`` when you want execution guarded by a local preflight check. +- Use ``g.gfql_remote(..., validate=True)`` when you want remote execution + guarded by local preflight before upload/network dispatch. +- Use ``parse_cypher()`` when you only want grammar/AST validation and access + to the parsed representation. +- Use ``compile_cypher()`` when you need low-level compiler/lowering output for + tooling or whitebox inspection. - Use ``cypher_to_gfql()`` only when you specifically need a single GFQL ``Chain``. It is intentionally stricter than direct execution through ``g.gfql("...")``. +Low-level helper example: + +.. code-block:: python + + from graphistry.compute.exceptions import GFQLSyntaxError, GFQLValidationError + from graphistry.compute.gfql.cypher import parse_cypher, compile_cypher + + query = "MATCH (p:Person) RETURN p.name AS name" + + try: + parsed = parse_cypher(query) # grammar + AST checks + compiled = compile_cypher(query) # compiler/lowering checks + except GFQLSyntaxError as exc: + print("Invalid Cypher syntax for g.gfql(\"MATCH ...\"):", exc) + except GFQLValidationError as exc: + print("Valid Cypher, but outside the current GFQL Cypher surface:", exc) + Common Rewrites --------------- - Need remote execution on Graphistry infrastructure instead of running against - the current bound graph? Prefer ``g.gfql_remote([...])`` for remote GFQL. + the current bound graph? Prefer ``g.gfql_remote(...)`` for remote GFQL, and + keep ``validate=True`` (default) for local preflight before upload. - Need remote database Cypher against Neo4j/Bolt-style backends instead of remote GFQL? Use ``graphistry.cypher("...")`` or ``g.cypher("...")``. - Need a pure GFQL chain object? 
Use ``cypher_to_gfql()`` when the query fits a From 1c520738b2c9e8221272aa59cfbb1e8cdd42a4e7 Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Tue, 5 May 2026 17:50:14 -0700 Subject: [PATCH 06/17] Align GFQL preflight defaults with strict local validation --- docs/source/gfql/cypher.rst | 8 +++++--- docs/source/gfql/validation/fundamentals.rst | 13 +++++++------ graphistry/compute/chain_remote.py | 1 + graphistry/compute/gfql_unified.py | 1 + graphistry/compute/gfql_validate.py | 2 +- graphistry/tests/compute/test_gfql.py | 8 ++++++++ graphistry/tests/compute/test_gfql_validate_only.py | 12 ++++++++++-- 7 files changed, 33 insertions(+), 12 deletions(-) diff --git a/docs/source/gfql/cypher.rst b/docs/source/gfql/cypher.rst index da596007e1..fcc25b565a 100644 --- a/docs/source/gfql/cypher.rst +++ b/docs/source/gfql/cypher.rst @@ -447,7 +447,7 @@ execution, start with the bound-graph inline preflight APIs: report = g.gfql_validate( "MATCH (p:Person) RETURN p.name AS name ORDER BY name DESC LIMIT $top_n", params={"top_n": 5}, - strict=True, # optional strict binder/schema mode + # strict=True is the default for local bound-graph preflight ) if not report["ok"]: @@ -457,9 +457,11 @@ execution, start with the bound-graph inline preflight APIs: - Use ``g.gfql_validate(...)`` when you want a stable validate-only entrypoint that returns structured diagnostics and never executes query operators. - Use ``g.gfql(..., validate=True)`` when you want execution guarded by a - local preflight check. + local preflight check. For Cypher strings, this uses schema-aware strict + preflight by default. - Use ``g.gfql_remote(..., validate=True)`` when you want remote execution - guarded by local preflight before upload/network dispatch. + guarded by local preflight before upload/network dispatch. For Cypher strings, + remote preflight uses ``strict=False`` by default because remote schema is authoritative. 
- Use ``parse_cypher()`` when you only want grammar/AST validation and access to the parsed representation. - Use ``compile_cypher()`` when you need low-level compiler/lowering output for diff --git a/docs/source/gfql/validation/fundamentals.rst b/docs/source/gfql/validation/fundamentals.rst index 2b997d74dd..ae92c7584c 100644 --- a/docs/source/gfql/validation/fundamentals.rst +++ b/docs/source/gfql/validation/fundamentals.rst @@ -167,8 +167,9 @@ Use the inline GFQL entrypoints first: * Chain/list/AST chain-like forms: yes (default ``schema=True``) * Let/DAG: structural validation for the DAG + schema checks for direct chain-like bindings (``Chain``, ``Node``, ``Edge``, ``Call``); ``Ref`` bindings stay structural-only - * Cypher strings: syntax/compile validation by default; use ``strict=True`` for schema-aware - name-resolution checks against the bound graph schema + * Cypher strings: syntax/compile + schema-aware name-resolution checks against + the bound graph schema by default (``strict=True``); pass ``strict=False`` + for syntax/compile-only preflight .. code-block:: python @@ -181,15 +182,14 @@ Use the inline GFQL entrypoints first: cypher_report = g.gfql_validate( "MATCH (c:Customer) RETURN c.id AS id LIMIT $n", params={"n": 10}, - strict=True, # enable schema-aware Cypher name checks ) if not cypher_report["ok"]: print(cypher_report["diagnostics"]) ``g.gfql(..., validate=True)`` supports the same Cypher + GFQL JSON/AST + Let query inputs as ``g.gfql(...)``, runs local preflight first, and executes only when preflight -passes. Its preflight uses ``g.gfql_validate(...)`` defaults, so chain/JSON/AST/Let paths -include schema checks, while Cypher uses syntax/compile preflight (not strict schema binding). +passes. Its preflight uses ``g.gfql_validate(...)`` defaults, so chain/JSON/AST/Let and +Cypher paths all run schema-aware checks by default on local bound-graph execution. .. 
code-block:: python @@ -222,7 +222,8 @@ Execution-time Preflight Toggles For remote execution, ``g.gfql_remote(..., validate=True)`` runs local query prevalidation before implicit upload/network execution, so invalid queries fail -before data upload when possible. +before data upload when possible. For Cypher strings, remote prevalidation uses +``strict=False`` by default because the authoritative schema is on the remote dataset. Error Collection ^^^^^^^^^^^^^^^^ diff --git a/graphistry/compute/chain_remote.py b/graphistry/compute/chain_remote.py index c24ce63da3..47f76cc098 100644 --- a/graphistry/compute/chain_remote.py +++ b/graphistry/compute/chain_remote.py @@ -197,6 +197,7 @@ def chain_remote_generic( self, chain, params=params, + strict=False, collect_all=False, schema=False, ) diff --git a/graphistry/compute/gfql_unified.py b/graphistry/compute/gfql_unified.py index 79cd9ca7ca..f20a8815b6 100644 --- a/graphistry/compute/gfql_unified.py +++ b/graphistry/compute/gfql_unified.py @@ -1821,6 +1821,7 @@ def policy(context: PolicyContext) -> None: where=where_param, language=language, params=params, + strict=True, collect_all=False, ) if not bool(report.get("ok", False)): diff --git a/graphistry/compute/gfql_validate.py b/graphistry/compute/gfql_validate.py index 20507e21dc..27c695c00c 100644 --- a/graphistry/compute/gfql_validate.py +++ b/graphistry/compute/gfql_validate.py @@ -304,7 +304,7 @@ def gfql_validate( where: Optional[Sequence[WhereComparison]] = None, language: Optional[Literal["cypher", "gremlin"]] = None, params: Optional[Mapping[str, Any]] = None, - strict: bool = False, + strict: bool = True, collect_all: bool = False, schema: bool = True, ) -> Dict[str, Any]: diff --git a/graphistry/tests/compute/test_gfql.py b/graphistry/tests/compute/test_gfql.py index fd4f145bcf..11fcbda5e4 100644 --- a/graphistry/tests/compute/test_gfql.py +++ b/graphistry/tests/compute/test_gfql.py @@ -288,6 +288,14 @@ def test_gfql_validate_false_skips_preflight(self): 
result = g.gfql([n()]) assert result is not None + def test_gfql_validate_true_catches_cypher_schema_errors_by_default(self): + g = _mk_people_company_graph3() + + with pytest.raises(GFQLValidationError) as exc_info: + g.gfql("MATCH (p:Employee) RETURN p.id AS id", validate=True) + + assert exc_info.value.code == ErrorCode.E301 + @pytest.mark.parametrize( ("direction", "expected"), [ diff --git a/graphistry/tests/compute/test_gfql_validate_only.py b/graphistry/tests/compute/test_gfql_validate_only.py index 633cfb0ce0..372f538a5d 100644 --- a/graphistry/tests/compute/test_gfql_validate_only.py +++ b/graphistry/tests/compute/test_gfql_validate_only.py @@ -55,15 +55,23 @@ def test_gfql_validate_cypher_success(): assert report["diagnostics"] == [] -def test_gfql_validate_cypher_strict_reports_schema_errors(): +def test_gfql_validate_cypher_default_reports_schema_errors(): g = _mk_graph() - report = g.gfql_validate("MATCH (p:Employee) RETURN p.name AS name", strict=True) + report = g.gfql_validate("MATCH (p:Employee) RETURN p.name AS name") assert report["ok"] is False assert report["language"] == "cypher" assert report["diagnostics"] assert report["diagnostics"][0]["code"] == "column-not-found" +def test_gfql_validate_cypher_can_disable_strict_schema_checks(): + g = _mk_graph() + report = g.gfql_validate("MATCH (p:Employee) RETURN p.name AS name", strict=False) + assert report["ok"] is True + assert report["language"] == "cypher" + assert report["diagnostics"] == [] + + def test_gfql_validate_does_not_execute_query_operators(monkeypatch): g = _mk_graph() From 124623600803dae074577813f873d48937f68101 Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Tue, 5 May 2026 17:51:45 -0700 Subject: [PATCH 07/17] Docs: use user-facing GFQL input terminology --- docs/source/gfql/cypher.rst | 4 ++-- docs/source/gfql/validation/fundamentals.rst | 25 ++++++++++---------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/docs/source/gfql/cypher.rst 
b/docs/source/gfql/cypher.rst index fcc25b565a..76abf5c379 100644 --- a/docs/source/gfql/cypher.rst +++ b/docs/source/gfql/cypher.rst @@ -462,8 +462,8 @@ execution, start with the bound-graph inline preflight APIs: - Use ``g.gfql_remote(..., validate=True)`` when you want remote execution guarded by local preflight before upload/network dispatch. For Cypher strings, remote preflight uses ``strict=False`` by default because remote schema is authoritative. -- Use ``parse_cypher()`` when you only want grammar/AST validation and access - to the parsed representation. +- Use ``parse_cypher()`` when you only want grammar validation and access to + the parsed representation. - Use ``compile_cypher()`` when you need low-level compiler/lowering output for tooling or whitebox inspection. - Use ``cypher_to_gfql()`` only when you specifically need a single GFQL diff --git a/docs/source/gfql/validation/fundamentals.rst b/docs/source/gfql/validation/fundamentals.rst index ae92c7584c..8ea0c04bfd 100644 --- a/docs/source/gfql/validation/fundamentals.rst +++ b/docs/source/gfql/validation/fundamentals.rst @@ -160,16 +160,17 @@ Use the inline GFQL entrypoints first: ``g.gfql_validate(...)`` (validate-only, no execution) supports: -* **Query protocols**: Cypher strings, GFQL JSON/AST chain forms, and Let/DAG payloads -* **Predicate + structural validation**: yes (for all supported protocols) +* **Input forms**: Cypher strings, GFQL JSON payloads, and GFQL Python objects + (for example ``Chain(...)``, ``[n(), e(), n()]``, and ``ASTLet(...)``) +* **Predicate + structural validation**: yes * **Schema validation**: - * Chain/list/AST chain-like forms: yes (default ``schema=True``) - * Let/DAG: structural validation for the DAG + schema checks for direct chain-like bindings - (``Chain``, ``Node``, ``Edge``, ``Call``); ``Ref`` bindings stay structural-only - * Cypher strings: syntax/compile + schema-aware name-resolution checks against - the bound graph schema by default (``strict=True``); 
pass ``strict=False`` - for syntax/compile-only preflight + * GFQL JSON and GFQL Python chain-like forms: yes (default ``schema=True``) + * GFQL Let/DAG forms: DAG structure + schema checks for direct graph-bound + steps; reference-based steps stay structural-only + * Cypher strings: syntax/compile + schema-aware name checks against the bound + graph schema by default (``strict=True``); pass ``strict=False`` for + syntax/compile-only preflight .. code-block:: python @@ -186,10 +187,10 @@ Use the inline GFQL entrypoints first: if not cypher_report["ok"]: print(cypher_report["diagnostics"]) -``g.gfql(..., validate=True)`` supports the same Cypher + GFQL JSON/AST + Let query -inputs as ``g.gfql(...)``, runs local preflight first, and executes only when preflight -passes. Its preflight uses ``g.gfql_validate(...)`` defaults, so chain/JSON/AST/Let and -Cypher paths all run schema-aware checks by default on local bound-graph execution. +``g.gfql(..., validate=True)`` accepts the same query inputs as ``g.gfql(...)`` +(Cypher string, GFQL JSON, GFQL Python objects), runs local preflight first, and +executes only when preflight passes. Its preflight uses ``g.gfql_validate(...)`` +defaults, so local bound-graph execution runs schema-aware checks by default. .. 
code-block:: python From f6cb12bcc12fd2c4e58a3b4775c235cff1ffa2ef Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Tue, 5 May 2026 17:59:00 -0700 Subject: [PATCH 08/17] Unify GFQL validate=True semantics and input-type surfaces --- docs/source/gfql/cypher.rst | 3 +-- docs/source/gfql/validation/fundamentals.rst | 3 +-- graphistry/compute/ComputeMixin.py | 2 +- graphistry/compute/chain_remote.py | 8 ++++---- graphistry/compute/gfql_unified.py | 1 + .../tests/compute/test_chain_remote_v2.py | 18 ++++++++++++++++++ 6 files changed, 26 insertions(+), 9 deletions(-) diff --git a/docs/source/gfql/cypher.rst b/docs/source/gfql/cypher.rst index 76abf5c379..eb4647dd24 100644 --- a/docs/source/gfql/cypher.rst +++ b/docs/source/gfql/cypher.rst @@ -460,8 +460,7 @@ execution, start with the bound-graph inline preflight APIs: local preflight check. For Cypher strings, this uses schema-aware strict preflight by default. - Use ``g.gfql_remote(..., validate=True)`` when you want remote execution - guarded by local preflight before upload/network dispatch. For Cypher strings, - remote preflight uses ``strict=False`` by default because remote schema is authoritative. + guarded by local preflight before upload/network dispatch. - Use ``parse_cypher()`` when you only want grammar validation and access to the parsed representation. - Use ``compile_cypher()`` when you need low-level compiler/lowering output for diff --git a/docs/source/gfql/validation/fundamentals.rst b/docs/source/gfql/validation/fundamentals.rst index 8ea0c04bfd..05bb64e777 100644 --- a/docs/source/gfql/validation/fundamentals.rst +++ b/docs/source/gfql/validation/fundamentals.rst @@ -223,8 +223,7 @@ Execution-time Preflight Toggles For remote execution, ``g.gfql_remote(..., validate=True)`` runs local query prevalidation before implicit upload/network execution, so invalid queries fail -before data upload when possible. 
For Cypher strings, remote prevalidation uses -``strict=False`` by default because the authoritative schema is on the remote dataset. +before data upload when possible. Error Collection ^^^^^^^^^^^^^^^^ diff --git a/graphistry/compute/ComputeMixin.py b/graphistry/compute/ComputeMixin.py index 7134539821..c853523d73 100644 --- a/graphistry/compute/ComputeMixin.py +++ b/graphistry/compute/ComputeMixin.py @@ -596,7 +596,7 @@ def gfql_remote( def gfql_remote_shape( self, - chain: Union[Chain, List[ASTObject], Dict[str, JSONVal]], + chain: Union[Chain, List[ASTObject], ASTLet, Dict[str, JSONVal], str], api_token: Optional[str] = None, dataset_id: Optional[str] = None, format: Optional[FormatType] = None, diff --git a/graphistry/compute/chain_remote.py b/graphistry/compute/chain_remote.py index 47f76cc098..dcda340634 100644 --- a/graphistry/compute/chain_remote.py +++ b/graphistry/compute/chain_remote.py @@ -197,9 +197,9 @@ def chain_remote_generic( self, chain, params=params, - strict=False, + strict=True, collect_all=False, - schema=False, + schema=True, ) if not bool(report.get("ok", False)): raise_first_diagnostic(report) @@ -514,8 +514,8 @@ def chain_remote( Uses the latest bound `_dataset_id`, and uploads current dataset if not already bound. Note that rebinding calls of `edges()` and `nodes()` reset the `_dataset_id` binding. - :param chain: GFQL chain query as a Python object or in serialized JSON format - :type chain: Union[Chain, List[ASTObject], Dict[str, JSONVal]] + :param chain: GFQL query as a Python object, serialized GFQL JSON, or Cypher string + :type chain: Union[Chain, List[ASTObject], Dict[str, JSONVal], ASTLet, str] :param api_token: Optional JWT token. If not provided, refreshes JWT and uses that. 
:type api_token: Optional[str] diff --git a/graphistry/compute/gfql_unified.py b/graphistry/compute/gfql_unified.py index f20a8815b6..7edf2cbae8 100644 --- a/graphistry/compute/gfql_unified.py +++ b/graphistry/compute/gfql_unified.py @@ -1822,6 +1822,7 @@ def policy(context: PolicyContext) -> None: language=language, params=params, strict=True, + schema=True, collect_all=False, ) if not bool(report.get("ok", False)): diff --git a/graphistry/tests/compute/test_chain_remote_v2.py b/graphistry/tests/compute/test_chain_remote_v2.py index f0e9b4c0fe..63bc822660 100644 --- a/graphistry/tests/compute/test_chain_remote_v2.py +++ b/graphistry/tests/compute/test_chain_remote_v2.py @@ -286,3 +286,21 @@ def test_validate_true_rejects_before_implicit_upload(self) -> None: g.upload.assert_not_called() mock_post.assert_not_called() + + def test_validate_true_uses_full_local_preflight(self) -> None: + g = _mock_plottable() + ok_report = {"ok": True, "query_type": "chain", "language": "gfql", "diagnostics": []} + + with patch("graphistry.compute.chain_remote.gfql_preflight_validate", return_value=ok_report) as mock_validate: + with patch("graphistry.compute.chain_remote.requests.post") as mock_post: + mock_post.return_value = _JSON_RESPONSE + chain_remote_generic( + g, + [ASTNode(filter_dict={"type": "Person"})], + format="json", + validate=True, + ) + + kwargs = mock_validate.call_args.kwargs + assert kwargs["strict"] is True + assert kwargs["schema"] is True From ae133781416624396f85cd9633a963090d12a3e8 Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Tue, 5 May 2026 18:13:24 -0700 Subject: [PATCH 09/17] Remote preflight: avoid strict/schema false negatives for validate=true --- docs/source/gfql/cypher.rst | 3 ++- docs/source/gfql/validation/fundamentals.rst | 3 ++- graphistry/compute/chain_remote.py | 4 ++-- graphistry/tests/compute/test_chain_remote_v2.py | 6 +++--- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/docs/source/gfql/cypher.rst 
b/docs/source/gfql/cypher.rst index eb4647dd24..76abf5c379 100644 --- a/docs/source/gfql/cypher.rst +++ b/docs/source/gfql/cypher.rst @@ -460,7 +460,8 @@ execution, start with the bound-graph inline preflight APIs: local preflight check. For Cypher strings, this uses schema-aware strict preflight by default. - Use ``g.gfql_remote(..., validate=True)`` when you want remote execution - guarded by local preflight before upload/network dispatch. + guarded by local preflight before upload/network dispatch. For Cypher strings, + remote preflight uses ``strict=False`` by default because remote schema is authoritative. - Use ``parse_cypher()`` when you only want grammar validation and access to the parsed representation. - Use ``compile_cypher()`` when you need low-level compiler/lowering output for diff --git a/docs/source/gfql/validation/fundamentals.rst b/docs/source/gfql/validation/fundamentals.rst index 05bb64e777..8ea0c04bfd 100644 --- a/docs/source/gfql/validation/fundamentals.rst +++ b/docs/source/gfql/validation/fundamentals.rst @@ -223,7 +223,8 @@ Execution-time Preflight Toggles For remote execution, ``g.gfql_remote(..., validate=True)`` runs local query prevalidation before implicit upload/network execution, so invalid queries fail -before data upload when possible. +before data upload when possible. For Cypher strings, remote prevalidation uses +``strict=False`` by default because the authoritative schema is on the remote dataset. 
Error Collection ^^^^^^^^^^^^^^^^ diff --git a/graphistry/compute/chain_remote.py b/graphistry/compute/chain_remote.py index dcda340634..4ad4e23b9c 100644 --- a/graphistry/compute/chain_remote.py +++ b/graphistry/compute/chain_remote.py @@ -197,9 +197,9 @@ def chain_remote_generic( self, chain, params=params, - strict=True, + strict=False, collect_all=False, - schema=True, + schema=False, ) if not bool(report.get("ok", False)): raise_first_diagnostic(report) diff --git a/graphistry/tests/compute/test_chain_remote_v2.py b/graphistry/tests/compute/test_chain_remote_v2.py index 63bc822660..b392f9b3ea 100644 --- a/graphistry/tests/compute/test_chain_remote_v2.py +++ b/graphistry/tests/compute/test_chain_remote_v2.py @@ -287,7 +287,7 @@ def test_validate_true_rejects_before_implicit_upload(self) -> None: g.upload.assert_not_called() mock_post.assert_not_called() - def test_validate_true_uses_full_local_preflight(self) -> None: + def test_validate_true_uses_remote_safe_local_preflight(self) -> None: g = _mock_plottable() ok_report = {"ok": True, "query_type": "chain", "language": "gfql", "diagnostics": []} @@ -302,5 +302,5 @@ def test_validate_true_uses_full_local_preflight(self) -> None: ) kwargs = mock_validate.call_args.kwargs - assert kwargs["strict"] is True - assert kwargs["schema"] is True + assert kwargs["strict"] is False + assert kwargs["schema"] is False From e915a51f6cf9bc7a48e8725ac204bbe06819aa29 Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Tue, 5 May 2026 18:18:14 -0700 Subject: [PATCH 10/17] Docs: make validation fundamentals cypher examples schema-agnostic --- docs/source/gfql/validation/fundamentals.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/gfql/validation/fundamentals.rst b/docs/source/gfql/validation/fundamentals.rst index 8ea0c04bfd..b2212812d0 100644 --- a/docs/source/gfql/validation/fundamentals.rst +++ b/docs/source/gfql/validation/fundamentals.rst @@ -181,7 +181,7 @@ Use the inline GFQL 
entrypoints first: # Cypher cypher_report = g.gfql_validate( - "MATCH (c:Customer) RETURN c.id AS id LIMIT $n", + "MATCH (c) RETURN c.id AS id LIMIT $n", params={"n": 10}, ) if not cypher_report["ok"]: @@ -196,7 +196,7 @@ defaults, so local bound-graph execution runs schema-aware checks by default. # Run preflight first; execute only if preflight passes result = g.gfql( - "MATCH (c:Customer) RETURN c.id AS id LIMIT $n", + "MATCH (c) RETURN c.id AS id LIMIT $n", params={"n": 10}, validate=True, ) From c9c2b8f5ab3027ae34a0dcc349a430772ea453bc Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Tue, 5 May 2026 19:22:11 -0700 Subject: [PATCH 11/17] Treat all gfql_validate string inputs as Cypher by default --- docs/source/gfql/validation/fundamentals.rst | 1 + graphistry/compute/gfql_validate.py | 13 ------------- graphistry/tests/compute/test_gfql_validate_only.py | 9 +++++++++ 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/docs/source/gfql/validation/fundamentals.rst b/docs/source/gfql/validation/fundamentals.rst index b2212812d0..b922799950 100644 --- a/docs/source/gfql/validation/fundamentals.rst +++ b/docs/source/gfql/validation/fundamentals.rst @@ -162,6 +162,7 @@ Use the inline GFQL entrypoints first: * **Input forms**: Cypher strings, GFQL JSON payloads, and GFQL Python objects (for example ``Chain(...)``, ``[n(), e(), n()]``, and ``ASTLet(...)``) + String inputs are always validated as Cypher (no separate string-shape precheck). 
* **Predicate + structural validation**: yes * **Schema validation**: diff --git a/graphistry/compute/gfql_validate.py b/graphistry/compute/gfql_validate.py index 27c695c00c..7a203b7584 100644 --- a/graphistry/compute/gfql_validate.py +++ b/graphistry/compute/gfql_validate.py @@ -2,7 +2,6 @@ from __future__ import annotations -import re from typing import Any, Dict, List, Literal, Mapping, Optional, Sequence, Tuple, Union, cast from graphistry.Plottable import Plottable @@ -28,16 +27,6 @@ GFQLValidationQuery = Union[ASTObject, List[ASTObject], ASTLet, Chain, dict, str] -_CYPHER_LEAD_RE = re.compile( - r"^\s*(?:MATCH|OPTIONAL\s+MATCH|WITH|RETURN|UNWIND|CALL|CREATE|MERGE|DELETE|DETACH\s+DELETE|SET|REMOVE|FOREACH|GRAPH|USE)\b", - re.IGNORECASE, -) - - -def _looks_like_cypher_query(query: str) -> bool: - return _CYPHER_LEAD_RE.match(query) is not None - - def _serialize_error(exc: Exception, *, stage: str) -> Dict[str, Any]: if hasattr(exc, "to_dict") and callable(getattr(exc, "to_dict")): out = cast(Dict[str, Any], exc.to_dict()) # GFQLValidationError surface @@ -326,8 +315,6 @@ def gfql_validate( suggestion="Use language='cypher' for now; Gremlin string compilation is not implemented yet.", language="gfql", ) - if language is None and not _looks_like_cypher_query(query): - raise TypeError("Query must be ASTObject, List[ASTObject], Chain, ASTLet, or dict. 
Got str") return _validate_cypher(g, query, params=params, strict=strict) if language is not None: diff --git a/graphistry/tests/compute/test_gfql_validate_only.py b/graphistry/tests/compute/test_gfql_validate_only.py index 372f538a5d..f5bb434a75 100644 --- a/graphistry/tests/compute/test_gfql_validate_only.py +++ b/graphistry/tests/compute/test_gfql_validate_only.py @@ -72,6 +72,15 @@ def test_gfql_validate_cypher_can_disable_strict_schema_checks(): assert report["diagnostics"] == [] +def test_gfql_validate_treats_all_strings_as_cypher(): + g = _mk_graph() + report = g.gfql_validate("hello world not cypher") + assert report["ok"] is False + assert report["language"] == "cypher" + assert report["diagnostics"] + assert "Got str" not in report["diagnostics"][0]["message"] + + def test_gfql_validate_does_not_execute_query_operators(monkeypatch): g = _mk_graph() From 945f78284c158b25ab3dc0b0fa913ca1ae96e8e3 Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Tue, 5 May 2026 19:33:49 -0700 Subject: [PATCH 12/17] Simplify string validation path across gfql entrypoints --- docs/source/gfql/validation/fundamentals.rst | 16 +++++++++++++++- graphistry/compute/gfql_unified.py | 13 ------------- graphistry/tests/compute/test_gfql.py | 8 ++++++++ .../tests/compute/test_gfql_validate_only.py | 9 +++++++++ 4 files changed, 32 insertions(+), 14 deletions(-) diff --git a/docs/source/gfql/validation/fundamentals.rst b/docs/source/gfql/validation/fundamentals.rst index b922799950..9178c2ad21 100644 --- a/docs/source/gfql/validation/fundamentals.rst +++ b/docs/source/gfql/validation/fundamentals.rst @@ -202,7 +202,13 @@ defaults, so local bound-graph execution runs schema-aware checks by default. validate=True, ) -Use ``validate_chain_schema()`` when you specifically want the low-level chain-schema helper: +Use ``validate_chain_schema()`` when you specifically want the low-level chain-schema helper. 
+It is intentionally narrower than ``g.gfql_validate(...)``: + +* validates chain operations against currently bound node/edge dataframe columns +* does **not** parse/compile Cypher strings +* does **not** run Let/DAG orchestration validation +* does **not** execute query operators .. code-block:: python @@ -227,6 +233,14 @@ prevalidation before implicit upload/network execution, so invalid queries fail before data upload when possible. For Cypher strings, remote prevalidation uses ``strict=False`` by default because the authoritative schema is on the remote dataset. +Grounded vs Ungrounded Validation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Schema checks are most useful when local graph tables are bound on ``g``. +If local node/edge tables are missing, GFQL JSON/AST chain validation can only +do structural/predicate checks, and column-existence checks are effectively +ungrounded. + Error Collection ^^^^^^^^^^^^^^^^ diff --git a/graphistry/compute/gfql_unified.py b/graphistry/compute/gfql_unified.py index 7edf2cbae8..f24908e8fe 100644 --- a/graphistry/compute/gfql_unified.py +++ b/graphistry/compute/gfql_unified.py @@ -2,7 +2,6 @@ # ruff: noqa: E501 from dataclasses import replace -import re from typing import Any, Dict, List, Literal, Mapping, Optional, Sequence, Tuple, Union, cast from graphistry.Plottable import Plottable from graphistry.Engine import Engine, EngineAbstract, df_concat, df_cons, resolve_engine, safe_merge @@ -63,16 +62,6 @@ _REENTRY_WHOLE_ROW_SUGGESTION = "Carry a whole-row node alias through WITH before MATCH re-entry." _REENTRY_SCALAR_SUGGESTION = "Carry scalar columns through WITH before MATCH re-entry." 
-_CYPHER_LEAD_RE = re.compile( - r"^\s*(?:MATCH|OPTIONAL\s+MATCH|WITH|RETURN|UNWIND|CALL|CREATE|MERGE|DELETE|DETACH\s+DELETE|SET|REMOVE|FOREACH|GRAPH|USE)\b", - re.IGNORECASE, -) - - -def _looks_like_cypher_query(query: str) -> bool: - return _CYPHER_LEAD_RE.match(query) is not None - - def _series_to_pylist(values: Any) -> List[Any]: if hasattr(values, "to_arrow"): try: @@ -1811,8 +1800,6 @@ def policy(context: PolicyContext) -> None: if isinstance(query, str): if where_param: raise ValueError("where cannot be combined with string queries; embed Cypher predicates in the query itself") - if language is None and not _looks_like_cypher_query(query): - raise TypeError("Query must be ASTObject, List[ASTObject], Chain, ASTLet, or dict. Got str") if validate: report = gfql_preflight_validate( diff --git a/graphistry/tests/compute/test_gfql.py b/graphistry/tests/compute/test_gfql.py index 11fcbda5e4..d3675060c4 100644 --- a/graphistry/tests/compute/test_gfql.py +++ b/graphistry/tests/compute/test_gfql.py @@ -296,6 +296,14 @@ def test_gfql_validate_true_catches_cypher_schema_errors_by_default(self): assert exc_info.value.code == ErrorCode.E301 + def test_gfql_validate_true_treats_all_strings_as_cypher(self): + g = _mk_people_company_graph3() + + with pytest.raises(GFQLSyntaxError) as exc_info: + g.gfql("hello world not cypher", validate=True) + + assert exc_info.value.code == ErrorCode.E107 + @pytest.mark.parametrize( ("direction", "expected"), [ diff --git a/graphistry/tests/compute/test_gfql_validate_only.py b/graphistry/tests/compute/test_gfql_validate_only.py index f5bb434a75..5c64c90c85 100644 --- a/graphistry/tests/compute/test_gfql_validate_only.py +++ b/graphistry/tests/compute/test_gfql_validate_only.py @@ -111,3 +111,12 @@ def test_gfql_validate_let_schema_failure(): assert report["query_type"] == "dag" assert report["diagnostics"] assert report["diagnostics"][0]["code"] == "column-not-found" + + +def 
test_gfql_validate_chain_without_bound_tables_is_structural_only(): + g = CGFull() + report = g.gfql_validate([n({"missing_col": "x"})]) + assert report["ok"] is True + assert report["language"] == "gfql" + assert report["query_type"] == "chain" + assert report["diagnostics"] == [] From d39ffbc88ba86c2e111e7871c7e58440a2b3a6f5 Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Tue, 5 May 2026 19:41:27 -0700 Subject: [PATCH 13/17] Align chain-let string error expectations with Cypher validation --- graphistry/tests/compute/test_chain_let.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/graphistry/tests/compute/test_chain_let.py b/graphistry/tests/compute/test_chain_let.py index ae336a6f1f..318868a4ab 100644 --- a/graphistry/tests/compute/test_chain_let.py +++ b/graphistry/tests/compute/test_chain_let.py @@ -14,7 +14,7 @@ detect_cycles, determine_execution_order ) from graphistry.compute.execution_context import ExecutionContext -from graphistry.compute.exceptions import GFQLTypeError +from graphistry.compute.exceptions import GFQLTypeError, GFQLSyntaxError, ErrorCode from graphistry.tests.test_compute import CGFull @@ -547,9 +547,9 @@ def test_invalid_dag_type(self): """Test helpful error when dag parameter is wrong type""" g = CGFull() - with pytest.raises(TypeError) as exc_info: + with pytest.raises(GFQLSyntaxError) as exc_info: g.gfql("not a dag") - assert "Query must be ASTObject, List[ASTObject], Chain, ASTLet, or dict" in str(exc_info.value) + assert exc_info.value.code == ErrorCode.E107 # When passed a dict, gfql creates an ASTLet which validates with pytest.raises(GFQLTypeError) as exc_info: @@ -1249,10 +1249,9 @@ def test_chain_let_validates(self): g = CGFull().edges(pd.DataFrame({'s': ['a'], 'd': ['b']}), 's', 'd') # Invalid DAG should raise during validation - with pytest.raises(TypeError) as exc_info: + with pytest.raises(GFQLSyntaxError) as exc_info: g.gfql("not a dag") - - assert "Query must be ASTObject, 
List[ASTObject], Chain, ASTLet, or dict" in str(exc_info.value) + assert exc_info.value.code == ErrorCode.E107 def test_chain_let_output_selection(self): """Test output parameter selects specific binding""" From 8f6c593d42cfc16fb70ccfa9c77ea9986a41d9ef Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Tue, 5 May 2026 19:47:30 -0700 Subject: [PATCH 14/17] Clarify GFQL string prevalidation semantics in changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dbdbb596a7..7e6d4c3685 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,7 +20,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - **GFQL/Cypher validate-only preflight API (#1320)**: Added `g.gfql_validate(...)` on `ComputeMixin` as a public no-execution validation entrypoint for GFQL chains/JSON-style queries, Let/DAG queries, and Cypher strings. The API returns structured diagnostics (`ok`, `diagnostics`, query/language metadata) instead of executing query operators. Cypher preflight runs parser+compiler checks and supports optional strict binder/schema mode (`strict=True`) using the bound graph schema catalog; chain/JSON preflight reuses existing `validate_chain_schema()` semantics (including `collect_all=True`), and Let/DAG preflight now includes best-effort schema checks for direct chain-like bindings. ### Changed -- **GFQL execution prevalidation semantics (#1320)**: `g.gfql(..., validate=True)` now runs local preflight validation before execution. `g.gfql_remote(..., validate=True)` now validates query payloads before implicit upload/network dispatch, so invalid queries fail locally prior to upload when possible. +- **GFQL execution prevalidation semantics (#1320)**: `g.gfql(..., validate=True)` now runs local preflight validation before execution. 
`g.gfql_remote(..., validate=True)` now validates query payloads before implicit upload/network dispatch, so invalid queries fail locally prior to upload when possible. String query inputs are now treated consistently as Cypher during preflight (`g.gfql_validate("...")` and `g.gfql("...", validate=True)`), so users get Cypher parser/compiler diagnostics instead of shape-guessing type errors. ### Internal - **GFQL / Cypher reentry follow-through cleanup (#989, post-#1260 extraction)**: In `graphistry/compute/gfql/cypher/reentry/runtime.py`, free-form intermediate MATCH plan construction now routes through the whole-row/free-form `ReentryPlan` contract instead of scalar-only fallback tagging. This makes the dedicated runtime `plan.free_form` lane reachable again and removes incidental scalar-only-path dependence for free-form reentry dispatch. From 80d2519e60b94c25ada016d4d651994a08c5809f Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Tue, 5 May 2026 19:53:47 -0700 Subject: [PATCH 15/17] Make gfql_validate fail-fast with structured exceptions --- CHANGELOG.md | 2 +- docs/source/gfql/cypher.rst | 10 +- docs/source/gfql/validation/fundamentals.rst | 32 +++-- docs/source/gfql/validation/llm.rst | 23 +++- graphistry/compute/chain_remote.py | 6 +- graphistry/compute/gfql_unified.py | 6 +- graphistry/compute/gfql_validate.py | 117 ++++++++++-------- graphistry/tests/compute/test_gfql.py | 14 +-- .../tests/compute/test_gfql_validate_only.py | 53 ++++---- 9 files changed, 153 insertions(+), 110 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7e6d4c3685..508eeac5bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,7 +20,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - **GFQL/Cypher validate-only preflight API (#1320)**: Added `g.gfql_validate(...)` on `ComputeMixin` as a public no-execution validation entrypoint for GFQL chains/JSON-style queries, Let/DAG queries, and Cypher strings. 
The API returns structured diagnostics (`ok`, `diagnostics`, query/language metadata) instead of executing query operators. Cypher preflight runs parser+compiler checks and supports optional strict binder/schema mode (`strict=True`) using the bound graph schema catalog; chain/JSON preflight reuses existing `validate_chain_schema()` semantics (including `collect_all=True`), and Let/DAG preflight now includes best-effort schema checks for direct chain-like bindings. ### Changed -- **GFQL execution prevalidation semantics (#1320)**: `g.gfql(..., validate=True)` now runs local preflight validation before execution. `g.gfql_remote(..., validate=True)` now validates query payloads before implicit upload/network dispatch, so invalid queries fail locally prior to upload when possible. String query inputs are now treated consistently as Cypher during preflight (`g.gfql_validate("...")` and `g.gfql("...", validate=True)`), so users get Cypher parser/compiler diagnostics instead of shape-guessing type errors. +- **GFQL execution prevalidation semantics (#1320)**: `g.gfql(..., validate=True)` now runs local preflight validation before execution. `g.gfql_remote(..., validate=True)` now validates query payloads before implicit upload/network dispatch, so invalid queries fail locally prior to upload when possible. String query inputs are now treated consistently as Cypher during preflight (`g.gfql_validate("...")` and `g.gfql("...", validate=True)`), so users get Cypher parser/compiler diagnostics instead of shape-guessing type errors. `g.gfql_validate(...)` now raises structured GFQL exceptions on invalid queries (instead of returning `ok=False`), and collect-all mode surfaces full diagnostics via exception context for LM/retry workflows. 
### Internal - **GFQL / Cypher reentry follow-through cleanup (#989, post-#1260 extraction)**: In `graphistry/compute/gfql/cypher/reentry/runtime.py`, free-form intermediate MATCH plan construction now routes through the whole-row/free-form `ReentryPlan` contract instead of scalar-only fallback tagging. This makes the dedicated runtime `plan.free_form` lane reachable again and removes incidental scalar-only-path dependence for free-form reentry dispatch. diff --git a/docs/source/gfql/cypher.rst b/docs/source/gfql/cypher.rst index 76abf5c379..d6b0fbfbec 100644 --- a/docs/source/gfql/cypher.rst +++ b/docs/source/gfql/cypher.rst @@ -444,18 +444,18 @@ execution, start with the bound-graph inline preflight APIs: .. code-block:: python - report = g.gfql_validate( + g.gfql_validate( "MATCH (p:Person) RETURN p.name AS name ORDER BY name DESC LIMIT $top_n", params={"top_n": 5}, # strict=True is the default for local bound-graph preflight ) - if not report["ok"]: - for diag in report["diagnostics"]: - print(diag["code"], diag["message"], diag.get("field")) + # On failure: + # - GFQLSyntaxError for invalid syntax + # - GFQLValidationError for unsupported/schema-invalid shapes - Use ``g.gfql_validate(...)`` when you want a stable validate-only entrypoint - that returns structured diagnostics and never executes query operators. + that never executes query operators and raises structured exceptions on invalid queries. - Use ``g.gfql(..., validate=True)`` when you want execution guarded by a local preflight check. For Cypher strings, this uses schema-aware strict preflight by default. diff --git a/docs/source/gfql/validation/fundamentals.rst b/docs/source/gfql/validation/fundamentals.rst index 9178c2ad21..394a627e2f 100644 --- a/docs/source/gfql/validation/fundamentals.rst +++ b/docs/source/gfql/validation/fundamentals.rst @@ -176,17 +176,31 @@ Use the inline GFQL entrypoints first: .. 
code-block:: python # Chain / JSON-style GFQL - report = g.gfql_validate([n({'type': 'customer'})], collect_all=True) - if not report["ok"]: - print(report["diagnostics"]) + g.gfql_validate([n({'type': 'customer'})], collect_all=True) # Cypher - cypher_report = g.gfql_validate( - "MATCH (c) RETURN c.id AS id LIMIT $n", - params={"n": 10}, - ) - if not cypher_report["ok"]: - print(cypher_report["diagnostics"]) + g.gfql_validate("MATCH (c) RETURN c.id AS id LIMIT $n", params={"n": 10}) + +Validation failures raise ``GFQLValidationError`` / ``GFQLSyntaxError`` with +structured, inspectable context: + +.. code-block:: python + + from graphistry.compute.exceptions import GFQLValidationError + + try: + g.gfql_validate([n({"missing_col": "x"})], collect_all=True) + except GFQLValidationError as exc: + payload = exc.to_dict() + # LM-friendly payload: + # { + # "code": "...", + # "message": "...", + # "query_type": "chain", + # "language": "gfql", + # "diagnostics": [...] + # } + print(payload) ``g.gfql(..., validate=True)`` accepts the same query inputs as ``g.gfql(...)`` (Cypher string, GFQL JSON, GFQL Python objects), runs local preflight first, and diff --git a/docs/source/gfql/validation/llm.rst b/docs/source/gfql/validation/llm.rst index d42516c1b2..a6c63f71c4 100644 --- a/docs/source/gfql/validation/llm.rst +++ b/docs/source/gfql/validation/llm.rst @@ -128,6 +128,27 @@ Combined Validation return {"success": True, "chain": chain} +Direct Preflight For Retry Loops +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For generate-validate-repair loops, you can run ``g.gfql_validate(...)`` and +convert raised exceptions into structured payloads: + +.. 
code-block:: python + + from graphistry.compute.exceptions import GFQLValidationError, GFQLSyntaxError + + def preflight_payload(g, query): + try: + g.gfql_validate(query, collect_all=True) + return {"ok": True} + except (GFQLValidationError, GFQLSyntaxError) as exc: + payload = exc.to_dict() + return { + "ok": False, + "error": payload, # includes code/message + diagnostics context + } + Automated Fix Suggestions ------------------------- @@ -181,4 +202,4 @@ See Also * :doc:`production` - Production patterns * :doc:`../spec/language` - Language specification -* :doc:`../spec/cypher_mapping` - Cypher to GFQL mapping \ No newline at end of file +* :doc:`../spec/cypher_mapping` - Cypher to GFQL mapping diff --git a/graphistry/compute/chain_remote.py b/graphistry/compute/chain_remote.py index 4ad4e23b9c..5c8faba995 100644 --- a/graphistry/compute/chain_remote.py +++ b/graphistry/compute/chain_remote.py @@ -16,7 +16,7 @@ from graphistry.compute.chain import Chain from graphistry.compute.gfql.cypher.lowering import compile_cypher_query from graphistry.compute.gfql.cypher.parser import parse_cypher -from graphistry.compute.gfql_validate import gfql_validate as gfql_preflight_validate, raise_first_diagnostic +from graphistry.compute.gfql_validate import gfql_validate as gfql_preflight_validate from graphistry.io.metadata import deserialize_plottable_metadata from graphistry.models.compute.chain_remote import OutputTypeGraph, FormatType, output_types_graph from graphistry.utils.json import JSONVal @@ -193,7 +193,7 @@ def chain_remote_generic( raise TypeError(f"gfql_remote() query must be Chain, List, ASTLet, Dict, or str. 
Got {type(chain)}") if validate: - report = gfql_preflight_validate( + gfql_preflight_validate( self, chain, params=params, @@ -201,8 +201,6 @@ def chain_remote_generic( collect_all=False, schema=False, ) - if not bool(report.get("ok", False)): - raise_first_diagnostic(report) if not dataset_id: dataset_id = self._dataset_id diff --git a/graphistry/compute/gfql_unified.py b/graphistry/compute/gfql_unified.py index f24908e8fe..9e77d8593a 100644 --- a/graphistry/compute/gfql_unified.py +++ b/graphistry/compute/gfql_unified.py @@ -54,7 +54,7 @@ from graphistry.compute.typing import DataFrameT, SeriesT from graphistry.compute.util.generate_safe_column_name import generate_safe_column_name from graphistry.compute.validate.validate_schema import validate_chain_schema -from graphistry.compute.gfql_validate import gfql_validate as gfql_preflight_validate, raise_first_diagnostic +from graphistry.compute.gfql_validate import gfql_validate as gfql_preflight_validate from graphistry.otel import otel_traced, otel_detail_enabled logger = setup_logger(__name__) @@ -1802,7 +1802,7 @@ def policy(context: PolicyContext) -> None: raise ValueError("where cannot be combined with string queries; embed Cypher predicates in the query itself") if validate: - report = gfql_preflight_validate( + gfql_preflight_validate( dispatch_self, query, where=where_param, @@ -1812,8 +1812,6 @@ def policy(context: PolicyContext) -> None: schema=True, collect_all=False, ) - if not bool(report.get("ok", False)): - raise_first_diagnostic(report) if isinstance(query, str): compiled_query = _compile_string_query(query, language=language, params=params) diff --git a/graphistry/compute/gfql_validate.py b/graphistry/compute/gfql_validate.py index 7a203b7584..48e903e6ba 100644 --- a/graphistry/compute/gfql_validate.py +++ b/graphistry/compute/gfql_validate.py @@ -47,6 +47,37 @@ def _serialize_error(exc: Exception, *, stage: str) -> Dict[str, Any]: return out +def _raise_diagnostics( + diagnostics: List[Dict[str, 
Any]], + *, + query_type: str, + language: str, +) -> None: + first = diagnostics[0] + code = cast(Any, first.get("code")) or ErrorCode.E108 + message = cast(Any, first.get("message")) or "GFQL validation failed" + if len(diagnostics) > 1: + message = f"GFQL validation failed with {len(diagnostics)} errors; first: {message}" + extra = { + key: value + for key, value in first.items() + if key not in {"code", "message", "field", "value", "suggestion", "operation_index"} + } + exc_cls = GFQLSyntaxError if code == ErrorCode.E107 else GFQLValidationError + raise exc_cls( + code, + message, + field=cast(Optional[str], first.get("field")), + value=first.get("value"), + suggestion=cast(Optional[str], first.get("suggestion")), + operation_index=cast(Optional[int], first.get("operation_index")), + diagnostics=diagnostics, + query_type=query_type, + language=language, + **extra, + ) + + def _build_schema_catalog(g: Plottable, *, strict: bool) -> GraphSchemaCatalog: node_columns: Tuple[str, ...] = tuple() edge_columns: Tuple[str, ...] 
= tuple() @@ -176,11 +207,14 @@ def _validate_non_string_query( if not schema: if collect_all: errors = cast(Any, coerced).validate(collect_all=True) or [] + diagnostics = [cast(Any, e).to_dict() for e in errors] + if diagnostics: + _raise_diagnostics(diagnostics, query_type="chain", language="gfql") return { - "ok": len(errors) == 0, + "ok": True, "query_type": "chain", "language": "gfql", - "diagnostics": [cast(Any, e).to_dict() for e in errors], + "diagnostics": [], } cast(Any, coerced).validate(collect_all=False) return { @@ -191,11 +225,14 @@ def _validate_non_string_query( } if collect_all: errors = validate_chain_schema(g, coerced.chain, collect_all=True) or [] + diagnostics = [cast(Any, e).to_dict() for e in errors] + if diagnostics: + _raise_diagnostics(diagnostics, query_type="chain", language="gfql") return { - "ok": len(errors) == 0, + "ok": True, "query_type": "chain", "language": "gfql", - "diagnostics": [cast(Any, e).to_dict() for e in errors], + "diagnostics": [], } validate_chain_schema(g, coerced.chain, collect_all=False) return { @@ -212,16 +249,19 @@ def _validate_non_string_query( # surface without introducing a new schema simulator. 
if collect_all: errors = cast(Any, coerced).validate(collect_all=True) or [] + diagnostics = [cast(Any, e).to_dict() for e in errors] + if diagnostics: + _raise_diagnostics(diagnostics, query_type="single", language="gfql") return { - "ok": len(errors) == 0, - "query_type": "dag" if isinstance(coerced, ASTLet) else "single", + "ok": True, + "query_type": "single", "language": "gfql", - "diagnostics": [cast(Any, e).to_dict() for e in errors], + "diagnostics": [], } cast(Any, coerced).validate(collect_all=False) return { "ok": True, - "query_type": "dag" if isinstance(coerced, ASTLet) else "single", + "query_type": "single", "language": "gfql", "diagnostics": [], } @@ -265,11 +305,14 @@ def _validate_let_query( if schema: for value in let_query.bindings.values(): errors.extend(_validate_let_binding_schema_errors(g, value)) + diagnostics = [cast(Any, e).to_dict() for e in errors] + if diagnostics: + _raise_diagnostics(diagnostics, query_type="dag", language="gfql") return { - "ok": len(errors) == 0, + "ok": True, "query_type": "dag", "language": "gfql", - "diagnostics": [cast(Any, e).to_dict() for e in errors], + "diagnostics": [], } cast(Any, let_query).validate(collect_all=False) @@ -299,7 +342,8 @@ def gfql_validate( ) -> Dict[str, Any]: """Validate a GFQL/Cypher query without executing it. - Returns structured diagnostics and never dispatches query execution operators. + Raises structured GFQL exceptions on validation failures and never dispatches + query execution operators. 
""" try: if isinstance(query, str): @@ -322,49 +366,12 @@ def gfql_validate( if params is not None: raise ValueError("params is only supported when query is a string") return _validate_non_string_query(g, query, where=where, collect_all=collect_all, schema=schema) + except GFQLValidationError: + raise except Exception as exc: - return { - "ok": False, - "query_type": "chain" if isinstance(query, str) else "single", - "language": "cypher" if isinstance(query, str) else "gfql", - "diagnostics": [_serialize_error(exc, stage="validate")], - } - - -def raise_first_diagnostic(report: Mapping[str, Any]) -> None: - diagnostics = report.get("diagnostics") - if not isinstance(diagnostics, list) or len(diagnostics) == 0: - raise GFQLValidationError( - ErrorCode.E108, - "GFQL validation failed without diagnostic details", - language=cast(Any, report.get("language")), - ) - - first = diagnostics[0] - if not isinstance(first, dict): - raise GFQLValidationError( - ErrorCode.E108, - "GFQL validation failed with invalid diagnostic payload", - value=first, - language=cast(Any, report.get("language")), + diagnostic = _serialize_error(exc, stage="validate") + _raise_diagnostics( + [diagnostic], + query_type="chain" if isinstance(query, str) else "single", + language="cypher" if isinstance(query, str) else "gfql", ) - - code = cast(Any, first.get("code")) or ErrorCode.E108 - message = cast(Any, first.get("message")) or "GFQL validation failed" - - # Keep core structured keys explicit and pass the rest through as context. 
- extra = { - key: value - for key, value in first.items() - if key not in {"code", "message", "field", "value", "suggestion", "operation_index"} - } - exc_cls = GFQLSyntaxError if code == ErrorCode.E107 else GFQLValidationError - raise exc_cls( - code, - message, - field=cast(Optional[str], first.get("field")), - value=first.get("value"), - suggestion=cast(Optional[str], first.get("suggestion")), - operation_index=cast(Optional[int], first.get("operation_index")), - **extra, - ) diff --git a/graphistry/tests/compute/test_gfql.py b/graphistry/tests/compute/test_gfql.py index d3675060c4..048a62cb65 100644 --- a/graphistry/tests/compute/test_gfql.py +++ b/graphistry/tests/compute/test_gfql.py @@ -261,16 +261,10 @@ def test_gfql_non_string_rejects_language_and_params(self): def test_gfql_validate_true_runs_preflight_before_compile(self): g = _mk_people_company_graph3() - fake_report = { - "ok": False, - "query_type": "chain", - "language": "cypher", - "diagnostics": [ - {"code": ErrorCode.E108, "message": "synthetic preflight failure", "stage": "validate"} - ], - } - - with patch("graphistry.compute.gfql_unified.gfql_preflight_validate", return_value=fake_report): + with patch( + "graphistry.compute.gfql_unified.gfql_preflight_validate", + side_effect=GFQLValidationError(ErrorCode.E108, "synthetic preflight failure"), + ): with patch( "graphistry.compute.gfql_unified._compile_string_query", side_effect=AssertionError("compile should not run when preflight fails"), diff --git a/graphistry/tests/compute/test_gfql_validate_only.py b/graphistry/tests/compute/test_gfql_validate_only.py index 5c64c90c85..097f84f99d 100644 --- a/graphistry/tests/compute/test_gfql_validate_only.py +++ b/graphistry/tests/compute/test_gfql_validate_only.py @@ -1,7 +1,9 @@ import pandas as pd +import pytest from graphistry.compute.ast import ASTLet, n from graphistry.compute.chain import Chain +from graphistry.compute.exceptions import ErrorCode, GFQLSyntaxError, GFQLValidationError from 
graphistry.tests.test_compute import CGFull @@ -35,11 +37,12 @@ def test_gfql_validate_chain_success(): def test_gfql_validate_chain_failure_collect_all(): g = _mk_graph() - report = g.gfql_validate([n({"missing_col": "x"})], collect_all=True) - assert report["ok"] is False - assert report["language"] == "gfql" - assert report["diagnostics"] - assert report["diagnostics"][0]["code"] == "column-not-found" + with pytest.raises(GFQLValidationError) as exc_info: + g.gfql_validate([n({"missing_col": "x"})], collect_all=True) + assert exc_info.value.code == ErrorCode.E301 + diagnostics = exc_info.value.context.get("diagnostics") + assert isinstance(diagnostics, list) and diagnostics + assert diagnostics[0]["code"] == ErrorCode.E301 def test_gfql_validate_cypher_success(): @@ -57,11 +60,9 @@ def test_gfql_validate_cypher_success(): def test_gfql_validate_cypher_default_reports_schema_errors(): g = _mk_graph() - report = g.gfql_validate("MATCH (p:Employee) RETURN p.name AS name") - assert report["ok"] is False - assert report["language"] == "cypher" - assert report["diagnostics"] - assert report["diagnostics"][0]["code"] == "column-not-found" + with pytest.raises(GFQLValidationError) as exc_info: + g.gfql_validate("MATCH (p:Employee) RETURN p.name AS name") + assert exc_info.value.code == ErrorCode.E301 def test_gfql_validate_cypher_can_disable_strict_schema_checks(): @@ -74,11 +75,10 @@ def test_gfql_validate_cypher_can_disable_strict_schema_checks(): def test_gfql_validate_treats_all_strings_as_cypher(): g = _mk_graph() - report = g.gfql_validate("hello world not cypher") - assert report["ok"] is False - assert report["language"] == "cypher" - assert report["diagnostics"] - assert "Got str" not in report["diagnostics"][0]["message"] + with pytest.raises(GFQLSyntaxError) as exc_info: + g.gfql_validate("hello world not cypher") + assert exc_info.value.code == ErrorCode.E107 + assert "Got str" not in str(exc_info.value) def 
test_gfql_validate_does_not_execute_query_operators(monkeypatch): @@ -105,12 +105,23 @@ def test_gfql_validate_let_success(): def test_gfql_validate_let_schema_failure(): g = _mk_graph() query = ASTLet({"people": Chain([n({"missing_col": "x"})])}) - report = g.gfql_validate(query, collect_all=True) - assert report["ok"] is False - assert report["language"] == "gfql" - assert report["query_type"] == "dag" - assert report["diagnostics"] - assert report["diagnostics"][0]["code"] == "column-not-found" + with pytest.raises(GFQLValidationError) as exc_info: + g.gfql_validate(query, collect_all=True) + assert exc_info.value.code == ErrorCode.E301 + assert exc_info.value.context.get("query_type") == "dag" + + +def test_gfql_validate_exception_payload_is_llm_friendly(): + g = _mk_graph() + with pytest.raises(GFQLValidationError) as exc_info: + g.gfql_validate([n({"missing_col": "x"})], collect_all=True) + payload = exc_info.value.to_dict() + assert payload["code"] == ErrorCode.E301 + assert payload["query_type"] == "chain" + assert payload["language"] == "gfql" + diagnostics = payload.get("diagnostics") + assert isinstance(diagnostics, list) and diagnostics + assert diagnostics[0]["code"] == ErrorCode.E301 def test_gfql_validate_chain_without_bound_tables_is_structural_only(): From 906f5b70d57394894ca0900ab666b0aec2103772 Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Tue, 5 May 2026 20:22:10 -0700 Subject: [PATCH 16/17] Fix mypy NoReturn inference in gfql_validate --- graphistry/compute/gfql_validate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/graphistry/compute/gfql_validate.py b/graphistry/compute/gfql_validate.py index 48e903e6ba..65d7096a73 100644 --- a/graphistry/compute/gfql_validate.py +++ b/graphistry/compute/gfql_validate.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import Any, Dict, List, Literal, Mapping, Optional, Sequence, Tuple, Union, cast +from typing import Any, Dict, List, Literal, Mapping, NoReturn, 
Optional, Sequence, Tuple, Union, cast from graphistry.Plottable import Plottable from graphistry.compute.ast import ASTLet, ASTObject, ASTNode, ASTEdge, ASTCall, ASTRef, from_json @@ -52,7 +52,7 @@ def _raise_diagnostics( *, query_type: str, language: str, -) -> None: +) -> NoReturn: first = diagnostics[0] code = cast(Any, first.get("code")) or ErrorCode.E108 message = cast(Any, first.get("message")) or "GFQL validation failed" From cbef5fdc33b40cd0f74c15c7eac413987e1b9378 Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Tue, 5 May 2026 20:30:10 -0700 Subject: [PATCH 17/17] Docs: make cypher preflight example schema-agnostic --- docs/source/gfql/cypher.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/gfql/cypher.rst b/docs/source/gfql/cypher.rst index d6b0fbfbec..cd5ec8c9ca 100644 --- a/docs/source/gfql/cypher.rst +++ b/docs/source/gfql/cypher.rst @@ -445,7 +445,7 @@ execution, start with the bound-graph inline preflight APIs: .. code-block:: python g.gfql_validate( - "MATCH (p:Person) RETURN p.name AS name ORDER BY name DESC LIMIT $top_n", + "MATCH (p) RETURN p.name AS name ORDER BY name DESC LIMIT $top_n", params={"top_n": 5}, # strict=True is the default for local bound-graph preflight )