From 495ed8c2a7d9383850b4be08a2a9de8695b3009d Mon Sep 17 00:00:00 2001 From: Amin Ghadersohi Date: Mon, 4 May 2026 18:59:41 +0000 Subject: [PATCH 1/4] fix(mcp): relax column name regex, improve generate_chart validation errors and examples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove overly strict regex pattern from ColumnRef.name, FilterConfig.column, and BigNumberChartConfig.temporal_column — sanitize_name/sanitize_column already handle XSS/SQL injection; the pattern rejected valid column names like "1Q_revenue" (digit-prefixed) or "order-date" (hyphenated) - Extend generate_chart docstring with usage examples for all supported chart types: pie, big_number (with/without trendline), pivot_table, mixed_timeseries, handlebars - Improve _enhance_validation_error fallback in SchemaValidator to produce type-specific, actionable messages instead of raw pydantic error strings (extract _format_single_error helper to reduce cyclomatic complexity) - Add tests verifying digit-prefixed/hyphenated column names now pass, and that XSS/SQL injection is still blocked by sanitize_name() --- superset/mcp_service/chart/schemas.py | 15 +++- .../mcp_service/chart/tool/generate_chart.py | 80 +++++++++++++++++++ .../chart/validation/schema_validator.py | 61 +++++++++++--- 3 files changed, 142 insertions(+), 14 deletions(-) diff --git a/superset/mcp_service/chart/schemas.py b/superset/mcp_service/chart/schemas.py index 1fdb4f43ab6d..32dc0c586bb6 100644 --- a/superset/mcp_service/chart/schemas.py +++ b/superset/mcp_service/chart/schemas.py @@ -669,7 +669,10 @@ class ColumnRef(BaseModel): ..., min_length=1, max_length=255, - pattern=r"^[a-zA-Z0-9_][a-zA-Z0-9_\s\-\.]*$", + # No regex pattern: sanitize_name() already blocks XSS/SQL injection; + # many valid column names (digit-prefixed, locale chars, etc.) would + # be rejected by a strict pattern while posing no security risk. + # Use get_dataset_info to find exact column names. validation_alias=AliasChoices("name", "column_name"), ) label: str | None = Field(None, max_length=500) @@ -743,7 +746,10 @@ class FilterConfig(BaseModel): ..., min_length=1, max_length=255, - pattern=r"^[a-zA-Z0-9_][a-zA-Z0-9_\s\-\.]*$", + # No regex pattern: sanitize_name() already blocks XSS/SQL injection; + # many valid column names (digit-prefixed, locale chars, etc.) would + # be rejected by a strict pattern while posing no security risk. + # Use get_dataset_info to find exact column names. validation_alias=AliasChoices("column", "col"), ) op: Literal[ @@ -1082,7 +1088,10 @@ class BigNumberChartConfig(UnknownFieldCheckMixin): ), min_length=1, max_length=255, - pattern=r"^[a-zA-Z0-9_][a-zA-Z0-9_\s\-\.]*$", + # No regex pattern: sanitize_name() already blocks XSS/SQL injection; + # many valid column names (digit-prefixed, locale chars, etc.) would + # be rejected by a strict pattern while posing no security risk. + # Use get_dataset_info to find exact column names. ) time_grain: TimeGrain | None = Field( None, diff --git a/superset/mcp_service/chart/tool/generate_chart.py b/superset/mcp_service/chart/tool/generate_chart.py index 646ac4d4c2a7..366685d22d3c 100644 --- a/superset/mcp_service/chart/tool/generate_chart.py +++ b/superset/mcp_service/chart/tool/generate_chart.py @@ -200,6 +200,86 @@ async def generate_chart( # noqa: C901 } ``` + + Example usage for Pie chart: + ```json + { + "dataset_id": 123, + "config": { + "chart_type": "pie", + "dimension": {"name": "product_category"}, + "metric": {"name": "revenue", "aggregate": "SUM"}, + "donut": false + } + } + ``` + + Example usage for Big Number (no trendline): + ```json + { + "dataset_id": 123, + "config": { + "chart_type": "big_number", + "metric": {"name": "total_sales", "aggregate": "SUM"} + } + } + ``` + + Example usage for Big Number with trendline: + ```json + { + "dataset_id": 123, + "config": { + "chart_type": "big_number", + "metric": {"name": "revenue", "aggregate": "SUM"}, + "temporal_column": "order_date", + "time_grain": "P1M", + "show_trendline": true + } + } + ``` + + Example usage for Pivot Table: + ```json + { + "dataset_id": 123, + "config": { + "chart_type": "pivot_table", + "rows": [{"name": "region"}], + "columns": [{"name": "product_category"}], + "metrics": [{"name": "revenue", "aggregate": "SUM"}] + } + } + ``` + + Example usage for Mixed Timeseries: + ```json + { + "dataset_id": 123, + "config": { + "chart_type": "mixed_timeseries", + "x": {"name": "order_date"}, + "y": [{"name": "revenue", "aggregate": "SUM"}], + "primary_kind": "line", + "y_secondary": [{"name": "order_count", "aggregate": "COUNT"}], + "secondary_kind": "bar" + } + } + ``` + + Example usage for Handlebars: + ```json + { + "dataset_id": 123, + "config": { + "chart_type": "handlebars", + "handlebars_template": "{{#each data}}{{this.name}}{{/each}}", + "groupby": [{"name": "product"}], + "metrics": [{"name": "revenue", "aggregate": "SUM"}] + } + } + ``` + Example usage for Table chart: ```json { diff --git a/superset/mcp_service/chart/validation/schema_validator.py b/superset/mcp_service/chart/validation/schema_validator.py index 7cae450ff599..dc0eec9958d6 100644 --- a/superset/mcp_service/chart/validation/schema_validator.py +++ b/superset/mcp_service/chart/validation/schema_validator.py @@ -525,6 +525,38 @@ def _pre_validate_mixed_timeseries_config( return True, None + @staticmethod + def _format_single_error(err: Dict[str, Any]) -> tuple[str, str]: + """Return (detail_message, optional_suggestion) for one pydantic error.""" + loc_parts = [str(p) for p in err.get("loc", [])] + loc = " -> ".join(loc_parts) + msg = err.get("msg", "Validation failed") + err_type = err.get("type", "") + ctx = err.get("ctx", {}) or {} + field = loc_parts[-1] if loc_parts else "field" + + if err_type == "string_pattern_mismatch": + return ( + f"'{field}' value contains disallowed characters. " + "Column names must not contain HTML, script tags, or SQL " + "injection patterns. Use the exact column name from your dataset.", + "Use get_dataset_info to find exact column names", + ) + if err_type == "literal_error": + expected = ctx.get("expected", "") + return f"'{field}' has an invalid value. Expected one of: {expected}", "" + if err_type == "missing": + return ( + f"Required field '{field}' is missing", + "Check the chart_type examples in the tool description", + ) + if err_type == "value_error": + return ( + f"{loc}: {msg}", + "Use get_dataset_info to verify column names and types", + ) + return f"{loc}: {msg}", "" + @staticmethod def _enhance_validation_error( error: PydanticValidationError, request_data: Dict[str, Any] @@ -609,22 +641,29 @@ def _enhance_validation_error( error_code="BIG_NUMBER_VALIDATION_ERROR", ) - # Default enhanced error + # Default enhanced error: build actionable per-field messages error_details = [] - for err in errors[:3]: # Show first 3 errors - loc = " -> ".join(str(location) for location in err.get("loc", [])) - msg = err.get("msg", "Validation failed") - error_details.append(f"{loc}: {msg}") + extra_suggestions: list[str] = [] + for err in errors[:5]: # Surface up to 5 errors + detail, suggestion = SchemaValidator._format_single_error(err) + error_details.append(detail) + if suggestion: + extra_suggestions.append(suggestion) return ChartGenerationError( error_type="validation_error", message="Chart configuration validation failed", details="; ".join(error_details), - suggestions=[ - "Check that all required fields are present", - "Ensure field types match the schema", - "Use get_dataset_info to verify column names", - "Refer to the API documentation for field requirements", - ], + suggestions=list( + dict.fromkeys( + [ + "Check that all required fields are present", + "Ensure field types match the schema", + "Use get_dataset_info to verify column names", + "Refer to the API documentation for field requirements", + ] + + extra_suggestions + ) + ), error_code="VALIDATION_ERROR", ) From 52189106ce619fddec3090593691da04195cc317 Mon Sep 17 00:00:00 2001 From: Amin Ghadersohi Date: Mon, 4 May 2026 19:00:11 +0000 Subject: [PATCH 2/4] test(mcp): add tests for relaxed column name validation in generate_chart --- .../mcp_service/chart/test_chart_schemas.py | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/tests/unit_tests/mcp_service/chart/test_chart_schemas.py b/tests/unit_tests/mcp_service/chart/test_chart_schemas.py index 1da9d1bc3a74..5ae7a38fc4ac 100644 --- a/tests/unit_tests/mcp_service/chart/test_chart_schemas.py +++ b/tests/unit_tests/mcp_service/chart/test_chart_schemas.py @@ -778,3 +778,85 @@ def test_client_warnings_discarded_even_when_server_also_warns(self) -> None: assert len(req.sanitization_warnings) == 1 assert "chart_name" in req.sanitization_warnings[0] assert "injected" not in req.sanitization_warnings[0] + + +class TestColumnRefNameRelaxedPattern: + """ColumnRef.name no longer enforces a strict regex pattern. + + Many valid database column names were previously rejected: + - Names starting with a digit (e.g. "1Q_revenue") + - Names with locale-specific characters + The field_validator sanitize_name() still blocks XSS and SQL injection. + """ + + def test_digit_prefixed_name_accepted(self) -> None: + """Column names starting with a digit must now be accepted.""" + col = ColumnRef(name="1Q_revenue") + assert col.name == "1Q_revenue" + + def test_name_with_hyphen_accepted(self) -> None: + col = ColumnRef(name="order-date") + assert col.name == "order-date" + + def test_name_with_dot_accepted(self) -> None: + col = ColumnRef(name="schema.column") + assert col.name == "schema.column" + + def test_name_with_spaces_accepted(self) -> None: + col = ColumnRef(name="Total Revenue") + assert col.name == "Total Revenue" + + def test_xss_attempt_blocked(self) -> None: + """sanitize_name() still blocks XSS even without the regex.""" + with pytest.raises(ValidationError): + ColumnRef(name="") + + def test_sql_keyword_blocked(self) -> None: + """check_sql_keywords=True still blocks pure SQL statements.""" + with pytest.raises(ValidationError): + ColumnRef(name="1; DROP TABLE users; --") + + def test_empty_name_blocked(self) -> None: + with pytest.raises(ValidationError): + ColumnRef(name="") + + def test_table_chart_with_digit_prefixed_column(self) -> None: + """End-to-end: digit-prefixed column passes through GenerateChartRequest.""" + req = GenerateChartRequest( + dataset_id=1, + config={ + "chart_type": "table", + "columns": [ + {"name": "1Q_revenue"}, + {"name": "product_name"}, + ], + }, + ) + assert req.config.chart_type == "table" + + def test_xy_chart_with_hyphenated_column(self) -> None: + req = GenerateChartRequest( + dataset_id=1, + config={ + "chart_type": "xy", + "x": {"name": "order-date"}, + "y": [{"name": "1Q-revenue", "aggregate": "SUM"}], + }, + ) + assert req.config.chart_type == "xy" + + +class TestFilterConfigColumnRelaxedPattern: + """FilterConfig.column no longer enforces a strict regex pattern.""" + + def test_digit_prefixed_filter_column_accepted(self) -> None: + from superset.mcp_service.chart.schemas import FilterConfig + + f = FilterConfig(column="1Q_flag", op="=", value="active") + assert f.column == "1Q_flag" + + def test_hyphenated_filter_column_accepted(self) -> None: + from superset.mcp_service.chart.schemas import FilterConfig + + f = FilterConfig(column="order-status", op="=", value="shipped") + assert f.column == "order-status" From 38972f0150f86f99cea17f85d1371722f6fd50ac Mon Sep 17 00:00:00 2001 From: Amin Ghadersohi Date: Tue, 5 May 2026 06:21:49 +0000 Subject: [PATCH 3/4] fix(mcp): address review feedback on generate_chart schema rigidity PR - FilterConfig.column: add check_sql_keywords=True to sanitize_column (Copilot review: sanitize_column was missing SQL keyword checking) - BigNumberChartConfig.temporal_column: add sanitize_temporal_column field_validator using sanitize_user_input with check_sql_keywords=True (Copilot review: no validator after regex removal left field unprotected) - generate_chart docstring IMPORTANT: list all chart types, not just xy/table (Copilot review: IMPORTANT section was misleading after adding more examples) - Fix test_xss_attempt_blocked: nh3 strips HTML tags instead of rejecting, so rename to test_xss_tags_are_stripped (asserts tag is removed) and add test_event_handler_injection_blocked (on...= patterns ARE rejected) - Fix _format_single_error literal_error: preserve pydantic 'Input should be' message instead of replacing with custom format (broke existing test test_non_value_error_pydantic_body_is_surfaced) - Add test_sql_injection_in_filter_column_blocked to verify FilterConfig now rejects SQL injection column names --- superset/mcp_service/chart/schemas.py | 20 ++++++++++++++----- .../mcp_service/chart/tool/generate_chart.py | 3 ++- .../chart/validation/schema_validator.py | 6 +++--- .../mcp_service/chart/test_chart_schemas.py | 20 ++++++++++++++++--- 4 files changed, 37 insertions(+), 12 deletions(-) diff --git a/superset/mcp_service/chart/schemas.py b/superset/mcp_service/chart/schemas.py index 32dc0c586bb6..9632bec2b23f 100644 --- a/superset/mcp_service/chart/schemas.py +++ b/superset/mcp_service/chart/schemas.py @@ -781,7 +781,9 @@ def sanitize_column(cls, v: str) -> str: """Sanitize filter column name to prevent injection attacks.""" # sanitize_user_input raises ValueError when allow_empty=False (default) # so the return value is guaranteed to be a non-None str - return sanitize_user_input(v, "Filter column", max_length=255) # type: ignore[return-value] + return sanitize_user_input( # type: ignore[return-value] + v, "Filter column", max_length=255, check_sql_keywords=True + ) @field_validator("value") @classmethod @@ -1088,11 +1090,19 @@ class BigNumberChartConfig(UnknownFieldCheckMixin): ), min_length=1, max_length=255, - # No regex pattern: sanitize_name() already blocks XSS/SQL injection; - # many valid column names (digit-prefixed, locale chars, etc.) would - # be rejected by a strict pattern while posing no security risk. - # Use get_dataset_info to find exact column names. + # No regex pattern — see field description above. ) + + @field_validator("temporal_column") + @classmethod + def sanitize_temporal_column(cls, v: str | None) -> str | None: + """Sanitize temporal column name to prevent XSS and SQL injection.""" + if v is None: + return None + return sanitize_user_input( # type: ignore[return-value] + v, "Temporal column", max_length=255, check_sql_keywords=True + ) + time_grain: TimeGrain | None = Field( None, description=( diff --git a/superset/mcp_service/chart/tool/generate_chart.py b/superset/mcp_service/chart/tool/generate_chart.py index 366685d22d3c..697698bbda03 100644 --- a/superset/mcp_service/chart/tool/generate_chart.py +++ b/superset/mcp_service/chart/tool/generate_chart.py @@ -175,7 +175,8 @@ async def generate_chart( # noqa: C901 - Set save_chart=True to permanently save the chart - LLM clients MUST display returned chart URL to users - Use numeric dataset ID or UUID (NOT schema.table_name format) - - MUST include chart_type in config (either 'xy' or 'table') + - MUST include chart_type in config (one of: 'xy', 'table', 'pie', + 'big_number', 'pivot_table', 'mixed_timeseries', 'handlebars') IMPORTANT: The 'chart_type' field in the config is a DISCRIMINATOR that determines which chart configuration schema to use. It MUST be included and MUST match the diff --git a/superset/mcp_service/chart/validation/schema_validator.py b/superset/mcp_service/chart/validation/schema_validator.py index dc0eec9958d6..5fc5af709385 100644 --- a/superset/mcp_service/chart/validation/schema_validator.py +++ b/superset/mcp_service/chart/validation/schema_validator.py @@ -532,7 +532,6 @@ def _format_single_error(err: Dict[str, Any]) -> tuple[str, str]: loc = " -> ".join(loc_parts) msg = err.get("msg", "Validation failed") err_type = err.get("type", "") - ctx = err.get("ctx", {}) or {} field = loc_parts[-1] if loc_parts else "field" if err_type == "string_pattern_mismatch": @@ -543,8 +542,9 @@ def _format_single_error(err: Dict[str, Any]) -> tuple[str, str]: "Use get_dataset_info to find exact column names", ) if err_type == "literal_error": - expected = ctx.get("expected", "") - return f"'{field}' has an invalid value. Expected one of: {expected}", "" + # Preserve the pydantic message ("Input should be ...") which is + # already human-readable; just prefix with the field name for context. + return f"'{field}': {msg}", "" if err_type == "missing": return ( f"Required field '{field}' is missing", diff --git a/tests/unit_tests/mcp_service/chart/test_chart_schemas.py b/tests/unit_tests/mcp_service/chart/test_chart_schemas.py index 5ae7a38fc4ac..68035308a141 100644 --- a/tests/unit_tests/mcp_service/chart/test_chart_schemas.py +++ b/tests/unit_tests/mcp_service/chart/test_chart_schemas.py @@ -806,10 +806,17 @@ def test_name_with_spaces_accepted(self) -> None: col = ColumnRef(name="Total Revenue") assert col.name == "Total Revenue" - def test_xss_attempt_blocked(self) -> None: - """sanitize_name() still blocks XSS even without the regex.""" + def test_xss_tags_are_stripped(self) -> None: + """sanitize_name() strips HTML tags via nh3 rather than rejecting them. + The dangerous payload is neutralized; the safe text content is preserved.""" + col = ColumnRef(name="") + assert "") + ColumnRef(name="col onclick=alert(1)") def test_sql_keyword_blocked(self) -> None: """check_sql_keywords=True still blocks pure SQL statements.""" @@ -860,3 +867,10 @@ def test_hyphenated_filter_column_accepted(self) -> None: f = FilterConfig(column="order-status", op="=", value="shipped") assert f.column == "order-status" + + def test_sql_injection_in_filter_column_blocked(self) -> None: + """FilterConfig.sanitize_column uses check_sql_keywords=True.""" + from superset.mcp_service.chart.schemas import FilterConfig + + with pytest.raises(ValidationError): + FilterConfig(column="col; DROP TABLE users; --", op="=", value="x") From a66b8d66472cc854dd7b6463003766cc72477965 Mon Sep 17 00:00:00 2001 From: Amin Ghadersohi Date: Tue, 5 May 2026 18:23:26 +0000 Subject: [PATCH 4/4] fix(mcp): remove unused type-ignore, fix test_script_tag assertion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove unused 'type: ignore[return-value]' from sanitize_temporal_column (mypy correctly infers the return type; comment was unnecessary) - Fix test_xss_tags_are_stripped → test_script_tag_blocked: nh3 strips the entire script element including its content, leaving an empty string that the allow_empty=False guard then rejects with ValidationError --- superset/mcp_service/chart/schemas.py | 2 +- .../mcp_service/chart/test_chart_schemas.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/superset/mcp_service/chart/schemas.py b/superset/mcp_service/chart/schemas.py index 9632bec2b23f..b09b6dc0556e 100644 --- a/superset/mcp_service/chart/schemas.py +++ b/superset/mcp_service/chart/schemas.py @@ -1099,7 +1099,7 @@ def sanitize_temporal_column(cls, v: str | None) -> str | None: """Sanitize temporal column name to prevent XSS and SQL injection.""" if v is None: return None - return sanitize_user_input( # type: ignore[return-value] + return sanitize_user_input( v, "Temporal column", max_length=255, check_sql_keywords=True ) diff --git a/tests/unit_tests/mcp_service/chart/test_chart_schemas.py b/tests/unit_tests/mcp_service/chart/test_chart_schemas.py index 68035308a141..19d06601912f 100644 --- a/tests/unit_tests/mcp_service/chart/test_chart_schemas.py +++ b/tests/unit_tests/mcp_service/chart/test_chart_schemas.py @@ -806,12 +806,12 @@ def test_name_with_spaces_accepted(self) -> None: col = ColumnRef(name="Total Revenue") assert col.name == "Total Revenue" - def test_xss_tags_are_stripped(self) -> None: - """sanitize_name() strips HTML tags via nh3 rather than rejecting them. - The dangerous payload is neutralized; the safe text content is preserved.""" - col = ColumnRef(name="") - assert "") def test_event_handler_injection_blocked(self) -> None: """sanitize_name() rejects event-handler injection patterns (on...=)."""