From ab5bf0d0503819d9831769a01a37ee765e1c5cea Mon Sep 17 00:00:00 2001 From: Mike Bridge Date: Mon, 4 May 2026 09:54:25 -0600 Subject: [PATCH 01/14] refactor(db): composite PK on M2M association tables (sc-105349) Replace synthetic id INTEGER PRIMARY KEY with composite PRIMARY KEY (fk1, fk2) on the eight pure-junction tables: dashboard_roles, dashboard_slices, dashboard_user, report_schedule_user, rls_filter_roles, rls_filter_tables, slice_user, sqlatable_user. The redundant UNIQUE(fk1, fk2) on dashboard_slices and report_schedule_user is dropped (subsumed by the new PK). Migration handles dialect quirks: copy_from for tables with pre-existing UNIQUE (so SQLite's anonymous-constraint reflection doesn't matter), wrapped- subquery dedupe for MySQL (ERROR 1093), sa.Identity(always=False) on downgrade to backfill the restored id column without NOT NULL violations, and distinct PK names per direction (pk_ on upgrade,
_pkey on downgrade) to avoid round-trip index-name collisions on Postgres. ORM Table() definitions updated to match. UPDATING.md entry added with operator runbook (BI-tool impact, pre-flight inventory queries, dedupe-row- loss notice, pg_dump workaround, FK-NOT-NULL downgrade asymmetry note). Tests: 8 schema-shape assertions (post-upgrade), 8 duplicate-rejection unit tests, 8 distinct-pair sanity tests, 1 round-trip + idempotency test (in-memory SQLite via Alembic MigrationContext). Continuum-restore verification against the new shape is out of scope for this PR; it is the responsibility of the versioning epic (sc-103156). Co-Authored-By: Claude Opus 4.7 (1M context) --- UPDATING.md | 47 +++ superset/connectors/sqla/models.py | 35 ++- ...3611e32_composite_pk_association_tables.py | 289 ++++++++++++++++++ superset/models/dashboard.py | 37 ++- superset/models/slice.py | 15 +- superset/reports/models.py | 6 +- .../composite_pk_association_tables__tests.py | 131 ++++++++ .../composite_pk_round_trip__tests.py | 168 ++++++++++ .../composite_pk_association_tables_test.py | 132 ++++++++ 9 files changed, 833 insertions(+), 27 deletions(-) create mode 100644 superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py create mode 100644 tests/integration_tests/migrations/composite_pk_association_tables__tests.py create mode 100644 tests/integration_tests/migrations/composite_pk_round_trip__tests.py create mode 100644 tests/unit_tests/migrations/composite_pk_association_tables_test.py diff --git a/UPDATING.md b/UPDATING.md index 3d42f2b3d4e1..1ece31a1c6fd 100644 --- a/UPDATING.md +++ b/UPDATING.md @@ -310,6 +310,53 @@ See `superset/mcp_service/PRODUCTION.md` for deployment guides. } ``` +### Composite primary keys on many-to-many association tables + +The eight M:N association tables listed below have been changed from a synthetic surrogate `id INTEGER PRIMARY KEY` to a composite `PRIMARY KEY (fk1, fk2)` on the two foreign-key columns. 
The `id` column is dropped, and the two tables that previously carried a redundant `UNIQUE (fk1, fk2)` constraint have that constraint removed (it is now subsumed by the composite primary key). + +**Affected tables and their composite-PK column pairs:** + +| Table | Composite PK | +|---|---| +| `dashboard_roles` | `(dashboard_id, role_id)` | +| `dashboard_slices` | `(dashboard_id, slice_id)` | +| `dashboard_user` | `(user_id, dashboard_id)` | +| `report_schedule_user` | `(user_id, report_schedule_id)` | +| `rls_filter_roles` | `(role_id, rls_filter_id)` | +| `rls_filter_tables` | `(table_id, rls_filter_id)` | +| `slice_user` | `(user_id, slice_id)` | +| `sqlatable_user` | `(user_id, table_id)` | + +**Impact on external readers:** Any BI tool, custom report, backup script, or external integration that references these tables by their old surrogate `id` column (e.g., `SELECT id FROM dashboard_slices WHERE …`, `WHERE dashboard_slices.id IN (…)`) will break. Update such queries to project or filter on the FK pair (`dashboard_id, slice_id`) instead. The FK columns themselves are unchanged. + +**Pre-flight inventory queries.** Before applying the upgrade, operators are encouraged to run the queries below against their database to assess what the migration will change. Two classes of pre-existing data are not preserved by the migration: duplicate `(fk1, fk2)` rows (the migration keeps `MIN(id)` and deletes the rest) and rows with `NULL` in either FK column (the migration deletes them, since FK columns are promoted to `NOT NULL` for the composite PK). Compliance- or audit-sensitive operators should also `\copy` (Postgres) or `SELECT … INTO OUTFILE` (MySQL) the affected rows for their own records before upgrading. 
+ +```sql +-- Duplicate (fk1, fk2) pairs (the migration will keep MIN(id) per group, delete the rest) +SELECT dashboard_id, role_id, COUNT(*) FROM dashboard_roles GROUP BY dashboard_id, role_id HAVING COUNT(*) > 1; +SELECT dashboard_id, slice_id, COUNT(*) FROM dashboard_slices GROUP BY dashboard_id, slice_id HAVING COUNT(*) > 1; +SELECT user_id, dashboard_id, COUNT(*) FROM dashboard_user GROUP BY user_id, dashboard_id HAVING COUNT(*) > 1; +SELECT user_id, report_schedule_id, COUNT(*) FROM report_schedule_user GROUP BY user_id, report_schedule_id HAVING COUNT(*) > 1; +SELECT role_id, rls_filter_id, COUNT(*) FROM rls_filter_roles GROUP BY role_id, rls_filter_id HAVING COUNT(*) > 1; +SELECT table_id, rls_filter_id, COUNT(*) FROM rls_filter_tables GROUP BY table_id, rls_filter_id HAVING COUNT(*) > 1; +SELECT user_id, slice_id, COUNT(*) FROM slice_user GROUP BY user_id, slice_id HAVING COUNT(*) > 1; +SELECT user_id, table_id, COUNT(*) FROM sqlatable_user GROUP BY user_id, table_id HAVING COUNT(*) > 1; + +-- Rows with a NULL in either FK (the migration will delete these) +SELECT COUNT(*) FROM dashboard_roles WHERE dashboard_id IS NULL OR role_id IS NULL; +SELECT COUNT(*) FROM dashboard_slices WHERE dashboard_id IS NULL OR slice_id IS NULL; +SELECT COUNT(*) FROM dashboard_user WHERE user_id IS NULL OR dashboard_id IS NULL; +SELECT COUNT(*) FROM report_schedule_user WHERE user_id IS NULL OR report_schedule_id IS NULL; +SELECT COUNT(*) FROM rls_filter_roles WHERE role_id IS NULL OR rls_filter_id IS NULL; +SELECT COUNT(*) FROM rls_filter_tables WHERE table_id IS NULL OR rls_filter_id IS NULL; +SELECT COUNT(*) FROM slice_user WHERE user_id IS NULL OR slice_id IS NULL; +SELECT COUNT(*) FROM sqlatable_user WHERE user_id IS NULL OR table_id IS NULL; +``` + +**Restoring an old `pg_dump` (or equivalent) against the new schema.** A dump taken before the migration includes `INSERT` statements that populate the now-removed `id` column. 
Restoring such a dump against the post-migration schema will fail. The supported workaround is to dump only the schema and reference data, then re-create the M:N associations from application data after restore — for example with `pg_dump --exclude-table-data` (or per-table `--exclude-table-data=dashboard_slices` etc.) for the eight junction tables, restore the rest, then run a one-shot script that re-INSERTs `(fk1, fk2)` pairs derived from your application export. Operators who need to restore an old dump verbatim should restore against a pre-migration Superset and then re-run the upgrade. + +**Intentional downgrade asymmetry.** The migration's `downgrade()` restores the surrogate `id` column and (for `dashboard_slices` and `report_schedule_user`) the original `UNIQUE (fk1, fk2)` constraint, but it does **not** restore the original `NULL`-allowed state on the FK columns — they remain `NOT NULL`. This is intentional: under SQLAlchemy's `secondary=` semantics, a `NULL` in either FK column of a junction table is meaningless (it cannot participate in either side of the relationship). Operators downgrading are not expected to need this restored. The asymmetry is documented for completeness so that round-trip schema diffs are not mistaken for migration bugs. + ## 6.0.0 - [33055](https://github.com/apache/superset/pull/33055): Upgrades Flask-AppBuilder to 5.0.0. The AUTH_OID authentication type has been deprecated and is no longer available as an option in Flask-AppBuilder. OpenID (OID) is considered a deprecated authentication protocol - if you are using AUTH_OID, you will need to migrate to an alternative authentication method such as OAuth, LDAP, or database authentication before upgrading. - [34871](https://github.com/apache/superset/pull/34871): Fixed Jest test hanging issue from Ant Design v5 upgrade. MessageChannel is now mocked in test environment to prevent rc-overflow from causing Jest to hang. Test environment only - no production impact. 
diff --git a/superset/connectors/sqla/models.py b/superset/connectors/sqla/models.py index 6fb132e16137..b080b64aff49 100644 --- a/superset/connectors/sqla/models.py +++ b/superset/connectors/sqla/models.py @@ -1212,9 +1212,18 @@ def data(self) -> dict[str, Any]: sqlatable_user = DBTable( "sqlatable_user", metadata, - Column("id", Integer, primary_key=True), - Column("user_id", Integer, ForeignKey("ab_user.id", ondelete="CASCADE")), - Column("table_id", Integer, ForeignKey("tables.id", ondelete="CASCADE")), + Column( + "user_id", + Integer, + ForeignKey("ab_user.id", ondelete="CASCADE"), + primary_key=True, + ), + Column( + "table_id", + Integer, + ForeignKey("tables.id", ondelete="CASCADE"), + primary_key=True, + ), ) @@ -2146,17 +2155,25 @@ def text(self, clause: str) -> TextClause: RLSFilterRoles = DBTable( "rls_filter_roles", metadata, - Column("id", Integer, primary_key=True), - Column("role_id", Integer, ForeignKey("ab_role.id"), nullable=False), - Column("rls_filter_id", Integer, ForeignKey("row_level_security_filters.id")), + Column("role_id", Integer, ForeignKey("ab_role.id"), primary_key=True), + Column( + "rls_filter_id", + Integer, + ForeignKey("row_level_security_filters.id"), + primary_key=True, + ), ) RLSFilterTables = DBTable( "rls_filter_tables", metadata, - Column("id", Integer, primary_key=True), - Column("table_id", Integer, ForeignKey("tables.id")), - Column("rls_filter_id", Integer, ForeignKey("row_level_security_filters.id")), + Column("table_id", Integer, ForeignKey("tables.id"), primary_key=True), + Column( + "rls_filter_id", + Integer, + ForeignKey("row_level_security_filters.id"), + primary_key=True, + ), ) diff --git a/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py b/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py new file mode 100644 index 000000000000..2c841bc6171a --- /dev/null +++ 
b/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py @@ -0,0 +1,289 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""composite_pk_association_tables + +Replace the unused synthetic ``id INTEGER PRIMARY KEY`` on eight many-to-many +association tables with a composite primary key on the two FK columns. Drops +the now-redundant ``UniqueConstraint(fk1, fk2)`` on the two tables that +already carry one. Pre-flight: deletes rows with NULL FK values (six tables +allow them today) and any duplicate ``(fk1, fk2)`` rows. + +Motivated by SQLAlchemy-Continuum issue #129 (M2M restore against junction +tables with surrogate PKs); also closes the data-integrity hole where six +of the eight tables lacked DB-level uniqueness. + +Revision ID: 2bee73611e32 +Revises: ce6bd21901ab +Create Date: 2026-05-01 23:36:34.050058 + +""" + +import logging +from typing import NamedTuple + +import sqlalchemy as sa +from alembic import op +from sqlalchemy import inspect +from sqlalchemy.engine import Connection + +# revision identifiers, used by Alembic. 
+revision = "2bee73611e32" +down_revision = "ce6bd21901ab" + +logger = logging.getLogger("alembic.env") + + +class AssociationTable(NamedTuple): + """A junction table being converted from surrogate-id PK to composite-FK PK.""" + + name: str + fk1: str + fk2: str + + +# Order is alphabetical by table name; deterministic for review and bisection. +AFFECTED_TABLES: list[AssociationTable] = [ + AssociationTable("dashboard_roles", "dashboard_id", "role_id"), + AssociationTable("dashboard_slices", "dashboard_id", "slice_id"), + AssociationTable("dashboard_user", "user_id", "dashboard_id"), + AssociationTable("report_schedule_user", "user_id", "report_schedule_id"), + AssociationTable("rls_filter_roles", "role_id", "rls_filter_id"), + AssociationTable("rls_filter_tables", "table_id", "rls_filter_id"), + AssociationTable("slice_user", "user_id", "slice_id"), + AssociationTable("sqlatable_user", "user_id", "table_id"), +] + +# These two tables already declare ``UniqueConstraint(fk1, fk2)`` in the model; +# the composite PK subsumes it, so the migration drops the redundant constraint. +TABLES_WITH_PRE_EXISTING_UNIQUE: set[str] = { + "dashboard_slices", + "report_schedule_user", +} + +# Six tables whose FK columns are nullable today. Promoting an FK to a primary +# key column makes it NOT NULL, so any existing NULL-FK rows would block the +# PK-add. We delete them in pre-flight (a junction-table row with a NULL FK +# is meaningless under SQLAlchemy ``secondary=`` semantics anyway). +TABLES_WITH_NULLABLE_FKS: set[str] = { + "dashboard_slices", + "dashboard_user", + "rls_filter_roles", + "rls_filter_tables", + "slice_user", + "sqlatable_user", +} + + +def _check_no_external_fks_to_id(conn: Connection) -> None: + """Raise ``RuntimeError`` if any foreign key in the database references one + of the eight junction-table ``id`` columns. 
Uses SQLAlchemy's ``Inspector`` + for dialect-agnostic introspection across PostgreSQL, MySQL, and SQLite.""" + affected = {t.name for t in AFFECTED_TABLES} + insp = inspect(conn) + for table_name in insp.get_table_names(): + if table_name in affected: + continue + for fk in insp.get_foreign_keys(table_name): + if fk["referred_table"] in affected and "id" in fk["referred_columns"]: + raise RuntimeError( + f"Cannot drop synthetic id from {fk['referred_table']}: " + f"external FK {fk.get('name', '')} on {table_name} " + f"references {fk['referred_table']}({fk['referred_columns']}). " + f"Drop or migrate the referencing FK before applying this " + f"migration." + ) + + +def _delete_null_fk_rows(conn: Connection, t: AssociationTable) -> int: + """Delete rows where ``t.fk1`` or ``t.fk2`` is NULL on ``t.name``. + + Returns the deletion count. Called only on tables in + ``TABLES_WITH_NULLABLE_FKS``. Required because primary-key columns must be + NOT NULL; the PK-add downstream would fail with a cryptic constraint + violation if any NULL-FK rows survived. + """ + # Identifiers come from the AFFECTED_TABLES whitelist, not user input. + sql = sa.text( + f"DELETE FROM {t.name} WHERE {t.fk1} IS NULL OR {t.fk2} IS NULL" # noqa: S608 + ) + result = conn.execute(sql) + n = result.rowcount or 0 + if n: + logger.warning( + "Deleted %d row(s) with NULL FK from %s before composite-PK promotion", + n, + t.name, + ) + return n + + +def _dedupe_by_min_id(conn: Connection, t: AssociationTable) -> int: + """Delete duplicate ``(t.fk1, t.fk2)`` rows from ``t.name`` keeping ``MIN(id)``. + + Returns the deletion count. Uses the wrapped-subquery form for MySQL + portability — MySQL rejects ``DELETE FROM t WHERE id NOT IN (SELECT MIN(id) + FROM t GROUP BY ...)`` with ERROR 1093 unless the inner SELECT is wrapped + to force materialization. + """ + # Identifiers come from the AFFECTED_TABLES whitelist, not user input. 
+ sql = sa.text( + f"DELETE FROM {t.name} WHERE id NOT IN (" # noqa: S608 + f" SELECT keep_id FROM (" + f" SELECT MIN(id) AS keep_id FROM {t.name} " + f"GROUP BY {t.fk1}, {t.fk2}" + f" ) AS s" + f")" + ) + result = conn.execute(sql) + n = result.rowcount or 0 + if n: + logger.warning("Deduped %d duplicate row(s) from %s", n, t.name) + return n + + +def _assert_no_duplicates(conn: Connection, t: AssociationTable) -> None: + """Raise ``RuntimeError`` if any ``(t.fk1, t.fk2)`` duplicate group remains. + + Called after ``_dedupe_by_min_id`` to surface silent dialect-dependent + dedupe failures (e.g., a MySQL syntax issue) as an actionable error + before the PK-add fires with a less-helpful constraint-violation message. + """ + # Identifiers come from the AFFECTED_TABLES whitelist, not user input. + sql = sa.text( + f"SELECT COUNT(*) FROM (" # noqa: S608 + f" SELECT 1 FROM {t.name} GROUP BY {t.fk1}, {t.fk2} HAVING COUNT(*) > 1" + f") AS s" + ) + if remaining := conn.scalar(sql) or 0: + raise RuntimeError( + f"Dedupe failed for {t.name}: {remaining} duplicate " + f"({t.fk1}, {t.fk2}) groups remain after _dedupe_by_min_id. " + f"Check the dedupe SQL for dialect {conn.dialect.name}." + ) + + +def _build_pre_upgrade_table( + insp: sa.engine.reflection.Inspector, t: AssociationTable +) -> sa.Table: + """Build a ``Table`` object representing the pre-upgrade schema of ``t``, + explicitly *without* any redundant ``UniqueConstraint(t.fk1, t.fk2)``. + Used as ``copy_from`` to ``batch_alter_table`` so the rebuilt table + omits the unnamed UNIQUE constraint deterministically across dialects + (SQLite reflects unnamed UNIQUEs with ``name=None``, defeating the + standard ``batch_op.drop_constraint(name)`` path). + + Reflects column types and FK targets (with original FK constraint names + preserved) from the live database; only the redundant UNIQUE is omitted. 
+ """ + md = sa.MetaData() + fks_for_col: dict[str, list[dict]] = {} + for fk in insp.get_foreign_keys(t.name): + for col_name in fk["constrained_columns"]: + fks_for_col.setdefault(col_name, []).append(fk) + + cols: list[sa.Column] = [] + for c in insp.get_columns(t.name): + col_kwargs = {"nullable": c.get("nullable", True)} + if c["name"] == "id": + col_kwargs["primary_key"] = True + col_kwargs["autoincrement"] = True + fk_args = [] + for fk in fks_for_col.get(c["name"], []): + idx = fk["constrained_columns"].index(c["name"]) + target = f"{fk['referred_table']}.{fk['referred_columns'][idx]}" + options = {} + if fk.get("options", {}).get("ondelete"): + options["ondelete"] = fk["options"]["ondelete"] + if fk.get("name"): + options["name"] = fk["name"] + fk_args.append(sa.ForeignKey(target, **options)) + cols.append(sa.Column(c["name"], c["type"], *fk_args, **col_kwargs)) + return sa.Table(t.name, md, *cols) + + +def upgrade() -> None: + conn = op.get_bind() + _check_no_external_fks_to_id(conn) + insp = inspect(conn) + + for t in AFFECTED_TABLES: + if t.name in TABLES_WITH_NULLABLE_FKS: + _delete_null_fk_rows(conn, t) + _dedupe_by_min_id(conn, t) + _assert_no_duplicates(conn, t) + + # For the two tables with a pre-existing redundant UNIQUE + # (``dashboard_slices``, ``report_schedule_user``) build an explicit + # ``copy_from`` Table that omits the UNIQUE; this deterministically + # drops it across all dialects, including SQLite where unnamed + # constraints reflect with ``name=None`` and can't be dropped by + # name. For the other six tables, reflection-based default + # ``batch_alter_table`` (auto-detect) is fine since there's no + # UNIQUE to drop. On PostgreSQL/MySQL, direct ALTER avoids the + # temp-table index-name collision; on SQLite, the auto-detect picks + # ``recreate=True`` because PK changes need it. 
+ if t.name in TABLES_WITH_PRE_EXISTING_UNIQUE: + with op.batch_alter_table( + t.name, + recreate="always", + copy_from=_build_pre_upgrade_table(insp, t), + ) as batch_op: + batch_op.drop_column("id") + batch_op.create_primary_key(f"pk_{t.name}", [t.fk1, t.fk2]) + else: + with op.batch_alter_table(t.name) as batch_op: + batch_op.drop_column("id") + batch_op.create_primary_key(f"pk_{t.name}", [t.fk1, t.fk2]) + + +def downgrade() -> None: + # Inverse order: undo upgrade transformations from last-applied to + # first-applied. Within each table, drop the composite PK, restore the + # surrogate ``id`` column, and re-add the original ``UNIQUE`` constraint + # on the two tables that previously carried one. + # + # Note: FK columns remain NOT NULL after downgrade (intentional asymmetry + # — see UPDATING.md). Restoring the original nullable state would require + # an explicit ``alter_column`` per FK per table for no operator value; + # junction-table NULL FKs were always meaningless under ``secondary=`` + # semantics. + # The downgrade names the restored PK ``
_pkey`` (matching Postgres' + # default constraint-naming convention, which was the original constraint + # name before this migration ran) so a downgrade-then-upgrade round-trip + # doesn't collide on the upgrade's ``pk_
`` name. + # + # Adding a NOT NULL ``id`` column to a table with existing rows requires + # a default that fires on the existing rows. ``sa.Identity()`` (Postgres + # 10+ / MySQL 8+) and ``sa.Sequence`` (with explicit nextval) both + # backfill existing rows during ALTER TABLE; bare ``autoincrement=True`` + # does not. ``Identity`` is the modern portable choice. + for t in reversed(AFFECTED_TABLES): + with op.batch_alter_table(t.name) as batch_op: + batch_op.drop_constraint(f"pk_{t.name}", type_="primary") + batch_op.add_column( + sa.Column( + "id", + sa.Integer, + sa.Identity(always=False), + nullable=False, + ) + ) + batch_op.create_primary_key(f"{t.name}_pkey", ["id"]) + if t.name in TABLES_WITH_PRE_EXISTING_UNIQUE: + batch_op.create_unique_constraint( + f"uq_{t.name}_{t.fk1}_{t.fk2}", [t.fk1, t.fk2] + ) diff --git a/superset/models/dashboard.py b/superset/models/dashboard.py index f38d801719ea..bb1711678b70 100644 --- a/superset/models/dashboard.py +++ b/superset/models/dashboard.py @@ -35,7 +35,6 @@ String, Table, Text, - UniqueConstraint, ) from sqlalchemy.engine.base import Connection from sqlalchemy.orm import relationship, subqueryload @@ -93,37 +92,53 @@ def copy_dashboard(_mapper: Mapper, _connection: Connection, target: Dashboard) dashboard_slices = Table( "dashboard_slices", metadata, - Column("id", Integer, primary_key=True), - Column("dashboard_id", Integer, ForeignKey("dashboards.id", ondelete="CASCADE")), - Column("slice_id", Integer, ForeignKey("slices.id", ondelete="CASCADE")), - UniqueConstraint("dashboard_id", "slice_id"), + Column( + "dashboard_id", + Integer, + ForeignKey("dashboards.id", ondelete="CASCADE"), + primary_key=True, + ), + Column( + "slice_id", + Integer, + ForeignKey("slices.id", ondelete="CASCADE"), + primary_key=True, + ), ) dashboard_user = Table( "dashboard_user", metadata, - Column("id", Integer, primary_key=True), - Column("user_id", Integer, ForeignKey("ab_user.id", ondelete="CASCADE")), - Column("dashboard_id", Integer, 
ForeignKey("dashboards.id", ondelete="CASCADE")), + Column( + "user_id", + Integer, + ForeignKey("ab_user.id", ondelete="CASCADE"), + primary_key=True, + ), + Column( + "dashboard_id", + Integer, + ForeignKey("dashboards.id", ondelete="CASCADE"), + primary_key=True, + ), ) DashboardRoles = Table( "dashboard_roles", metadata, - Column("id", Integer, primary_key=True), Column( "dashboard_id", Integer, ForeignKey("dashboards.id", ondelete="CASCADE"), - nullable=False, + primary_key=True, ), Column( "role_id", Integer, ForeignKey("ab_role.id", ondelete="CASCADE"), - nullable=False, + primary_key=True, ), ) diff --git a/superset/models/slice.py b/superset/models/slice.py index 04c698ce95da..0012d082efb9 100644 --- a/superset/models/slice.py +++ b/superset/models/slice.py @@ -58,9 +58,18 @@ slice_user = Table( "slice_user", metadata, - Column("id", Integer, primary_key=True), - Column("user_id", Integer, ForeignKey("ab_user.id", ondelete="CASCADE")), - Column("slice_id", Integer, ForeignKey("slices.id", ondelete="CASCADE")), + Column( + "user_id", + Integer, + ForeignKey("ab_user.id", ondelete="CASCADE"), + primary_key=True, + ), + Column( + "slice_id", + Integer, + ForeignKey("slices.id", ondelete="CASCADE"), + primary_key=True, + ), ) logger = logging.getLogger(__name__) diff --git a/superset/reports/models.py b/superset/reports/models.py index f0abda8a9216..7564336ae11d 100644 --- a/superset/reports/models.py +++ b/superset/reports/models.py @@ -101,20 +101,18 @@ class ReportSourceFormat(StrEnum): report_schedule_user = Table( "report_schedule_user", metadata, - Column("id", Integer, primary_key=True), Column( "user_id", Integer, ForeignKey("ab_user.id", ondelete="CASCADE"), - nullable=False, + primary_key=True, ), Column( "report_schedule_id", Integer, ForeignKey("report_schedule.id", ondelete="CASCADE"), - nullable=False, + primary_key=True, ), - UniqueConstraint("user_id", "report_schedule_id"), ) diff --git 
a/tests/integration_tests/migrations/composite_pk_association_tables__tests.py b/tests/integration_tests/migrations/composite_pk_association_tables__tests.py new file mode 100644 index 000000000000..52b1942bdb24 --- /dev/null +++ b/tests/integration_tests/migrations/composite_pk_association_tables__tests.py @@ -0,0 +1,131 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Schema-shape assertion tests for the composite-PK association-tables +migration (revision 2bee73611e32). + +Builds the pre-migration shape against an isolated in-memory SQLite engine, +runs the migration's ``upgrade()``, and asserts the resulting shape matches +the data-model.md "After" specification: no ``id`` column, composite PK on +the two FK columns, and no redundant ``UNIQUE(fk1, fk2)`` on the two tables +that previously carried one. + +Continuum-restore verification is OUT OF SCOPE; that work lives in the +versioning epic (sc-103156). Cross-backend verification (PostgreSQL, MySQL) +is handled by the CI matrix (T034a). 
+""" + +from importlib import import_module + +import pytest +import sqlalchemy as sa +from alembic.migration import MigrationContext +from alembic.operations import Operations +from sqlalchemy import inspect + +# Import the migration module under test. +_migration = import_module( + "superset.migrations.versions." + "2026-05-01_23-36_2bee73611e32_composite_pk_association_tables" +) +AFFECTED_TABLES = _migration.AFFECTED_TABLES +TABLES_WITH_PRE_EXISTING_UNIQUE = _migration.TABLES_WITH_PRE_EXISTING_UNIQUE + + +@pytest.fixture(scope="module") +def post_upgrade_engine() -> sa.engine.Engine: + """An isolated in-memory SQLite engine with the migration applied to a + pre-migration-shaped seed schema. Used by the post-upgrade assertions + below. Module-scoped so the upgrade only runs once per test module.""" + engine = sa.create_engine("sqlite:///:memory:") + md = sa.MetaData() + for t in AFFECTED_TABLES: + cols: list[sa.SchemaItem] = [ + sa.Column("id", sa.Integer, primary_key=True), + sa.Column(t.fk1, sa.Integer, nullable=False), + sa.Column(t.fk2, sa.Integer, nullable=False), + ] + constraints: list[sa.SchemaItem] = [] + if t.name in TABLES_WITH_PRE_EXISTING_UNIQUE: + constraints.append(sa.UniqueConstraint(t.fk1, t.fk2)) + sa.Table(t.name, md, *cols, *constraints) + md.create_all(engine) + + # Apply the migration's upgrade() against this engine via Alembic's + # MigrationContext, patching the migration module's ``op`` reference. 
+ with engine.connect() as conn: + ctx = MigrationContext.configure(conn) + ops = Operations(ctx) + original_op = _migration.op + _migration.op = ops # type: ignore[attr-defined] + try: + _migration.upgrade() + finally: + _migration.op = original_op # type: ignore[attr-defined] + return engine + + +@pytest.mark.parametrize("t", AFFECTED_TABLES, ids=lambda t: t.name) +def test_no_id_column(post_upgrade_engine: sa.engine.Engine, t) -> None: + """The synthetic ``id`` column is gone from each affected table.""" + insp = inspect(post_upgrade_engine) + column_names = {c["name"] for c in insp.get_columns(t.name)} + assert "id" not in column_names, ( + f"{t.name} still has an 'id' column after migration; " + f"composite-PK conversion incomplete" + ) + + +@pytest.mark.parametrize("t", AFFECTED_TABLES, ids=lambda t: t.name) +def test_primary_key_is_composite_fks(post_upgrade_engine: sa.engine.Engine, t) -> None: + """The primary key of each affected table is exactly ``(fk1, fk2)``.""" + insp = inspect(post_upgrade_engine) + pk_cols = set(insp.get_pk_constraint(t.name).get("constrained_columns", [])) + assert pk_cols == {t.fk1, t.fk2}, ( + f"{t.name} primary key is {pk_cols}, expected {{{t.fk1}, {t.fk2}}}" + ) + + +@pytest.mark.parametrize( + "t", + [t for t in AFFECTED_TABLES if t.name in TABLES_WITH_PRE_EXISTING_UNIQUE], + ids=lambda t: t.name, +) +def test_redundant_unique_dropped(post_upgrade_engine: sa.engine.Engine, t) -> None: + """For the two tables that previously carried a UNIQUE(fk1, fk2), that + constraint is now subsumed by the composite PK and must not appear + separately in the unique-constraint list.""" + insp = inspect(post_upgrade_engine) + redundant_pair = {t.fk1, t.fk2} + for uc in insp.get_unique_constraints(t.name): + cols = set(uc.get("column_names", [])) + assert cols != redundant_pair, ( + f"{t.name} still carries a redundant UniqueConstraint over " + f"{redundant_pair} (name={uc.get('name')!r}); " + f"composite-PK conversion incomplete" + ) + + 
+@pytest.mark.parametrize("t", AFFECTED_TABLES, ids=lambda t: t.name) +def test_fk_columns_not_null(post_upgrade_engine: sa.engine.Engine, t) -> None: + """PK promotion implicitly tightens the FK columns to NOT NULL.""" + insp = inspect(post_upgrade_engine) + cols_by_name = {c["name"]: c for c in insp.get_columns(t.name)} + for col in (t.fk1, t.fk2): + assert col in cols_by_name, f"{t.name} missing column {col}" + assert cols_by_name[col].get("nullable") is False, ( + f"{t.name}.{col} is nullable; expected NOT NULL after PK promotion" + ) diff --git a/tests/integration_tests/migrations/composite_pk_round_trip__tests.py b/tests/integration_tests/migrations/composite_pk_round_trip__tests.py new file mode 100644 index 000000000000..d83c9d113c3f --- /dev/null +++ b/tests/integration_tests/migrations/composite_pk_round_trip__tests.py @@ -0,0 +1,168 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Schema round-trip tests for the composite-PK association-tables migration +(revision 2bee73611e32). 
Builds the pre-migration shape against an in-memory +SQLite engine, runs the migration's ``upgrade()``, asserts the post-upgrade +shape, runs ``downgrade()``, asserts the prior shape is restored (modulo the +documented FK NOT NULL asymmetry), and re-runs ``upgrade()`` to verify +idempotency. + +This is run against an isolated in-memory engine via Alembic's +``MigrationContext`` so the test does not perturb the project's test DB. + +Cross-backend verification of the same migration against PostgreSQL and +MySQL is delegated to the CI matrix (see T034a in tasks.md) and to the +quickstart.md verification (T033). This file covers the SQLite slice. +""" + +from importlib import import_module +from typing import Any + +import pytest +import sqlalchemy as sa +from alembic.migration import MigrationContext +from alembic.operations import Operations +from sqlalchemy import inspect + +# Import the migration module under test. +_migration = import_module( + "superset.migrations.versions." + "2026-05-01_23-36_2bee73611e32_composite_pk_association_tables" +) +AFFECTED_TABLES = _migration.AFFECTED_TABLES +TABLES_WITH_PRE_EXISTING_UNIQUE = _migration.TABLES_WITH_PRE_EXISTING_UNIQUE + + +def _build_pre_migration_schema(engine: sa.engine.Engine) -> None: + """Recreate the eight tables in their pre-migration shape (surrogate + ``id INTEGER PRIMARY KEY`` plus an optional ``UNIQUE(fk1, fk2)`` on the + two tables that previously carried one). 
FKs to parent tables are + omitted to keep the test self-contained — we're testing schema + transformations, not FK enforcement.""" + md = sa.MetaData() + for t in AFFECTED_TABLES: + cols: list[sa.Column] = [ + sa.Column("id", sa.Integer, primary_key=True), + sa.Column(t.fk1, sa.Integer, nullable=False), + sa.Column(t.fk2, sa.Integer, nullable=False), + ] + constraints: list[sa.SchemaItem] = [] + if t.name in TABLES_WITH_PRE_EXISTING_UNIQUE: + constraints.append(sa.UniqueConstraint(t.fk1, t.fk2)) + sa.Table(t.name, md, *cols, *constraints) + md.create_all(engine) + + +def _shape(engine: sa.engine.Engine, table: str) -> dict[str, Any]: + """Return a structural summary for asserting equality across runs.""" + insp = inspect(engine) + pk = insp.get_pk_constraint(table).get("constrained_columns", []) + columns = sorted(c["name"] for c in insp.get_columns(table)) + uniques = sorted( + tuple(sorted(uc.get("column_names", []))) + for uc in insp.get_unique_constraints(table) + ) + return {"columns": columns, "pk": sorted(pk), "uniques": uniques} + + +def _run_with_alembic_context(engine: sa.engine.Engine, fn) -> None: + """Run ``fn()`` (the migration's upgrade/downgrade body) inside a fresh + Alembic ``MigrationContext`` bound to ``engine``. Patches the + migration module's ``op`` to point at this context so its + ``op.get_bind()`` and ``op.batch_alter_table`` calls execute against + the in-memory engine.""" + with engine.connect() as conn: + ctx = MigrationContext.configure(conn) + ops = Operations(ctx) + original_op = _migration.op + _migration.op = ops # type: ignore[attr-defined] + try: + fn() + finally: + _migration.op = original_op # type: ignore[attr-defined] + + +def test_round_trip_against_in_memory_sqlite() -> None: + """Round-trip: pre-migration → upgrade → downgrade → upgrade again. + + Asserts: + - Post-upgrade shape: no ``id``, composite PK on (fk1, fk2), no + UNIQUE(fk1, fk2) on the two tables that previously carried one. 
+ - Post-downgrade shape: ``id`` restored, PK back on (id), UNIQUE + re-added on the two tables. (FK columns remain NOT NULL — the + documented intentional asymmetry.) + - Post-re-upgrade idempotency: shape matches the first post-upgrade. + """ + engine = sa.create_engine("sqlite:///:memory:") + _build_pre_migration_schema(engine) + + pre_shape = {t.name: _shape(engine, t.name) for t in AFFECTED_TABLES} + + _run_with_alembic_context(engine, _migration.upgrade) + + for t in AFFECTED_TABLES: + s = _shape(engine, t.name) + assert "id" not in s["columns"], f"{t.name}: id still present post-upgrade: {s}" + assert s["pk"] == sorted([t.fk1, t.fk2]), ( + f"{t.name}: PK is {s['pk']}, expected {sorted([t.fk1, t.fk2])}" + ) + assert tuple(sorted([t.fk1, t.fk2])) not in s["uniques"], ( + f"{t.name}: redundant UNIQUE not dropped post-upgrade: {s['uniques']}" + ) + + post_upgrade_shape = {t.name: _shape(engine, t.name) for t in AFFECTED_TABLES} + + _run_with_alembic_context(engine, _migration.downgrade) + + for t in AFFECTED_TABLES: + s = _shape(engine, t.name) + assert "id" in s["columns"], f"{t.name}: id not restored post-downgrade: {s}" + assert s["pk"] == ["id"], f"{t.name}: PK is {s['pk']}, expected ['id']" + if t.name in TABLES_WITH_PRE_EXISTING_UNIQUE: + assert tuple(sorted([t.fk1, t.fk2])) in s["uniques"], ( + f"{t.name}: UNIQUE not restored post-downgrade: {s['uniques']}" + ) + + _run_with_alembic_context(engine, _migration.upgrade) + + re_upgrade_shape = {t.name: _shape(engine, t.name) for t in AFFECTED_TABLES} + assert re_upgrade_shape == post_upgrade_shape, ( + "Re-upgrade shape differs from initial upgrade shape — " + "migration is not idempotent. " + f"diff: {set(re_upgrade_shape.items()) ^ set(post_upgrade_shape.items())}" + ) + + # Use pre_shape only to demonstrate it was captured (not asserted against + # because the round-trip downgrade intentionally diverges on FK NOT NULL). 
+ _ = pre_shape + + +def test_migration_module_constants_are_consistent() -> None: + """Sanity-check the migration module's exported constants. Catches + accidental edits that misalign AFFECTED_TABLES with the auxiliary sets.""" + affected_names = {t.name for t in AFFECTED_TABLES} + assert _migration.TABLES_WITH_PRE_EXISTING_UNIQUE.issubset(affected_names) + assert _migration.TABLES_WITH_NULLABLE_FKS.issubset(affected_names) + # Order is alphabetical (deterministic for review/bisection). + assert [t.name for t in AFFECTED_TABLES] == sorted(affected_names) + + +@pytest.mark.skipif(True, reason="placeholder — see test_round_trip above") +def test_placeholder_for_future_postgres_round_trip() -> None: + """Reserved slot for a future Postgres-specific round-trip if local + SQLite divergence ever needs to be cross-checked against the real + backend. Today's CI matrix (T034a) handles this implicitly.""" diff --git a/tests/unit_tests/migrations/composite_pk_association_tables_test.py b/tests/unit_tests/migrations/composite_pk_association_tables_test.py new file mode 100644 index 000000000000..6c3115edaf65 --- /dev/null +++ b/tests/unit_tests/migrations/composite_pk_association_tables_test.py @@ -0,0 +1,132 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +"""Unit tests for the composite-PK association-tables migration (revision +2bee73611e32). Verifies the post-migration constraint enforcement: duplicate +``(fk1, fk2)`` insertions fail with IntegrityError, distinct pairs succeed. + +Schema is built from the live ORM ``Table`` definitions via +``metadata.create_all(engine)`` against in-memory SQLite. This reflects the +post-T015–T018 ORM model state (composite-PK), independent of whether the +Alembic migration has run against the test DB. The two should agree. +""" + +import pytest +import sqlalchemy as sa +from sqlalchemy.exc import IntegrityError + +# (table_name, fk1_col, fk2_col, fk1_parent_table, fk2_parent_table) +# Parent-table names are needed to build the FK targets in the in-memory schema. +AFFECTED_TABLES = [ + ("dashboard_roles", "dashboard_id", "role_id", "dashboards", "ab_role"), + ("dashboard_slices", "dashboard_id", "slice_id", "dashboards", "slices"), + ("dashboard_user", "user_id", "dashboard_id", "ab_user", "dashboards"), + ( + "report_schedule_user", + "user_id", + "report_schedule_id", + "ab_user", + "report_schedule", + ), + ("rls_filter_roles", "role_id", "rls_filter_id", "ab_role", "rls_filter"), + ("rls_filter_tables", "table_id", "rls_filter_id", "tables", "rls_filter"), + ("slice_user", "user_id", "slice_id", "ab_user", "slices"), + ("sqlatable_user", "user_id", "table_id", "ab_user", "tables"), +] + + +def _build_in_memory_schema( + table_name: str, fk1: str, fk2: str, fk1_parent: str, fk2_parent: str +) -> tuple[sa.engine.Engine, sa.Table]: + """Build an in-memory SQLite schema with two minimal parent tables and + the junction table under test (composite-PK shape). 
Returns the engine + and the junction-table object for inserts.""" + metadata = sa.MetaData() + sa.Table( + fk1_parent, + metadata, + sa.Column("id", sa.Integer, primary_key=True), + ) + if fk2_parent != fk1_parent: + sa.Table( + fk2_parent, + metadata, + sa.Column("id", sa.Integer, primary_key=True), + ) + junction = sa.Table( + table_name, + metadata, + sa.Column( + fk1, + sa.Integer, + sa.ForeignKey(f"{fk1_parent}.id"), + primary_key=True, + ), + sa.Column( + fk2, + sa.Integer, + sa.ForeignKey(f"{fk2_parent}.id"), + primary_key=True, + ), + ) + engine = sa.create_engine("sqlite:///:memory:") + metadata.create_all(engine) + # Seed parent rows so the FK constraints can be satisfied. + # Identifiers come from the AFFECTED_TABLES test parameter list, not user input. + with engine.begin() as conn: + conn.execute( + sa.text(f"INSERT INTO {fk1_parent} (id) VALUES (1), (2)") # noqa: S608 + ) + if fk2_parent != fk1_parent: + conn.execute( + sa.text(f"INSERT INTO {fk2_parent} (id) VALUES (1), (2)") # noqa: S608 + ) + return engine, junction + + +@pytest.mark.parametrize("table,fk1,fk2,fk1_parent,fk2_parent", AFFECTED_TABLES) +def test_duplicate_insert_rejected( + table: str, fk1: str, fk2: str, fk1_parent: str, fk2_parent: str +) -> None: + """Inserting the same ``(fk1, fk2)`` pair twice raises ``IntegrityError``. + + Verifies SC-004 / FR-007 — the composite primary key enforces uniqueness + at the database level on every affected table. + """ + engine, junction = _build_in_memory_schema(table, fk1, fk2, fk1_parent, fk2_parent) + with engine.begin() as conn: + conn.execute(junction.insert().values({fk1: 1, fk2: 1})) + with pytest.raises(IntegrityError): + conn.execute(junction.insert().values({fk1: 1, fk2: 1})) + + +@pytest.mark.parametrize("table,fk1,fk2,fk1_parent,fk2_parent", AFFECTED_TABLES) +def test_distinct_pairs_accepted( + table: str, fk1: str, fk2: str, fk1_parent: str, fk2_parent: str +) -> None: + """Two distinct ``(fk1, fk2)`` pairs both succeed. 
+ + Sanity check that the PK isn't accidentally a single-column constraint + (which would reject ``(1, 1)`` and ``(1, 2)`` as a duplicate on column 1). + """ + engine, junction = _build_in_memory_schema(table, fk1, fk2, fk1_parent, fk2_parent) + with engine.begin() as conn: + conn.execute(junction.insert().values({fk1: 1, fk2: 1})) + conn.execute(junction.insert().values({fk1: 1, fk2: 2})) + result = conn.execute( + sa.text(f"SELECT COUNT(*) FROM {table}") # noqa: S608 + ).scalar_one() + assert result == 2 From d0bf359bee30c69d04963213e813d95cc7cd9dcd Mon Sep 17 00:00:00 2001 From: Mike Bridge Date: Mon, 4 May 2026 10:14:59 -0600 Subject: [PATCH 02/14] fix(migration): always run NULL-FK cleanup; correct RLS test parent name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two cleanups from PR review: 1. ``dashboard_roles.dashboard_id`` was created nullable in revision e11ccdd12658 but was missing from ``TABLES_WITH_NULLABLE_FKS``. A production database with a stray NULL ``dashboard_id`` row would have failed the PK-add with a cryptic constraint violation. Fix by running the NULL-FK cleanup on every affected table — it is a no-op DELETE on tables whose FK columns are already NOT NULL, and it eliminates the risk of further drift in the hardcoded set. ``dashboard_roles`` is added to the documentation set; the runtime now does not consult it. 2. The unit-test parent-table name for ``rls_filter_roles`` and ``rls_filter_tables`` was ``rls_filter`` (does not exist) instead of the real parent ``row_level_security_filters``. Test passes either way (the in-memory FK is self-consistent), but the parameter is now accurate. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- ...3611e32_composite_pk_association_tables.py | 20 +++++++++++++------ .../composite_pk_association_tables_test.py | 16 +++++++++++++-- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py b/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py index 2c841bc6171a..ec637de0118d 100644 --- a/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py +++ b/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py @@ -74,11 +74,15 @@ class AssociationTable(NamedTuple): "report_schedule_user", } -# Six tables whose FK columns are nullable today. Promoting an FK to a primary -# key column makes it NOT NULL, so any existing NULL-FK rows would block the -# PK-add. We delete them in pre-flight (a junction-table row with a NULL FK -# is meaningless under SQLAlchemy ``secondary=`` semantics anyway). +# Tables whose FK columns are nullable in their original create_table +# migrations. ``dashboard_roles.dashboard_id`` (created in revision +# e11ccdd12658) is nullable; ``report_schedule_user`` is the only association +# table that was created with both FK columns ``NOT NULL``. The pre-flight +# NULL-FK cleanup is a cheap no-op DELETE when run against tables whose FKs +# are already NOT NULL, so we run it on every affected table to avoid drift +# bugs from this set going stale. 
TABLES_WITH_NULLABLE_FKS: set[str] = { + "dashboard_roles", "dashboard_slices", "dashboard_user", "rls_filter_roles", @@ -221,8 +225,12 @@ def upgrade() -> None: insp = inspect(conn) for t in AFFECTED_TABLES: - if t.name in TABLES_WITH_NULLABLE_FKS: - _delete_null_fk_rows(conn, t) + # Run NULL-FK cleanup unconditionally: it is a no-op DELETE on tables + # whose FK columns are already NOT NULL (cheap), and skipping it on a + # table whose FK was nullable would leave the PK-add to fail with a + # cryptic constraint violation. Cf. ``TABLES_WITH_NULLABLE_FKS`` above + # for documentation of which tables are known to have nullable FKs. + _delete_null_fk_rows(conn, t) _dedupe_by_min_id(conn, t) _assert_no_duplicates(conn, t) diff --git a/tests/unit_tests/migrations/composite_pk_association_tables_test.py b/tests/unit_tests/migrations/composite_pk_association_tables_test.py index 6c3115edaf65..05a69293a23b 100644 --- a/tests/unit_tests/migrations/composite_pk_association_tables_test.py +++ b/tests/unit_tests/migrations/composite_pk_association_tables_test.py @@ -41,8 +41,20 @@ "ab_user", "report_schedule", ), - ("rls_filter_roles", "role_id", "rls_filter_id", "ab_role", "rls_filter"), - ("rls_filter_tables", "table_id", "rls_filter_id", "tables", "rls_filter"), + ( + "rls_filter_roles", + "role_id", + "rls_filter_id", + "ab_role", + "row_level_security_filters", + ), + ( + "rls_filter_tables", + "table_id", + "rls_filter_id", + "tables", + "row_level_security_filters", + ), ("slice_user", "user_id", "slice_id", "ab_user", "slices"), ("sqlatable_user", "user_id", "table_id", "ab_user", "tables"), ] From 761579164654dcefee16d231612c96d313db357d Mon Sep 17 00:00:00 2001 From: Mike Bridge Date: Mon, 4 May 2026 10:38:40 -0600 Subject: [PATCH 03/14] docs(migration): address SQLAlchemy review follow-ups Four operator-experience improvements from the second review pass: 1. 
``TABLES_WITH_NULLABLE_FKS`` is now explicitly documented as an informational set that is not consulted at runtime; the comment explains the previous ``dashboard_roles`` omission was the bug that motivated the always-run cleanup. 2. ``_delete_null_fk_rows`` docstring updated to match the "always run" semantics (was still claiming "called only on tables in TABLES_WITH_NULLABLE_FKS"). 3. ``_check_no_external_fks_to_id`` now documents its scope limitation: ``Inspector.get_table_names()`` returns the default schema only, so cross-schema FKs in non-standard multi-schema PostgreSQL deployments would not be caught. The single-schema case (Superset's documented deployment) is fully covered. 4. ``_dedupe_by_min_id`` now logs a sample of up to 10 discarded ``(fk1, fk2, id)`` tuples at WARN before deletion, so operators can audit which rows the ``MIN(id)`` policy drops. The keep- original policy is correct in practice but discards later re-grants on ownership tables; the sample makes that visible. 5. ``UPDATING.md`` documents the upgrade/downgrade primary-key name divergence (``pk_
<table_name>`` vs ``<table_name>
_pkey``) so operators using schema-comparison tools don't mistake it for migration drift. No schema or runtime-behaviour changes. All 44 migration tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- UPDATING.md | 2 + ...3611e32_composite_pk_association_tables.py | 59 +++++++++++++++---- 2 files changed, 49 insertions(+), 12 deletions(-) diff --git a/UPDATING.md b/UPDATING.md index 1ece31a1c6fd..6bcb75136142 100644 --- a/UPDATING.md +++ b/UPDATING.md @@ -357,6 +357,8 @@ SELECT COUNT(*) FROM sqlatable_user WHERE user_id IS NULL OR table_id IS NULL; **Intentional downgrade asymmetry.** The migration's `downgrade()` restores the surrogate `id` column and (for `dashboard_slices` and `report_schedule_user`) the original `UNIQUE (fk1, fk2)` constraint, but it does **not** restore the original `NULL`-allowed state on the FK columns — they remain `NOT NULL`. This is intentional: under SQLAlchemy's `secondary=` semantics, a `NULL` in either FK column of a junction table is meaningless (it cannot participate in either side of the relationship). Operators downgrading are not expected to need this restored. The asymmetry is documented for completeness so that round-trip schema diffs are not mistaken for migration bugs. +**Constraint-name divergence between upgrade and downgrade.** The composite primary key created on upgrade is named `pk_
<table_name>` (Alembic's default for `op.create_primary_key("pk_<table_name>", ...)`), while the surrogate `id` primary key restored on downgrade is named `<table_name>
_pkey` (PostgreSQL's default convention for `PrimaryKeyConstraint("id")`). The two names alternate so that a round-trip (upgrade → downgrade → upgrade) does not collide on a pre-existing constraint name. Operators using schema-comparison tools (e.g. `pg_diff`, `migra`) against a downgraded database may see this as drift versus a fresh-install schema. It is cosmetic — no application code references either constraint name. + ## 6.0.0 - [33055](https://github.com/apache/superset/pull/33055): Upgrades Flask-AppBuilder to 5.0.0. The AUTH_OID authentication type has been deprecated and is no longer available as an option in Flask-AppBuilder. OpenID (OID) is considered a deprecated authentication protocol - if you are using AUTH_OID, you will need to migrate to an alternative authentication method such as OAuth, LDAP, or database authentication before upgrading. - [34871](https://github.com/apache/superset/pull/34871): Fixed Jest test hanging issue from Ant Design v5 upgrade. MessageChannel is now mocked in test environment to prevent rc-overflow from causing Jest to hang. Test environment only - no production impact. diff --git a/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py b/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py index ec637de0118d..398e96cb755f 100644 --- a/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py +++ b/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py @@ -74,13 +74,15 @@ class AssociationTable(NamedTuple): "report_schedule_user", } -# Tables whose FK columns are nullable in their original create_table -# migrations. ``dashboard_roles.dashboard_id`` (created in revision -# e11ccdd12658) is nullable; ``report_schedule_user`` is the only association -# table that was created with both FK columns ``NOT NULL``. 
The pre-flight -# NULL-FK cleanup is a cheap no-op DELETE when run against tables whose FKs -# are already NOT NULL, so we run it on every affected table to avoid drift -# bugs from this set going stale. +# Documentation set: tables whose FK columns are nullable in their original +# create_table migrations (``dashboard_roles.dashboard_id`` from revision +# e11ccdd12658 is the most recent addition). ``report_schedule_user`` is the +# only affected table created with both FK columns ``NOT NULL`` and is +# intentionally absent here. This set is no longer consulted at runtime — the +# upgrade now runs the NULL-FK cleanup on every affected table because the +# DELETE is a cheap no-op when the columns are already NOT NULL, and that +# eliminates the risk of bugs from this set going stale (the +# ``dashboard_roles`` omission caught in PR review was exactly that bug). TABLES_WITH_NULLABLE_FKS: set[str] = { "dashboard_roles", "dashboard_slices", @@ -95,7 +97,18 @@ class AssociationTable(NamedTuple): def _check_no_external_fks_to_id(conn: Connection) -> None: """Raise ``RuntimeError`` if any foreign key in the database references one of the eight junction-table ``id`` columns. Uses SQLAlchemy's ``Inspector`` - for dialect-agnostic introspection across PostgreSQL, MySQL, and SQLite.""" + for dialect-agnostic introspection across PostgreSQL, MySQL, and SQLite. + + Scope limitation: ``Inspector.get_table_names()`` returns tables in the + connection's default schema only. On PostgreSQL deployments where Superset + metadata lives in a non-default schema, or on multi-schema deployments + that allow cross-schema FKs, an external FK in another schema would not + be detected. This is acceptable for the standard single-schema + deployment that Superset documents; operators with multi-schema + metadata should run the equivalent inventory query against + ``information_schema.referential_constraints`` themselves before + applying. 
+ """ affected = {t.name for t in AFFECTED_TABLES} insp = inspect(conn) for table_name in insp.get_table_names(): @@ -115,10 +128,10 @@ def _check_no_external_fks_to_id(conn: Connection) -> None: def _delete_null_fk_rows(conn: Connection, t: AssociationTable) -> int: """Delete rows where ``t.fk1`` or ``t.fk2`` is NULL on ``t.name``. - Returns the deletion count. Called only on tables in - ``TABLES_WITH_NULLABLE_FKS``. Required because primary-key columns must be + Returns the deletion count. Required because primary-key columns must be NOT NULL; the PK-add downstream would fail with a cryptic constraint - violation if any NULL-FK rows survived. + violation if any NULL-FK rows survived. Run unconditionally on every + affected table — see ``TABLES_WITH_NULLABLE_FKS`` above for the rationale. """ # Identifiers come from the AFFECTED_TABLES whitelist, not user input. sql = sa.text( @@ -142,8 +155,22 @@ def _dedupe_by_min_id(conn: Connection, t: AssociationTable) -> int: portability — MySQL rejects ``DELETE FROM t WHERE id NOT IN (SELECT MIN(id) FROM t GROUP BY ...)`` with ERROR 1093 unless the inner SELECT is wrapped to force materialization. + + Logs a sample (up to 10) of the discarded ``(fk1, fk2, id)`` tuples at + WARN before deletion, so operators can audit which rows are dropped — the + "keep ``MIN(id)``" policy preserves the original row, which is correct + in practice but discards any later, semantically-identical re-grants. """ # Identifiers come from the AFFECTED_TABLES whitelist, not user input. 
+ sample_sql = sa.text( + f"SELECT {t.fk1}, {t.fk2}, id FROM {t.name} WHERE id NOT IN (" # noqa: S608 + f" SELECT keep_id FROM (" + f" SELECT MIN(id) AS keep_id FROM {t.name} " + f"GROUP BY {t.fk1}, {t.fk2}" + f" ) AS s" + f") LIMIT 10" + ) + sample = list(conn.execute(sample_sql)) sql = sa.text( f"DELETE FROM {t.name} WHERE id NOT IN (" # noqa: S608 f" SELECT keep_id FROM (" @@ -155,7 +182,15 @@ def _dedupe_by_min_id(conn: Connection, t: AssociationTable) -> int: result = conn.execute(sql) n = result.rowcount or 0 if n: - logger.warning("Deduped %d duplicate row(s) from %s", n, t.name) + logger.warning( + "Deduped %d duplicate row(s) from %s; sample of discarded " + "(%s, %s, id) tuples (up to 10): %s", + n, + t.name, + t.fk1, + t.fk2, + sample, + ) return n From 4c50af18d2d2f06042484bbccf5a8cf7c09dacf4 Mon Sep 17 00:00:00 2001 From: Mike Bridge Date: Mon, 4 May 2026 15:35:14 -0600 Subject: [PATCH 04/14] refactor(migration): build pre-flight SQL via SQLAlchemy core (review) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address Beto's review comments on apache/superset#39859: replace ``sa.text(f"...")`` SQL construction in the three pre-flight helpers (``_delete_null_fk_rows``, ``_dedupe_by_min_id``, ``_assert_no_duplicates``) with SQLAlchemy core constructs (``sa.delete``, ``sa.select``, ``sa.func``, ``.subquery()``, ``.notin_()``). A small ``_table_clause()`` helper builds a lightweight ``TableClause`` exposing the columns the queries reference; the three helpers consume it. Removes all ``# noqa: S608`` comments — they are no longer needed because there is no string-interpolated SQL. Verified the compiled SQL is identical on Postgres, MySQL, and SQLite, including the MySQL ERROR 1093 workaround (the inner aggregation is wrapped in a derived table via ``.subquery()``, producing ``... NOT IN (SELECT keep_id FROM (SELECT min(id) ...) AS keep_min)``). 
Also drops the redundant ``f`` prefix on the two non-interpolating lines of the ``_check_no_external_fks_to_id`` error message. 44 migration tests still pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- ...3611e32_composite_pk_association_tables.py | 83 ++++++++++--------- 1 file changed, 46 insertions(+), 37 deletions(-) diff --git a/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py b/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py index 398e96cb755f..8a128bfd7461 100644 --- a/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py +++ b/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py @@ -120,11 +120,19 @@ def _check_no_external_fks_to_id(conn: Connection) -> None: f"Cannot drop synthetic id from {fk['referred_table']}: " f"external FK {fk.get('name', '')} on {table_name} " f"references {fk['referred_table']}({fk['referred_columns']}). " - f"Drop or migrate the referencing FK before applying this " - f"migration." + "Drop or migrate the referencing FK before applying this " + "migration." ) +def _table_clause(t: AssociationTable) -> sa.sql.expression.TableClause: + """Build a lightweight SQLAlchemy ``TableClause`` for ``t`` exposing the + columns the helper queries reference (``id``, ``fk1``, ``fk2``). Used so + that the dedupe / cleanup / assert SQL can be expressed via SQLAlchemy + core constructs rather than via string interpolation.""" + return sa.table(t.name, sa.column("id"), sa.column(t.fk1), sa.column(t.fk2)) + + def _delete_null_fk_rows(conn: Connection, t: AssociationTable) -> int: """Delete rows where ``t.fk1`` or ``t.fk2`` is NULL on ``t.name``. @@ -133,11 +141,9 @@ def _delete_null_fk_rows(conn: Connection, t: AssociationTable) -> int: violation if any NULL-FK rows survived. 
Run unconditionally on every affected table — see ``TABLES_WITH_NULLABLE_FKS`` above for the rationale. """ - # Identifiers come from the AFFECTED_TABLES whitelist, not user input. - sql = sa.text( - f"DELETE FROM {t.name} WHERE {t.fk1} IS NULL OR {t.fk2} IS NULL" # noqa: S608 - ) - result = conn.execute(sql) + tbl = _table_clause(t) + stmt = sa.delete(tbl).where(sa.or_(tbl.c[t.fk1].is_(None), tbl.c[t.fk2].is_(None))) + result = conn.execute(stmt) n = result.rowcount or 0 if n: logger.warning( @@ -151,35 +157,35 @@ def _delete_null_fk_rows(conn: Connection, t: AssociationTable) -> int: def _dedupe_by_min_id(conn: Connection, t: AssociationTable) -> int: """Delete duplicate ``(t.fk1, t.fk2)`` rows from ``t.name`` keeping ``MIN(id)``. - Returns the deletion count. Uses the wrapped-subquery form for MySQL - portability — MySQL rejects ``DELETE FROM t WHERE id NOT IN (SELECT MIN(id) - FROM t GROUP BY ...)`` with ERROR 1093 unless the inner SELECT is wrapped - to force materialization. + Returns the deletion count. The ``NOT IN`` argument is wrapped in an + extra ``SELECT keep_id FROM (...) AS s`` derived table because MySQL + rejects ``DELETE FROM t WHERE id NOT IN (SELECT MIN(id) FROM t GROUP BY + ...)`` with ERROR 1093 unless the inner SELECT is materialized through + a derived table. SQLAlchemy's ``.subquery()`` produces that wrap. Logs a sample (up to 10) of the discarded ``(fk1, fk2, id)`` tuples at - WARN before deletion, so operators can audit which rows are dropped — the - "keep ``MIN(id)``" policy preserves the original row, which is correct - in practice but discards any later, semantically-identical re-grants. + WARN before deletion, so operators can audit which rows are dropped — + the "keep ``MIN(id)``" policy preserves the original row, which is + correct in practice but discards any later, semantically-identical + re-grants. """ - # Identifiers come from the AFFECTED_TABLES whitelist, not user input. 
- sample_sql = sa.text( - f"SELECT {t.fk1}, {t.fk2}, id FROM {t.name} WHERE id NOT IN (" # noqa: S608 - f" SELECT keep_id FROM (" - f" SELECT MIN(id) AS keep_id FROM {t.name} " - f"GROUP BY {t.fk1}, {t.fk2}" - f" ) AS s" - f") LIMIT 10" + tbl = _table_clause(t) + + keep_min = ( + sa.select(sa.func.min(tbl.c.id).label("keep_id")) + .group_by(tbl.c[t.fk1], tbl.c[t.fk2]) + .subquery("keep_min") ) - sample = list(conn.execute(sample_sql)) - sql = sa.text( - f"DELETE FROM {t.name} WHERE id NOT IN (" # noqa: S608 - f" SELECT keep_id FROM (" - f" SELECT MIN(id) AS keep_id FROM {t.name} " - f"GROUP BY {t.fk1}, {t.fk2}" - f" ) AS s" - f")" + keep_ids = sa.select(keep_min.c.keep_id) + discarded = tbl.c.id.notin_(keep_ids) + + sample_stmt = ( + sa.select(tbl.c[t.fk1], tbl.c[t.fk2], tbl.c.id).where(discarded).limit(10) ) - result = conn.execute(sql) + sample = list(conn.execute(sample_stmt)) + + delete_stmt = sa.delete(tbl).where(discarded) + result = conn.execute(delete_stmt) n = result.rowcount or 0 if n: logger.warning( @@ -201,13 +207,16 @@ def _assert_no_duplicates(conn: Connection, t: AssociationTable) -> None: dedupe failures (e.g., a MySQL syntax issue) as an actionable error before the PK-add fires with a less-helpful constraint-violation message. """ - # Identifiers come from the AFFECTED_TABLES whitelist, not user input. - sql = sa.text( - f"SELECT COUNT(*) FROM (" # noqa: S608 - f" SELECT 1 FROM {t.name} GROUP BY {t.fk1}, {t.fk2} HAVING COUNT(*) > 1" - f") AS s" + tbl = _table_clause(t) + duplicate_groups = ( + sa.select(sa.literal(1)) + .select_from(tbl) + .group_by(tbl.c[t.fk1], tbl.c[t.fk2]) + .having(sa.func.count() > 1) + .subquery("duplicate_groups") ) - if remaining := conn.scalar(sql) or 0: + count_stmt = sa.select(sa.func.count()).select_from(duplicate_groups) + if remaining := conn.scalar(count_stmt) or 0: raise RuntimeError( f"Dedupe failed for {t.name}: {remaining} duplicate " f"({t.fk1}, {t.fk2}) groups remain after _dedupe_by_min_id. 
" From 499763e2914cc88d354b7d89e19f7f25598d207d Mon Sep 17 00:00:00 2001 From: Mike Bridge Date: Mon, 4 May 2026 16:01:58 -0600 Subject: [PATCH 05/14] fix(migration): drop FKs before recreate on MySQL (sc-105349) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI test-mysql failed with: MySQLdb.OperationalError: (1826, "Duplicate foreign key constraint name 'fk_dashboard_slices_slice_id_slices'") Root cause: MySQL scopes foreign-key constraint names per-database, not per-table (PostgreSQL and SQLite scope per-table). The ``batch_alter_table(... recreate="always", copy_from=...)`` path used for ``dashboard_slices`` and ``report_schedule_user`` builds ``_alembic_tmp_
`` carrying the original FK names from ``copy_from`` while the original table still holds those names — MySQL rejects the temp-table creation with ERROR 1826. Fix: on MySQL only, drop the original FK constraints by name before the ``batch_alter_table`` runs. The ``copy_from`` re-creates them on the rebuilt table with their original names, so the post-migration shape is unchanged. On PostgreSQL and SQLite the original code path still runs unchanged. Local SQLite tests (44 passed, 1 skipped) still pass; CI will validate on MySQL. Co-Authored-By: Claude Opus 4.7 (1M context) --- ...2bee73611e32_composite_pk_association_tables.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py b/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py index 8a128bfd7461..8d7b2846d342 100644 --- a/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py +++ b/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py @@ -289,6 +289,20 @@ def upgrade() -> None: # temp-table index-name collision; on SQLite, the auto-detect picks # ``recreate=True`` because PK changes need it. if t.name in TABLES_WITH_PRE_EXISTING_UNIQUE: + # MySQL ERROR 1826: foreign-key constraint names are unique + # per-database, not per-table. ``recreate="always"`` builds + # ``_alembic_tmp_
`` with the original FK names from + # ``copy_from``, but the original table still holds those + # names until it's dropped, which fails on MySQL with + # ``Duplicate foreign key constraint name``. PostgreSQL and + # SQLite scope FK names per-table, so the recreate path + # works there as-is. Drop the original FKs by name first + # on MySQL; ``copy_from`` re-creates them on the rebuilt + # table with their original names. + if conn.dialect.name == "mysql": + for fk in insp.get_foreign_keys(t.name): + if fk_name := fk.get("name"): + op.drop_constraint(fk_name, t.name, type_="foreignkey") with op.batch_alter_table( t.name, recreate="always", From c5bf70560d34195f41490979b3407da6ccd4632f Mon Sep 17 00:00:00 2001 From: Mike Bridge Date: Tue, 5 May 2026 10:41:03 -0600 Subject: [PATCH 06/14] fix(migration): MySQL downgrade FK + AUTO_INCREMENT (sc-105349) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two MySQL-only failures in the downgrade path, found by running the full migration history against a fresh MySQL 8 container: 1. ``MySQLdb.OperationalError: (1553, "Cannot drop index 'PRIMARY': needed in a foreign key constraint")``. InnoDB uses the composite PK index to back the FK on the leftmost column. The downgrade tried to drop the composite PK before dropping the FKs, orphaning the FK's backing index. PostgreSQL and SQLite create separate indexes for FK columns and don't trip on this. 2. ``Field 'id' doesn't have a default value`` on subsequent INSERT. ``sa.Identity(always=False)`` only emits ``AUTO_INCREMENT`` on MySQL when the column is created with ``primary_key=True`` — our portable path adds the column first then creates the PK separately, so MySQL leaves the column without auto-generation. Existing rows would all collide on id=0; future inserts fail because no default. Postgres' ``GENERATED BY DEFAULT AS IDENTITY`` and SQLite's ``INTEGER PRIMARY KEY`` rowid alias don't have this gap. 
Fix: extract ``_downgrade_mysql_table()`` that emits the canonical MySQL idiom — drop FKs, then a single ALTER combining ``DROP PRIMARY KEY, ADD COLUMN id INT NOT NULL AUTO_INCREMENT, ADD PRIMARY KEY (id)`` (which backfills existing rows with sequential ids and preserves AUTO_INCREMENT), restore the redundant UNIQUE on the 2 tables that originally had it, and re-add the FKs with their original names. Postgres and SQLite keep the existing portable ``batch_alter_table`` path. Raw SQL is unavoidable for the combined-ALTER form; per the constitution it's allowed for dialect-specific DDL with no SQLA equivalent, with triple-quoted strings for legibility. Verified end-to-end: upgrade → downgrade → upgrade against a fresh MySQL 8 container with INSERT-without-id sanity check showing the restored ``id`` column auto-increments correctly. Co-Authored-By: Claude Opus 4.7 (1M context) --- ...3611e32_composite_pk_association_tables.py | 108 +++++++++++++++--- 1 file changed, 94 insertions(+), 14 deletions(-) diff --git a/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py b/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py index 8d7b2846d342..e8a77614561c 100644 --- a/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py +++ b/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py @@ -337,19 +337,99 @@ def downgrade() -> None: # 10+ / MySQL 8+) and ``sa.Sequence`` (with explicit nextval) both # backfill existing rows during ALTER TABLE; bare ``autoincrement=True`` # does not. ``Identity`` is the modern portable choice. 
+ conn = op.get_bind() + insp = inspect(conn) + is_mysql = conn.dialect.name == "mysql" for t in reversed(AFFECTED_TABLES): - with op.batch_alter_table(t.name) as batch_op: - batch_op.drop_constraint(f"pk_{t.name}", type_="primary") - batch_op.add_column( - sa.Column( - "id", - sa.Integer, - sa.Identity(always=False), - nullable=False, - ) - ) - batch_op.create_primary_key(f"{t.name}_pkey", ["id"]) - if t.name in TABLES_WITH_PRE_EXISTING_UNIQUE: - batch_op.create_unique_constraint( - f"uq_{t.name}_{t.fk1}_{t.fk2}", [t.fk1, t.fk2] + if is_mysql: + _downgrade_mysql_table(insp, t) + else: + with op.batch_alter_table(t.name) as batch_op: + batch_op.drop_constraint(f"pk_{t.name}", type_="primary") + batch_op.add_column( + sa.Column( + "id", + sa.Integer, + sa.Identity(always=False), + nullable=False, + ) ) + batch_op.create_primary_key(f"{t.name}_pkey", ["id"]) + if t.name in TABLES_WITH_PRE_EXISTING_UNIQUE: + batch_op.create_unique_constraint( + f"uq_{t.name}_{t.fk1}_{t.fk2}", [t.fk1, t.fk2] + ) + + +def _downgrade_mysql_table( + insp: sa.engine.reflection.Inspector, t: AssociationTable +) -> None: + """MySQL-specific downgrade for one table. + + Two MySQL quirks force a dialect-specific path here: + + 1. **ERROR 1553 — ``Cannot drop index 'PRIMARY': needed in a foreign + key constraint``**. InnoDB uses the composite PK index to back the + FK on the leftmost column. Dropping the PK before the FKs orphans + that backing index. PostgreSQL and SQLite create separate indexes + for FK columns and don't need this dance. We drop the FKs first + and re-add them after the structural change. + + 2. **``Identity(always=False)`` on a non-PK column add does not emit + ``AUTO_INCREMENT`` on MySQL.** SQLAlchemy 1.4 only emits + ``AUTO_INCREMENT`` when the column has both ``Identity()`` and + ``primary_key=True`` at create time. 
Our portable path adds the + column first, then creates the PK separately — which works on + Postgres (the column gets ``GENERATED BY DEFAULT AS IDENTITY``) + and SQLite (``INTEGER PRIMARY KEY`` becomes a rowid alias) but + leaves MySQL without auto-generation, so existing rows can't be + backfilled and future ``INSERT`` statements fail with + ``Field 'id' doesn't have a default value``. The combined + ``DROP PRIMARY KEY, ADD COLUMN AUTO_INCREMENT, ADD PRIMARY KEY`` + in a single ALTER statement is the canonical MySQL idiom: MySQL + backfills existing rows with sequential values and the column + remains auto-incrementing for future inserts. + + Raw SQL is unavoidable here — there is no SQLAlchemy core equivalent + for the combined-ALTER form, and the constitution allows raw SQL for + dialect-specific DDL with no programmatic equivalent (preferring + triple-quoted strings for legibility). + """ + fks = insp.get_foreign_keys(t.name) + + for fk in fks: + if fk_name := fk.get("name"): + op.execute(f"ALTER TABLE `{t.name}` DROP FOREIGN KEY `{fk_name}`") + + op.execute( + f""" + ALTER TABLE `{t.name}` + DROP PRIMARY KEY, + ADD COLUMN id INT NOT NULL AUTO_INCREMENT, + ADD PRIMARY KEY (id) + """ + ) + + if t.name in TABLES_WITH_PRE_EXISTING_UNIQUE: + op.execute( + f""" + ALTER TABLE `{t.name}` + ADD UNIQUE INDEX `uq_{t.name}_{t.fk1}_{t.fk2}` + (`{t.fk1}`, `{t.fk2}`) + """ + ) + + for fk in fks: + ondelete = fk.get("options", {}).get("ondelete") + ondelete_clause = f" ON DELETE {ondelete}" if ondelete else "" + local_cols = ", ".join(f"`{c}`" for c in fk["constrained_columns"]) + ref_cols = ", ".join(f"`{c}`" for c in fk["referred_columns"]) + op.execute( + f""" + ALTER TABLE `{t.name}` + ADD CONSTRAINT `{fk["name"]}` + FOREIGN KEY ({local_cols}) + REFERENCES `{fk["referred_table"]}` ({ref_cols}) + {ondelete_clause} + """ + ) From da8f33ad242b6d81c62772c48cbbe0895607e9a3 Mon Sep 17 00:00:00 2001 From: Mike Bridge Date: Tue, 5 May 2026 10:46:01 -0600 Subject: [PATCH 07/14] 
fix(migration): explicit NOT NULL on FK columns for SQLite (sc-105349) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Found by running fresh-install + round-trip against a real SQLite DB: 6 of the 8 affected tables had FK columns that were originally declared nullable. PostgreSQL and MySQL implicitly promote the constituent columns of an ``ALTER TABLE ... ADD PRIMARY KEY`` to ``NOT NULL``; SQLite does not (it's a long-standing SQLite quirk — only the single-column ``INTEGER PRIMARY KEY`` rowid alias enforces NOT NULL; the columns of any other PRIMARY KEY, composite PKs included, may remain nullable). Result: a fresh SQLite install would accept ``INSERT INTO dashboard_slices VALUES (NULL, 5)`` despite both columns being part of the composite PK. Our integration tests previously masked this: the test fixture seeds columns with ``nullable=False``, so the post-upgrade NOT NULL assertion passed regardless of whether the migration enforced it. Fix: add explicit ``batch_op.alter_column(fk, nullable=False)`` for both FK columns inside the per-table batch_alter_table block. On PostgreSQL and MySQL this is a no-op (PK already implies NOT NULL); on SQLite it adds the missing NOT NULL declaration so a fresh install matches the data-model.md "After" contract. 
Verified end-to-end: - Postgres + MySQL: column shape unchanged (still NOT NULL) - SQLite fresh install + round-trip: all 8 tables now have NOT NULL on FK columns, ``INSERT (NULL, 5)`` correctly rejected with IntegrityError on dashboard_slices, dashboard_user, sqlatable_user Co-Authored-By: Claude Opus 4.7 (1M context) --- ..._2bee73611e32_composite_pk_association_tables.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py b/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py index e8a77614561c..210a419d0eea 100644 --- a/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py +++ b/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py @@ -310,10 +310,23 @@ def upgrade() -> None: ) as batch_op: batch_op.drop_column("id") batch_op.create_primary_key(f"pk_{t.name}", [t.fk1, t.fk2]) + # SQLite quirk: composite PRIMARY KEY does not promote the + # constituent columns to NOT NULL (only ``INTEGER PRIMARY + # KEY`` does). PostgreSQL and MySQL implicitly promote the + # PK columns to NOT NULL when the constraint is added, + # so the explicit ``alter_column`` is a no-op on those + # backends but enforces the post-upgrade contract on + # SQLite. Without it, ``INSERT (NULL, 5)`` would succeed + # on SQLite despite the columns being part of the PK. + batch_op.alter_column(t.fk1, existing_type=sa.Integer, nullable=False) + batch_op.alter_column(t.fk2, existing_type=sa.Integer, nullable=False) else: with op.batch_alter_table(t.name) as batch_op: batch_op.drop_column("id") batch_op.create_primary_key(f"pk_{t.name}", [t.fk1, t.fk2]) + # See comment above re: SQLite composite-PK NOT NULL quirk. 
+ batch_op.alter_column(t.fk1, existing_type=sa.Integer, nullable=False) + batch_op.alter_column(t.fk2, existing_type=sa.Integer, nullable=False) def downgrade() -> None: From 499a75f20db133c16ba4ddc54193a102b14192c1 Mon Sep 17 00:00:00 2001 From: Mike Bridge Date: Tue, 5 May 2026 11:07:10 -0600 Subject: [PATCH 08/14] fix(migration): rebase down_revision onto 33d7e0e21daa (sc-105349) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI cypress + playwright shards were red with: ERROR [flask_migrate] Error: Multiple head revisions are present for given argument 'head' The recent rebase onto master pulled in ``33d7e0e21daa_add_semantic_layers_and_views.py`` (from PR #37815, "semantic layer extension"), which had been authored against ``ce6bd21901ab`` as its parent — the same parent our migration referenced. After the rebase both migrations point at ``ce6bd21901ab``, producing two heads and breaking ``flask db upgrade head`` for any downstream consumer (CI's Cypress / Playwright shards spin up a real Superset instance via ``superset db upgrade``, which is why those shards failed first; the integration shards run against a precomputed schema and didn't surface this). Fix: chain our migration after the semantic-layer migration by pointing ``down_revision`` at ``33d7e0e21daa``. The chain is now linear: ... → ce6bd21901ab → 33d7e0e21daa (semantic layers) → 2bee73611e32 (composite PK, this PR) Verified with ``superset db heads`` (returns single head ``2bee73611e32``) and the local migration test suite (44 passed, 1 skipped). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- ...5-01_23-36_2bee73611e32_composite_pk_association_tables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py b/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py index 210a419d0eea..055ecd3c9700 100644 --- a/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py +++ b/superset/migrations/versions/2026-05-01_23-36_2bee73611e32_composite_pk_association_tables.py @@ -27,7 +27,7 @@ of the eight tables lacked DB-level uniqueness. Revision ID: 2bee73611e32 -Revises: ce6bd21901ab +Revises: 33d7e0e21daa Create Date: 2026-05-01 23:36:34.050058 """ @@ -42,7 +42,7 @@ # revision identifiers, used by Alembic. revision = "2bee73611e32" -down_revision = "ce6bd21901ab" +down_revision = "33d7e0e21daa" logger = logging.getLogger("alembic.env") From eb685d9476e7e725f836ad1252f62ef82b4b3e29 Mon Sep 17 00:00:00 2001 From: Mike Bridge Date: Thu, 7 May 2026 10:23:21 -0600 Subject: [PATCH 09/14] docs(UPDATING): add Postgres-targeted maintenance-window queries (sc-105349) Add a "Sizing the maintenance window on PostgreSQL" sub-section to the operator runbook. The simple per-table COUNT/duplicate/NULL queries that were already there are dialect-portable but only count rows; operators on PostgreSQL with large deployments need to characterize the migration's runtime cost before scheduling it. Adds four diagnostic queries: - Per-table size, row count (from pg_class.reltuples), and which migration path each table will take (recreate-rewrite vs direct ALTER). Sizes the work concretely. - Aggregated duplicate-row roll-up: dup_groups + total rows_dropped per table. Replaces eight separate per-table queries with one consolidated result for audit/dump-before-apply decisions. 
- External-FK pre-flight check (the same one the migration runs at upgrade time and aborts on). Lets operators surface any blocking external reference ahead of the maintenance window. Should be empty on a stock install. - Lock-window estimate for the two full-rewrite tables, using pg_relation_size and a conservative 100 MB/s rewrite throughput assumption. The other six use direct ALTER and are dominated by composite-index build time (seconds for low-millions-of-rows tables). Prompted by reviewer feedback on apache/superset#39859 from a large deployment asking how to size the maintenance window. The original pre-flight queries are kept for cross-dialect operators (MySQL, SQLite) since the new queries use PostgreSQL-specific catalog views. Co-Authored-By: Claude Opus 4.7 (1M context) --- UPDATING.md | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/UPDATING.md b/UPDATING.md index 6bcb75136142..1e6050a353b8 100644 --- a/UPDATING.md +++ b/UPDATING.md @@ -353,6 +353,108 @@ SELECT COUNT(*) FROM slice_user WHERE user_id IS NULL OR slice_id IS NULL; SELECT COUNT(*) FROM sqlatable_user WHERE user_id IS NULL OR table_id IS NULL; ``` +**Sizing the maintenance window on PostgreSQL.** The queries above are dialect-portable but only count rows. Operators on PostgreSQL can run the diagnostic queries below to characterize the migration's runtime cost ahead of time: per-table row count and on-disk size, an aggregated duplicate roll-up, the external-FK pre-flight check (the migration runs the same check and aborts if it returns rows), and a rewrite-time estimate for the two tables that go through the slower full-table-rebuild path. + +```sql +-- Per-table size, row count, and which migration path each will take. +-- Two tables ("dashboard_slices", "report_schedule_user") have a +-- redundant UNIQUE constraint that the migration drops via a full +-- table rewrite (op.batch_alter_table(recreate="always")). 
The other +-- six use direct ALTER TABLE, which is much cheaper. +WITH affected(name, has_unique) AS ( + VALUES + ('dashboard_roles', false), + ('dashboard_slices', true), + ('dashboard_user', false), + ('report_schedule_user', true), + ('rls_filter_roles', false), + ('rls_filter_tables', false), + ('slice_user', false), + ('sqlatable_user', false) +) +SELECT + a.name AS table_name, + CASE WHEN a.has_unique THEN 'recreate (full rewrite)' + ELSE 'direct ALTER' END AS migration_path, + c.reltuples::bigint AS estimated_rows, + pg_size_pretty(pg_total_relation_size(c.oid)) AS total_size, + pg_size_pretty(pg_relation_size(c.oid)) AS heap_size, + pg_size_pretty(pg_indexes_size(c.oid)) AS index_size +FROM affected a +JOIN pg_class c ON c.relname = a.name AND c.relkind = 'r' +ORDER BY pg_total_relation_size(c.oid) DESC; +``` + +```sql +-- Aggregated duplicate-row roll-up. +-- "dup_groups" is the number of (fk1, fk2) pairs that appear more +-- than once; "rows_dropped" is the total number of rows the +-- migration will delete during the dedupe pass (it keeps MIN(id) per +-- group and discards the rest). 
+SELECT 'dashboard_roles' AS t, COUNT(*) AS dup_groups, SUM(c) - COUNT(*) AS rows_dropped + FROM (SELECT COUNT(*) c FROM dashboard_roles GROUP BY dashboard_id, role_id HAVING COUNT(*) > 1) g +UNION ALL SELECT 'dashboard_slices', COUNT(*), SUM(c) - COUNT(*) + FROM (SELECT COUNT(*) c FROM dashboard_slices GROUP BY dashboard_id, slice_id HAVING COUNT(*) > 1) g +UNION ALL SELECT 'dashboard_user', COUNT(*), SUM(c) - COUNT(*) + FROM (SELECT COUNT(*) c FROM dashboard_user GROUP BY user_id, dashboard_id HAVING COUNT(*) > 1) g +UNION ALL SELECT 'report_schedule_user',COUNT(*), SUM(c) - COUNT(*) + FROM (SELECT COUNT(*) c FROM report_schedule_user GROUP BY user_id, report_schedule_id HAVING COUNT(*) > 1) g +UNION ALL SELECT 'rls_filter_roles', COUNT(*), SUM(c) - COUNT(*) + FROM (SELECT COUNT(*) c FROM rls_filter_roles GROUP BY role_id, rls_filter_id HAVING COUNT(*) > 1) g +UNION ALL SELECT 'rls_filter_tables', COUNT(*), SUM(c) - COUNT(*) + FROM (SELECT COUNT(*) c FROM rls_filter_tables GROUP BY table_id, rls_filter_id HAVING COUNT(*) > 1) g +UNION ALL SELECT 'slice_user', COUNT(*), SUM(c) - COUNT(*) + FROM (SELECT COUNT(*) c FROM slice_user GROUP BY user_id, slice_id HAVING COUNT(*) > 1) g +UNION ALL SELECT 'sqlatable_user', COUNT(*), SUM(c) - COUNT(*) + FROM (SELECT COUNT(*) c FROM sqlatable_user GROUP BY user_id, table_id HAVING COUNT(*) > 1) g +ORDER BY rows_dropped DESC NULLS LAST; +``` + +```sql +-- External-FK pre-flight check. +-- The migration runs the equivalent check at upgrade time and aborts +-- if any external FK references one of the soon-to-be-removed `id` +-- columns. Running it ahead of time lets you discover (and migrate) +-- any such reference before the maintenance window. On a stock +-- Superset install this should return zero rows. (Default schema +-- only; multi-schema deployments need to broaden the lookup.) +SELECT + rc.constraint_name, + kcu.table_schema || '.' 
|| kcu.table_name AS referencing_table, + kcu.column_name AS referencing_column, + ccu.table_name AS referenced_table, + ccu.column_name AS referenced_column +FROM information_schema.referential_constraints rc +JOIN information_schema.key_column_usage kcu + ON kcu.constraint_name = rc.constraint_name + AND kcu.constraint_schema = rc.constraint_schema +JOIN information_schema.constraint_column_usage ccu + ON ccu.constraint_name = rc.constraint_name + AND ccu.constraint_schema = rc.constraint_schema +WHERE ccu.table_name IN ( + 'dashboard_roles','dashboard_slices','dashboard_user', + 'report_schedule_user','rls_filter_roles','rls_filter_tables', + 'slice_user','sqlatable_user') + AND ccu.column_name = 'id'; +``` + +```sql +-- Lock-window estimate for the two full-rewrite tables. +-- recreate="always" takes ACCESS EXCLUSIVE on the table for the full +-- rewrite. Use heap size combined with your hardware's effective +-- write throughput (~100-200 MB/s on commodity SSD; faster on NVMe) +-- to size the maintenance window. The other six tables use direct +-- ALTER and are dominated by composite-index build time, typically +-- seconds for tables in the low millions of rows. +SELECT + c.relname AS table_name, + pg_size_pretty(pg_relation_size(c.oid)) AS heap_size, + pg_relation_size(c.oid) / 1024 / 1024 AS heap_size_mb, + ROUND(pg_relation_size(c.oid) / 1024 / 1024 / 100.0, 1) AS est_rewrite_seconds_at_100mbs +FROM pg_class c +WHERE c.relname IN ('dashboard_slices', 'report_schedule_user'); +``` + **Restoring an old `pg_dump` (or equivalent) against the new schema.** A dump taken before the migration includes `INSERT` statements that populate the now-removed `id` column. Restoring such a dump against the post-migration schema will fail. 
The supported workaround is to dump only the schema and reference data, then re-create the M:N associations from application data after restore — for example with `pg_dump --exclude-table-data` (or per-table `--exclude-table-data=dashboard_slices` etc.) for the eight junction tables, restore the rest, then run a one-shot script that re-INSERTs `(fk1, fk2)` pairs derived from your application export. Operators who need to restore an old dump verbatim should restore against a pre-migration Superset and then re-run the upgrade. **Intentional downgrade asymmetry.** The migration's `downgrade()` restores the surrogate `id` column and (for `dashboard_slices` and `report_schedule_user`) the original `UNIQUE (fk1, fk2)` constraint, but it does **not** restore the original `NULL`-allowed state on the FK columns — they remain `NOT NULL`. This is intentional: under SQLAlchemy's `secondary=` semantics, a `NULL` in either FK column of a junction table is meaningless (it cannot participate in either side of the relationship). Operators downgrading are not expected to need this restored. The asymmetry is documented for completeness so that round-trip schema diffs are not mistaken for migration bugs. From 1656006c72b41f678ad80a818a933ab49f01c911 Mon Sep 17 00:00:00 2001 From: Mike Bridge Date: Thu, 7 May 2026 10:58:21 -0600 Subject: [PATCH 10/14] docs(UPDATING): add MySQL-targeted maintenance-window queries (sc-105349) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirror of the PostgreSQL diagnostic queries added in 11148779ed, adapted for MySQL/InnoDB. One important difference: InnoDB rebuilds the clustered index on every PK change, so all eight tables undergo a full table rebuild on MySQL — not just the two that go through the explicit ``recreate="always"`` path. 
The lock-window estimate query is updated to cover all eight rather than just two, and the "migration_path" column makes the rebuild expectation explicit ("direct ALTER (still rebuilds InnoDB clustered index)"). Other notes: - ``information_schema.TABLES.TABLE_ROWS`` is an InnoDB estimate, analogous to PostgreSQL's ``reltuples``; documented inline. - ``KEY_COLUMN_USAGE`` carries both sides of the FK in a single row on MySQL, so the external-FK pre-flight check is simpler than the PostgreSQL version (no joins between three views). - The aggregated dedupe query is portable standard SQL; included verbatim for copy-paste convenience. Co-Authored-By: Claude Opus 4.7 (1M context) --- UPDATING.md | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/UPDATING.md b/UPDATING.md index 1e6050a353b8..fb2a0fa4a5cf 100644 --- a/UPDATING.md +++ b/UPDATING.md @@ -455,6 +455,95 @@ FROM pg_class c WHERE c.relname IN ('dashboard_slices', 'report_schedule_user'); ``` +**Sizing the maintenance window on MySQL.** Equivalent diagnostic queries for MySQL/InnoDB. One important difference from PostgreSQL: InnoDB rebuilds the clustered index on every PK change, so *all eight* tables undergo a full table rebuild on MySQL — not just the two that go through the explicit `recreate="always"` path. The lock-window estimate query below therefore covers all eight tables. + +```sql +-- Per-table size, row count, and which migration path each will take. +-- TABLE_ROWS is an InnoDB estimate (analogous to PostgreSQL's reltuples); +-- run SELECT COUNT(*) per table for an exact count if needed. 
+SELECT + TABLE_NAME AS table_name, + CASE WHEN TABLE_NAME IN ('dashboard_slices', 'report_schedule_user') + THEN 'recreate (explicit, drops UNIQUE)' + ELSE 'direct ALTER (still rebuilds InnoDB clustered index)' + END AS migration_path, + TABLE_ROWS AS estimated_rows, + CONCAT(ROUND((DATA_LENGTH + INDEX_LENGTH) / 1024 / 1024, 1), ' MB') AS total_size, + CONCAT(ROUND(DATA_LENGTH / 1024 / 1024, 1), ' MB') AS heap_size, + CONCAT(ROUND(INDEX_LENGTH / 1024 / 1024, 1), ' MB') AS index_size +FROM information_schema.TABLES +WHERE TABLE_SCHEMA = DATABASE() + AND TABLE_NAME IN ( + 'dashboard_roles', 'dashboard_slices', 'dashboard_user', + 'report_schedule_user', 'rls_filter_roles', 'rls_filter_tables', + 'slice_user', 'sqlatable_user' + ) +ORDER BY (DATA_LENGTH + INDEX_LENGTH) DESC; +``` + +```sql +-- Aggregated duplicate-row roll-up. Same SQL as the PostgreSQL version +-- (standard SQL); included here for copy-paste convenience. +SELECT 'dashboard_roles' AS t, COUNT(*) AS dup_groups, SUM(c) - COUNT(*) AS rows_dropped + FROM (SELECT COUNT(*) c FROM dashboard_roles GROUP BY dashboard_id, role_id HAVING COUNT(*) > 1) g +UNION ALL SELECT 'dashboard_slices', COUNT(*), SUM(c) - COUNT(*) + FROM (SELECT COUNT(*) c FROM dashboard_slices GROUP BY dashboard_id, slice_id HAVING COUNT(*) > 1) g +UNION ALL SELECT 'dashboard_user', COUNT(*), SUM(c) - COUNT(*) + FROM (SELECT COUNT(*) c FROM dashboard_user GROUP BY user_id, dashboard_id HAVING COUNT(*) > 1) g +UNION ALL SELECT 'report_schedule_user',COUNT(*), SUM(c) - COUNT(*) + FROM (SELECT COUNT(*) c FROM report_schedule_user GROUP BY user_id, report_schedule_id HAVING COUNT(*) > 1) g +UNION ALL SELECT 'rls_filter_roles', COUNT(*), SUM(c) - COUNT(*) + FROM (SELECT COUNT(*) c FROM rls_filter_roles GROUP BY role_id, rls_filter_id HAVING COUNT(*) > 1) g +UNION ALL SELECT 'rls_filter_tables', COUNT(*), SUM(c) - COUNT(*) + FROM (SELECT COUNT(*) c FROM rls_filter_tables GROUP BY table_id, rls_filter_id HAVING COUNT(*) > 1) g +UNION ALL SELECT 
'slice_user', COUNT(*), SUM(c) - COUNT(*) + FROM (SELECT COUNT(*) c FROM slice_user GROUP BY user_id, slice_id HAVING COUNT(*) > 1) g +UNION ALL SELECT 'sqlatable_user', COUNT(*), SUM(c) - COUNT(*) + FROM (SELECT COUNT(*) c FROM sqlatable_user GROUP BY user_id, table_id HAVING COUNT(*) > 1) g +ORDER BY rows_dropped DESC; +``` + +```sql +-- External-FK pre-flight check. KEY_COLUMN_USAGE on MySQL carries +-- both sides of the FK in a single row, so this is simpler than the +-- PostgreSQL version. Should return zero rows on a stock install. +SELECT + CONSTRAINT_NAME, + CONCAT(TABLE_SCHEMA, '.', TABLE_NAME) AS referencing_table, + COLUMN_NAME AS referencing_column, + REFERENCED_TABLE_NAME AS referenced_table, + REFERENCED_COLUMN_NAME AS referenced_column +FROM information_schema.KEY_COLUMN_USAGE +WHERE TABLE_SCHEMA = DATABASE() + AND REFERENCED_TABLE_NAME IN ( + 'dashboard_roles', 'dashboard_slices', 'dashboard_user', + 'report_schedule_user', 'rls_filter_roles', 'rls_filter_tables', + 'slice_user', 'sqlatable_user' + ) + AND REFERENCED_COLUMN_NAME = 'id'; +``` + +```sql +-- Lock-window estimate for ALL EIGHT tables (InnoDB rebuilds the +-- clustered index on PK change, so even "direct ALTER" is a rewrite). +-- ADD PRIMARY KEY is INPLACE but not LOCK=NONE — it allows concurrent +-- reads but blocks writes. Use heap size combined with your effective +-- rebuild throughput (~100-200 MB/s on commodity SSD; higher on NVMe). 
+SELECT + TABLE_NAME AS table_name, + CONCAT(ROUND(DATA_LENGTH / 1024 / 1024, 1), ' MB') AS heap_size, + ROUND(DATA_LENGTH / 1024 / 1024, 1) AS heap_size_mb, + ROUND(DATA_LENGTH / 1024 / 1024 / 100.0, 1) AS est_rewrite_seconds_at_100mbs +FROM information_schema.TABLES +WHERE TABLE_SCHEMA = DATABASE() + AND TABLE_NAME IN ( + 'dashboard_roles', 'dashboard_slices', 'dashboard_user', + 'report_schedule_user', 'rls_filter_roles', 'rls_filter_tables', + 'slice_user', 'sqlatable_user' + ) +ORDER BY DATA_LENGTH DESC; +``` + **Restoring an old `pg_dump` (or equivalent) against the new schema.** A dump taken before the migration includes `INSERT` statements that populate the now-removed `id` column. Restoring such a dump against the post-migration schema will fail. The supported workaround is to dump only the schema and reference data, then re-create the M:N associations from application data after restore — for example with `pg_dump --exclude-table-data` (or per-table `--exclude-table-data=dashboard_slices` etc.) for the eight junction tables, restore the rest, then run a one-shot script that re-INSERTs `(fk1, fk2)` pairs derived from your application export. Operators who need to restore an old dump verbatim should restore against a pre-migration Superset and then re-run the upgrade. **Intentional downgrade asymmetry.** The migration's `downgrade()` restores the surrogate `id` column and (for `dashboard_slices` and `report_schedule_user`) the original `UNIQUE (fk1, fk2)` constraint, but it does **not** restore the original `NULL`-allowed state on the FK columns — they remain `NOT NULL`. This is intentional: under SQLAlchemy's `secondary=` semantics, a `NULL` in either FK column of a junction table is meaningless (it cannot participate in either side of the relationship). Operators downgrading are not expected to need this restored. The asymmetry is documented for completeness so that round-trip schema diffs are not mistaken for migration bugs. 
From ab8bec7a37bfbc1b7232e565a1b35333aba219f2 Mon Sep 17 00:00:00 2001 From: Mike Bridge Date: Thu, 7 May 2026 11:41:53 -0600 Subject: [PATCH 11/14] build(docker): add MySQL compose override for dialect-swap evaluation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds ``docker-compose-mysql.yml``, a compose-override file that swaps the default Postgres metadata DB for MySQL 8 with one extra ``-f`` flag: docker compose -f docker-compose.yml -f docker-compose-mysql.yml up Useful for evaluating dialect-specific behaviour (e.g., the runtime cost of DDL migrations on a deployment whose production metadata DB is MySQL — the question raised by review feedback on this PR). Mirrors the connection settings used by CI's ``test-mysql`` shard: ``mysql+mysqldb`` dialect, charset ``utf8mb4`` with binary_prefix. Host port defaults to 13306 (configurable via ``DATABASE_PORT_MYSQL``) to avoid colliding with a native MySQL install on 3306. A separate volume (``db_home_mysql``) keeps MySQL data isolated from the Postgres ``db_home`` volume, so switching between the two with ``-f`` flag toggles doesn't corrupt either side. The Postgres-specific init scripts under ``docker/docker-entrypoint-initdb.d/`` are not mounted on the MySQL service (they are postgres-only). Examples / cypress fixtures still load via ``superset-init``'s post-startup steps, which run ``superset load-examples`` against whichever metadata DB is in use. Co-Authored-By: Claude Opus 4.7 (1M context) --- docker-compose-mysql.yml | 93 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 docker-compose-mysql.yml diff --git a/docker-compose-mysql.yml b/docker-compose-mysql.yml new file mode 100644 index 000000000000..4617eaaf0e2e --- /dev/null +++ b/docker-compose-mysql.yml @@ -0,0 +1,93 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Compose override that swaps the default Postgres metadata DB for MySQL 8. +# Useful for evaluating dialect-specific behaviour (e.g., DDL-migration +# cost on a deployment whose production metadata DB is MySQL). +# +# Usage: +# docker compose -f docker-compose.yml -f docker-compose-mysql.yml up +# docker compose -f docker-compose.yml -f docker-compose-mysql.yml down +# +# To switch back to Postgres, just drop the second `-f` flag — the MySQL +# data lives in a separate volume (`db_home_mysql`) so neither side is +# corrupted by switching dialects. +# +# Notes: +# - Mirrors the connection settings used by CI's `test-mysql` shard: +# dialect ``mysql+mysqldb``, charset utf8mb4 with binary_prefix. +# - Host port 13306 (configurable via DATABASE_PORT_MYSQL) to avoid +# colliding with a native MySQL install on 3306. +# - The Postgres-specific init scripts under +# docker/docker-entrypoint-initdb.d/ are not mounted (they are +# postgres-only); examples / cypress fixtures still load via +# `superset-init`'s post-startup steps. + +# Shared environment override applied to every Superset-side service that +# connects to the metadata DB. ``environment:`` takes precedence over the +# values inherited from the env_file in docker-compose.yml. 
+x-mysql-env: &mysql-env + DATABASE_DIALECT: mysql+mysqldb + DATABASE_HOST: db + DATABASE_PORT: "3306" + DATABASE_DB: superset + DATABASE_USER: superset + DATABASE_PASSWORD: superset + SQLALCHEMY_DATABASE_URI: "mysql+mysqldb://superset:superset@db:3306/superset?charset=utf8mb4&binary_prefix=true" + +services: + db: + image: mysql:8.0 + environment: + MYSQL_DATABASE: superset + MYSQL_USER: superset + MYSQL_PASSWORD: superset + MYSQL_ROOT_PASSWORD: root + ports: + - "127.0.0.1:${DATABASE_PORT_MYSQL:-13306}:3306" + volumes: + - db_home_mysql:/var/lib/mysql + command: + - --default-authentication-plugin=caching_sha2_password + - --character-set-server=utf8mb4 + - --collation-server=utf8mb4_0900_ai_ci + healthcheck: + test: ["CMD-SHELL", "mysqladmin ping -h localhost -uroot -proot --silent"] + interval: 5s + timeout: 5s + retries: 20 + + superset: + environment: *mysql-env + + superset-init: + environment: *mysql-env + + superset-worker: + environment: *mysql-env + + superset-worker-beat: + environment: *mysql-env + + superset-node: + environment: *mysql-env + + superset-tests-worker: + environment: *mysql-env + +volumes: + db_home_mysql: From 3c699c62d8fef82c9c55564bdd1f91d84b4c73e1 Mon Sep 17 00:00:00 2001 From: Mike Bridge Date: Thu, 7 May 2026 11:53:57 -0600 Subject: [PATCH 12/14] fix(docker): MySQL examples DB + EXAMPLES_PORT override (sc-105349) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix two follow-on issues reported when starting the dev stack with docker-compose-mysql.yml: 1. ``superset-init`` step 4 (load-examples) fails with ``MySQLdb.OperationalError: (2002, "Can't connect to server on 'db'")`` because the analytics-examples DB connection inherits ``EXAMPLES_PORT=5432`` (Postgres port) from ``docker/.env``. 
The override flipped ``DATABASE_DIALECT`` to ``mysql+mysqldb`` but left the EXAMPLES_* group on Postgres defaults, so the URI became ``mysql+mysqldb://examples:examples@db:5432/examples`` — MySQL container has no listener on 5432. Fix: add ``EXAMPLES_HOST/PORT/DB/USER/PASSWORD`` and a complete ``SUPERSET__SQLALCHEMY_EXAMPLES_URI`` to the ``mysql-env`` anchor. 2. The Postgres init scripts under ``docker/docker-entrypoint-initdb.d/`` (``cypress-init.sh``, ``examples-init.sh``) get mounted on the MySQL container too — compose merges volume lists. They invoke ``psql`` which doesn't exist in the MySQL image, abort with ``psql: command not found``, and prevent the ``examples`` DB from being created. Fix: add a MySQL-specific init script ``docker/mysql-init/examples-init.sql`` that creates the ``examples`` database and user, and mount it at ``/docker-entrypoint-initdb.d`` in the override. Compose's later-takes-precedence rule on duplicate volume targets displaces the Postgres init dir, so the MySQL container only sees the MySQL-compatible script. (Used a plain duplicate-target mount rather than the ``!override`` tag because pre-commit's ``check-yaml`` doesn't recognize Compose's custom YAML tags.) Recovery for an existing failed MySQL stack: ``docker compose -f docker-compose.yml -f docker-compose-mysql.yml down``, then ``docker volume rm superset_db_home_mysql`` (so the new init script runs on the next fresh boot), then ``up`` again. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- docker-compose-mysql.yml | 24 ++++++++++++++++++++++ docker/mysql-init/examples-init.sql | 32 +++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 docker/mysql-init/examples-init.sql diff --git a/docker-compose-mysql.yml b/docker-compose-mysql.yml index 4617eaaf0e2e..13f4c99236cb 100644 --- a/docker-compose-mysql.yml +++ b/docker-compose-mysql.yml @@ -48,6 +48,17 @@ x-mysql-env: &mysql-env DATABASE_USER: superset DATABASE_PASSWORD: superset SQLALCHEMY_DATABASE_URI: "mysql+mysqldb://superset:superset@db:3306/superset?charset=utf8mb4&binary_prefix=true" + # Override the analytics-examples DB connection too. ``EXAMPLES_PORT`` + # in docker/.env is hardcoded to 5432 (the Postgres port); without + # this override the examples connection would try MySQL on 5432 and + # fail. The examples user/DB are created by docker/mysql-init/ + # examples-init.sql on first MySQL boot. + EXAMPLES_HOST: db + EXAMPLES_PORT: "3306" + EXAMPLES_DB: examples + EXAMPLES_USER: examples + EXAMPLES_PASSWORD: examples + SUPERSET__SQLALCHEMY_EXAMPLES_URI: "mysql+mysqldb://examples:examples@db:3306/examples?charset=utf8mb4&binary_prefix=true" services: db: @@ -57,10 +68,23 @@ services: MYSQL_USER: superset MYSQL_PASSWORD: superset MYSQL_ROOT_PASSWORD: root + # The original 5432 port mapping is harmless on a MySQL container + # (nothing listens on 5432 inside it) but we add 13306->3306 so the + # MySQL port is reachable from the host without colliding with a + # native MySQL on 3306. Compose merges port lists. ports: - "127.0.0.1:${DATABASE_PORT_MYSQL:-13306}:3306" + # Override the init-scripts mount by re-binding the same target path + # to a MySQL-compatible directory. Compose merges volume lists by + # target path; later definitions win on conflict, so this displaces + # the Postgres-specific ``./docker/docker-entrypoint-initdb.d`` mount + # from docker-compose.yml. 
Without this, MySQL would try to run + # ``cypress-init.sh`` (which invokes ``psql``, not in the MySQL + # image), abort the init phase, and never create the ``examples`` + # database. Add the MySQL data volume separately. volumes: - db_home_mysql:/var/lib/mysql + - ./docker/mysql-init:/docker-entrypoint-initdb.d command: - --default-authentication-plugin=caching_sha2_password - --character-set-server=utf8mb4 diff --git a/docker/mysql-init/examples-init.sql b/docker/mysql-init/examples-init.sql new file mode 100644 index 000000000000..68dabe38671d --- /dev/null +++ b/docker/mysql-init/examples-init.sql @@ -0,0 +1,32 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- MySQL counterpart to docker/docker-entrypoint-initdb.d/examples-init.sh. +-- Creates the analytics-examples database and user that Superset's +-- ``load-examples`` command writes to. Mounted by docker-compose-mysql.yml +-- at /docker-entrypoint-initdb.d/ so the MySQL image's first-boot +-- entrypoint runs it automatically. (The Postgres init scripts under +-- docker/docker-entrypoint-initdb.d/ are NOT mounted on the MySQL +-- service — they invoke psql, which doesn't exist in the MySQL image.) 
+ +CREATE DATABASE IF NOT EXISTS examples + CHARACTER SET utf8mb4 + COLLATE utf8mb4_0900_ai_ci; + +CREATE USER IF NOT EXISTS 'examples'@'%' IDENTIFIED BY 'examples'; +GRANT ALL PRIVILEGES ON examples.* TO 'examples'@'%'; +FLUSH PRIVILEGES; From 283109f61af863e186b2d64d3620a90f30201015 Mon Sep 17 00:00:00 2001 From: Mike Bridge Date: Thu, 7 May 2026 13:35:59 -0600 Subject: [PATCH 13/14] build(scripts): add stress-test data generator for migration timing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add ``scripts/seed_junction_load.py``, a backend-agnostic script that bulk-inserts synthetic parent rows (dashboards, slices, users, roles, tables, dbs) and many-to-many junction rows for the four largest association tables targeted by the composite-PK migration: ``dashboard_slices``, ``slice_user``, ``dashboard_user``, ``dashboard_roles``. Designed for measuring migration runtime at varying scales — run with a series of size flags (100K / 1M / 5M / 10M for the target table) and time the migration at each scale to verify the predicted ``O(N log N)`` extrapolation against real numbers. Properties: - **Reproducible**: deterministic cross-product walk through parent IDs produces a stable pair sequence; re-running is replayable. - **Idempotent**: re-running with the same target is a no-op; with a higher target, only new rows are added. - **Backend-agnostic**: connects via Superset's standard ``DATABASE_*`` env vars (or ``SUPERSET__SQLALCHEMY_DATABASE_URI``). Branches on dialect for ``BINARY(16)`` vs ``UUID`` vs TEXT/BLOB UUID columns. - **Batched**: bulk INSERT 10K rows per statement. - **Per-phase timing**: logs elapsed wall time for the parents phase, the junctions phase as a whole, and per junction-table. - **Avoidance set**: loads existing junction pairs into a Python set so re-runs on top of pre-existing data don't collide on the uniqueness constraint. 
Usage (inside the Superset container): docker exec superset-superset-1 \\ /app/.venv/bin/python /app/scripts/seed_junction_load.py \\ --dashboard-slices 1000000 Defaults target a "large multi-team install" shape: 1M ``dashboard_slices``, 100K each ``slice_user`` / ``dashboard_user``, 10K ``dashboard_roles``. Override per-table via flags. Tested locally on MySQL (the user's current eval stack): - 200/100/100/50 row mini-run produced expected counts. - Re-running at the same target is a no-op (idempotent). - ``--dry-run`` plans without writing. Junction tables not yet covered (``sqlatable_user``, ``rls_filter_*``, ``report_schedule_user``) are typically small in production and require additional parent seeding (RLS filters, report schedules) that wasn't worth the scope here. Adding them is straightforward by extending ``JUNCTIONS`` and writing the corresponding parent seeder. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/seed_junction_load.py | 567 ++++++++++++++++++++++++++++++++++ 1 file changed, 567 insertions(+) create mode 100644 scripts/seed_junction_load.py diff --git a/scripts/seed_junction_load.py b/scripts/seed_junction_load.py new file mode 100644 index 000000000000..74a891c5035d --- /dev/null +++ b/scripts/seed_junction_load.py @@ -0,0 +1,567 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ---------------------------------------------------------------------- +# Stress-test data generator for the composite-PK migration (sc-105349). +# +# Bulk-inserts synthetic parent rows and many-to-many junction rows for +# the four largest of the eight association tables the migration touches. +# Useful for measuring migration runtime at varying scales — run this at +# 100K / 1M / 5M / 10M rows and time the migration at each scale to +# verify the O(N log N) extrapolation. +# +# Idempotent: rerunning with the same target is a no-op; rerunning with +# a higher target adds rows up to the new total. Batched bulk INSERTs +# (10K rows per statement) make it fast on Postgres, MySQL, and SQLite. +# +# Usage (inside the Superset container): +# +# docker exec superset-superset-1 \\ +# /app/.venv/bin/python /app/scripts/seed_junction_load.py \\ +# --dashboard-slices 1000000 \\ +# --slice-user 100000 \\ +# --dashboard-user 100000 +# +# Run with no flags for the defaults shown below. Use ``--dry-run`` to +# print the planned inserts without writing anything. +# +# The script connects via Superset's standard ``DATABASE_*`` env vars +# (or ``SUPERSET__SQLALCHEMY_DATABASE_URI`` if set), so it works +# automatically inside the Superset container regardless of which +# metadata DB backend is in use. 
+ +from __future__ import annotations + +import argparse +import logging +import os +import sys +import time +from contextlib import contextmanager +from typing import Iterator +from uuid import uuid4 + +import sqlalchemy as sa +from sqlalchemy.engine import Connection, Engine + +logger = logging.getLogger("seed_junction_load") + +# Bulk INSERT batch size. Larger values = fewer statements but more memory. +BATCH = 10_000 + +# Default per-junction-table target row counts. Tuned to mimic the shape +# of a large multi-team Superset install. Override via CLI flags. +DEFAULTS: dict[str, int] = { + "dashboard_slices": 1_000_000, + "slice_user": 100_000, + "dashboard_user": 100_000, + "dashboard_roles": 10_000, +} + +# (junction_table, fk1_col, fk2_col, parent1_table, parent2_table) +# parents reference id columns; we generate (fk1, fk2) pairs by sampling +# from the parents' existing IDs. +JUNCTIONS: list[tuple[str, str, str, str, str]] = [ + ("dashboard_slices", "dashboard_id", "slice_id", "dashboards", "slices"), + ("slice_user", "user_id", "slice_id", "ab_user", "slices"), + ("dashboard_user", "user_id", "dashboard_id", "ab_user", "dashboards"), + ("dashboard_roles", "dashboard_id", "role_id", "dashboards", "ab_role"), +] + + +# ---------------------------------------------------------------------- +# Connection setup +# ---------------------------------------------------------------------- + + +def build_engine() -> Engine: + """Build a SQLAlchemy engine from Superset env vars.""" + if uri := os.environ.get("SUPERSET__SQLALCHEMY_DATABASE_URI"): + logger.info("Using SUPERSET__SQLALCHEMY_DATABASE_URI from env") + return sa.create_engine(uri) + + try: + dialect = os.environ["DATABASE_DIALECT"] + user = os.environ["DATABASE_USER"] + password = os.environ["DATABASE_PASSWORD"] + host = os.environ["DATABASE_HOST"] + port = os.environ["DATABASE_PORT"] + db = os.environ["DATABASE_DB"] + except KeyError as exc: + sys.exit( + f"Missing env var {exc}; either set 
DATABASE_DIALECT/USER/PASSWORD/" + f"HOST/PORT/DB or SUPERSET__SQLALCHEMY_DATABASE_URI before running." + ) + + uri = f"{dialect}://{user}:{password}@{host}:{port}/{db}" + logger.info( + "Built URI from DATABASE_* env vars (dialect=%s, host=%s)", dialect, host + ) + return sa.create_engine(uri) + + +# ---------------------------------------------------------------------- +# Helpers +# ---------------------------------------------------------------------- + + +def uuid_value(dialect_name: str) -> bytes | str: + """Return a UUID in the form the active dialect expects. + + MySQL stores UUIDs as ``BINARY(16)`` (16 raw bytes); Postgres has a + native ``UUID`` type that accepts strings; SQLite stores them as + BLOB/TEXT and accepts either. Branching here keeps the seed script + backend-agnostic without depending on Superset's custom column types. + """ + if dialect_name.startswith("mysql"): + return uuid4().bytes + return str(uuid4()) + + +@contextmanager +def time_phase(name: str) -> Iterator[None]: + """Log elapsed wall time for a named phase.""" + start = time.monotonic() + logger.info("[%s] starting", name) + try: + yield + finally: + elapsed = time.monotonic() - start + logger.info("[%s] done in %.2fs", name, elapsed) + + +def count_rows(conn: Connection, table: str) -> int: + return conn.scalar(sa.text(f"SELECT COUNT(*) FROM {table}")) or 0 # noqa: S608 + + +def existing_ids(conn: Connection, table: str, limit: int | None = None) -> list[int]: + sql = f"SELECT id FROM {table} ORDER BY id" # noqa: S608 + if limit is not None: + sql += f" LIMIT {limit}" + return [row[0] for row in conn.execute(sa.text(sql))] + + +# ---------------------------------------------------------------------- +# Parent seeders +# +# Each function ensures the named parent table has at least ``target`` +# rows by inserting synthetic ones with minimal-but-valid columns. +# Returns nothing; subsequent code reads back IDs via ``existing_ids``. 
+# ---------------------------------------------------------------------- + + +def seed_dashboards(conn: Connection, target: int, dry_run: bool) -> None: + current = count_rows(conn, "dashboards") + if current >= target: + logger.info( + "dashboards: %d rows (target %d) — no insert needed", current, target + ) + return + needed = target - current + logger.info("dashboards: %d → %d (+%d)", current, target, needed) + if dry_run: + return + + dialect = conn.engine.dialect.name + sql = sa.text( + "INSERT INTO dashboards (uuid, dashboard_title, slug, published) " + "VALUES (:uuid, :title, :slug, :published)" + ) + for batch_start in range(0, needed, BATCH): + rows = [ + { + "uuid": uuid_value(dialect), + "title": f"seed_dashboard_{current + i}", + "slug": f"seed-dashboard-{current + i}-{uuid4().hex[:8]}", + "published": False, + } + for i in range(batch_start, min(batch_start + BATCH, needed)) + ] + conn.execute(sql, rows) + logger.info(" dashboards: inserted %d / %d", batch_start + len(rows), needed) + + +def seed_dbs(conn: Connection, dry_run: bool) -> int: + """Ensure at least one row exists in ``dbs`` (parent of ``tables``). 
+ Returns the id to use as ``database_id`` when seeding ``tables``.""" + ids = existing_ids(conn, "dbs", limit=1) + if ids: + return ids[0] + if dry_run: + return -1 # placeholder + dialect = conn.engine.dialect.name + logger.info("dbs: inserting one synthetic database (no rows present)") + conn.execute( + sa.text( + "INSERT INTO dbs (uuid, database_name, sqlalchemy_uri, expose_in_sqllab) " + "VALUES (:uuid, :name, :uri, :expose)" + ), + { + "uuid": uuid_value(dialect), + "name": f"seed_db_{uuid4().hex[:8]}", + "uri": "sqlite:///seed.db", + "expose": False, + }, + ) + return existing_ids(conn, "dbs", limit=1)[0] + + +def seed_tables(conn: Connection, target: int, dry_run: bool) -> None: + current = count_rows(conn, "tables") + if current >= target: + logger.info("tables: %d rows (target %d) — no insert needed", current, target) + return + needed = target - current + logger.info("tables: %d → %d (+%d)", current, target, needed) + if dry_run: + return + + database_id = seed_dbs(conn, dry_run=False) + dialect = conn.engine.dialect.name + sql = sa.text( + "INSERT INTO tables (uuid, table_name, database_id) " + "VALUES (:uuid, :name, :db_id)" + ) + for batch_start in range(0, needed, BATCH): + rows = [ + { + "uuid": uuid_value(dialect), + "name": f"seed_table_{current + i}", + "db_id": database_id, + } + for i in range(batch_start, min(batch_start + BATCH, needed)) + ] + conn.execute(sql, rows) + logger.info(" tables: inserted %d / %d", batch_start + len(rows), needed) + + +def seed_slices(conn: Connection, target: int, dry_run: bool) -> None: + current = count_rows(conn, "slices") + if current >= target: + logger.info("slices: %d rows (target %d) — no insert needed", current, target) + return + needed = target - current + logger.info("slices: %d → %d (+%d)", current, target, needed) + if dry_run: + return + + # Slices reference tables.id; ensure at least one ``tables`` row exists + # so the FK is satisfiable (datasource_id is nullable but we set it for + # realism). 
The migration test doesn't care, but a real Superset that + # re-renders these slices does. + seed_tables(conn, target=1, dry_run=False) + table_id = existing_ids(conn, "tables", limit=1)[0] + dialect = conn.engine.dialect.name + sql = sa.text( + "INSERT INTO slices " + "(uuid, slice_name, datasource_id, datasource_type, viz_type) " + "VALUES (:uuid, :name, :ds_id, :ds_type, :viz)" + ) + for batch_start in range(0, needed, BATCH): + rows = [ + { + "uuid": uuid_value(dialect), + "name": f"seed_slice_{current + i}", + "ds_id": table_id, + "ds_type": "table", + "viz": "table", + } + for i in range(batch_start, min(batch_start + BATCH, needed)) + ] + conn.execute(sql, rows) + logger.info(" slices: inserted %d / %d", batch_start + len(rows), needed) + + +def seed_users(conn: Connection, target: int, dry_run: bool) -> None: + current = count_rows(conn, "ab_user") + if current >= target: + logger.info("ab_user: %d rows (target %d) — no insert needed", current, target) + return + needed = target - current + logger.info("ab_user: %d → %d (+%d)", current, target, needed) + if dry_run: + return + + sql = sa.text( + "INSERT INTO ab_user (first_name, last_name, username, email, active) " + "VALUES (:first, :last, :username, :email, :active)" + ) + for batch_start in range(0, needed, BATCH): + rows = [ + { + "first": "seed", + "last": f"user_{current + i}", + "username": f"seed_user_{current + i}_{uuid4().hex[:8]}", + "email": f"seed_user_{current + i}_{uuid4().hex[:8]}@example.invalid", + "active": True, + } + for i in range(batch_start, min(batch_start + BATCH, needed)) + ] + conn.execute(sql, rows) + logger.info(" ab_user: inserted %d / %d", batch_start + len(rows), needed) + + +def seed_roles(conn: Connection, target: int, dry_run: bool) -> None: + current = count_rows(conn, "ab_role") + if current >= target: + logger.info("ab_role: %d rows (target %d) — no insert needed", current, target) + return + needed = target - current + logger.info("ab_role: %d → %d (+%d)", current, 
target, needed) + if dry_run: + return + + sql = sa.text("INSERT INTO ab_role (name) VALUES (:name)") + for batch_start in range(0, needed, BATCH): + rows = [ + {"name": f"seed_role_{current + i}_{uuid4().hex[:8]}"} + for i in range(batch_start, min(batch_start + BATCH, needed)) + ] + conn.execute(sql, rows) + logger.info(" ab_role: inserted %d / %d", batch_start + len(rows), needed) + + +# ---------------------------------------------------------------------- +# Junction seeder +# ---------------------------------------------------------------------- + + +def _load_existing_pairs( + conn: Connection, junction: str, fk1_col: str, fk2_col: str +) -> set[tuple[int, int]]: + """Load existing ``(fk1, fk2)`` pairs from a junction table into a set. + + Used so the seeder can skip them when generating new pairs (junction + tables enforce uniqueness on the FK pair). Memory is ~32 bytes/tuple + on CPython, so 10M existing pairs is ~320MB — acceptable for a dev + machine. The junction / column names come from ``JUNCTIONS``, not + user input, so the f-string interpolation is safe. + """ + sql_text = f"SELECT {fk1_col}, {fk2_col} FROM {junction}" # noqa: S608 + return {(row[0], row[1]) for row in conn.execute(sa.text(sql_text))} + + +def _generate_new_pairs( + p1_ids: list[int], + p2_ids: list[int], + existing_pairs: set[tuple[int, int]], +) -> Iterator[tuple[int, int]]: + """Yield ``(fk1, fk2)`` pairs from the parent1 × parent2 cross-product + that are not already in ``existing_pairs``.""" + for fk1 in p1_ids: + for fk2 in p2_ids: + if (fk1, fk2) not in existing_pairs: + yield (fk1, fk2) + + +def seed_junction( + conn: Connection, + junction: str, + fk1_col: str, + fk2_col: str, + parent1: str, + parent2: str, + target: int, + dry_run: bool, +) -> None: + """Bulk-insert junction rows up to ``target`` rows total. + + Generates ``(fk1, fk2)`` pairs by walking the cross-product of + parent1 IDs × parent2 IDs in row-major order, skipping pairs that + already exist. 
Walking the cross-product deterministically keeps + the script replayable: re-running with the same target is a no-op, + and re-running with a higher target appends new pairs in a stable + order regardless of how many runs preceded. + """ + current = count_rows(conn, junction) + if current >= target: + logger.info( + "%s: %d rows (target %d) — no insert needed", junction, current, target + ) + return + needed = target - current + logger.info("%s: %d → %d (+%d)", junction, current, target, needed) + if dry_run: + return + + p1_ids = existing_ids(conn, parent1) + p2_ids = existing_ids(conn, parent2) + max_pairs = len(p1_ids) * len(p2_ids) + if max_pairs < target: + sys.exit( + f"Cannot reach {target} rows in {junction}: " + f"only {max_pairs} unique pairs available " + f"({len(p1_ids)} × {len(p2_ids)}). " + f"Increase parent targets and rerun." + ) + + existing_pairs: set[tuple[int, int]] = ( + _load_existing_pairs(conn, junction, fk1_col, fk2_col) if current > 0 else set() + ) + if existing_pairs: + logger.info( + " %s: loaded %d existing pairs into avoidance set", + junction, + len(existing_pairs), + ) + + insert_sql = sa.text( + f"INSERT INTO {junction} ({fk1_col}, {fk2_col}) " # noqa: S608 + f"VALUES (:fk1, :fk2)" + ) + + inserted = 0 + batch: list[dict[str, int]] = [] + for fk1, fk2 in _generate_new_pairs(p1_ids, p2_ids, existing_pairs): + batch.append({"fk1": fk1, "fk2": fk2}) + inserted += 1 + if len(batch) == BATCH or inserted == needed: + conn.execute(insert_sql, batch) + logger.info(" %s: inserted %d / %d", junction, inserted, needed) + batch = [] + if inserted == needed: + return + if inserted < needed: + sys.exit( + f"Ran out of unique pairs at {inserted}/{needed} for {junction} " + f"(parents have {len(p1_ids)} × {len(p2_ids)} = {max_pairs} pairs, " + f"{len(existing_pairs)} already present)" + ) + + +# ---------------------------------------------------------------------- +# Orchestration +# 
---------------------------------------------------------------------- + + +def required_parent_count(target_pairs: int, other_parent: int) -> int: + """How many rows we need in this parent so that + (this_parent × other_parent) ≥ target_pairs.""" + if other_parent == 0: + # Bootstrapping: assume we'll create at least 1 + other_parent = 1 + return -(-target_pairs // other_parent) # ceil(target_pairs / other_parent) + + +def _compute_parent_requirements(targets: dict[str, int]) -> dict[str, int]: + """For each parent table, return the minimum row count needed so that + parent1 × parent2 ≥ target for every junction it participates in. + + Allocates ceil(sqrt(target)) rows per parent, balanced across the two + parents of each junction. The actual junction seeder will then walk + the cross-product to produce the target number of unique pairs. + """ + parent_req: dict[str, int] = {} + for junction, _, _, p1, p2 in JUNCTIONS: + target = targets.get(junction, 0) + if target == 0: + continue + sqrt_n = int(target**0.5) + 1 + parent_req[p1] = max(parent_req.get(p1, 0), sqrt_n) + parent_req[p2] = max(parent_req.get(p2, 0), sqrt_n) + return parent_req + + +def _seed_parents(conn: Connection, parent_req: dict[str, int], dry_run: bool) -> None: + """Seed parent tables in dependency order: + independent parents (ab_user, ab_role) first, then dashboards / slices / + tables (which transitively depend on dbs, seeded inside seed_tables).""" + if "ab_user" in parent_req: + seed_users(conn, parent_req["ab_user"], dry_run) + if "ab_role" in parent_req: + seed_roles(conn, parent_req["ab_role"], dry_run) + if "dashboards" in parent_req: + seed_dashboards(conn, parent_req["dashboards"], dry_run) + if "slices" in parent_req: + seed_slices(conn, parent_req["slices"], dry_run) + if "tables" in parent_req: + seed_tables(conn, parent_req["tables"], dry_run) + + +def _seed_all_junctions( + conn: Connection, targets: dict[str, int], dry_run: bool +) -> None: + for junction, fk1, fk2, p1, p2 in 
JUNCTIONS: + target = targets.get(junction, 0) + if target == 0: + continue + with time_phase(f"junction:{junction}"): + seed_junction(conn, junction, fk1, fk2, p1, p2, target, dry_run) + + +def run(targets: dict[str, int], dry_run: bool) -> None: + engine = build_engine() + with engine.begin() as conn: + parent_req = _compute_parent_requirements(targets) + logger.info("Required parent row counts: %s", parent_req) + + with time_phase("parents"): + _seed_parents(conn, parent_req, dry_run) + + with time_phase("junctions"): + _seed_all_junctions(conn, targets, dry_run) + + +# ---------------------------------------------------------------------- +# CLI +# ---------------------------------------------------------------------- + + +def main() -> None: + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + for table, default in DEFAULTS.items(): + parser.add_argument( + f"--{table.replace('_', '-')}", + type=int, + default=default, + help=f"target row count for {table} (default: {default:,})", + ) + parser.add_argument( + "--dry-run", + "-n", + action="store_true", + help="print planned inserts without writing to the DB", + ) + parser.add_argument( + "--verbose", + "-v", + action="store_true", + help="increase log verbosity", + ) + args = parser.parse_args() + + logging.basicConfig( + level=logging.DEBUG if args.verbose else logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%H:%M:%S", + ) + + targets = {table: getattr(args, table) for table in DEFAULTS} + + logger.info("Targets: %s", targets) + logger.info("Dry run: %s", args.dry_run) + + with time_phase("total"): + run(targets, dry_run=args.dry_run) + + +if __name__ == "__main__": + main() From 10c36ac4cf6e888004736968158f4d6398e7858a Mon Sep 17 00:00:00 2001 From: Mike Bridge Date: Thu, 7 May 2026 14:17:03 -0600 Subject: [PATCH 14/14] feat(scripts): add --dirty-duplicates-pct to seed_junction_load.py MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends the stress-test seed script with an optional duplicate-row injection step, used to measure the empirical cost of the migration's ``_dedupe_by_min_id`` phase. Usage: after running the normal seed at a given scale, add ``--dirty-duplicates-pct 5`` (or any non-zero value) to inject that percentage of duplicate ``(fk1, fk2)`` rows into each non-UNIQUE junction (slice_user, dashboard_user, dashboard_roles — dashboard_slices is skipped because its UNIQUE constraint, present both pre- and post-migration, rejects duplicates). Pre-condition: requires the DB to be at the pre-migration revision (33d7e0e21daa). The post-migration composite PK rejects duplicates, so attempting to inject on the upgraded schema errors out. Empirical result on MySQL @ 10M dashboard_slices + ~2.1M other junction rows + 105K injected duplicates (5% on the 3 non-UNIQUE tables): Upgrade time: 1m 36s vs clean baseline 1m 37s → dedupe cost is within measurement noise; the table-scan that the migration already performs dominates whether or not duplicates exist. This empirically confirms what the cost-model predicted: the ``_dedupe_by_min_id`` GROUP BY scan is the dominant cost of that phase, and the actual per-duplicate DELETE is negligible. NULL-FK injection deliberately skipped — would require altering the six non-UNIQUE FK columns from NOT NULL back to nullable (the migration's downgrade keeps them NOT NULL by design), which adds per-backend ALTER complexity for a code path that's structurally identical in cost shape (DELETE WHERE col IS NULL is the same scan shape as the dedupe scan). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/seed_junction_load.py | 119 +++++++++++++++++++++++++++++++++- 1 file changed, 117 insertions(+), 2 deletions(-) diff --git a/scripts/seed_junction_load.py b/scripts/seed_junction_load.py index 74a891c5035d..cc42a6bfce9c 100644 --- a/scripts/seed_junction_load.py +++ b/scripts/seed_junction_load.py @@ -83,6 +83,11 @@ ("dashboard_roles", "dashboard_id", "role_id", "dashboards", "ab_role"), ] +# Junction tables that originally carried ``UNIQUE(fk1, fk2)`` and therefore +# cannot accept duplicate ``(fk1, fk2)`` pairs even on the pre-migration +# (downgrade) schema. The other JUNCTIONS allow duplicates pre-migration. +JUNCTIONS_WITH_UNIQUE: set[str] = {"dashboard_slices", "report_schedule_user"} + # ---------------------------------------------------------------------- # Connection setup @@ -504,7 +509,95 @@ def _seed_all_junctions( seed_junction(conn, junction, fk1, fk2, p1, p2, target, dry_run) -def run(targets: dict[str, int], dry_run: bool) -> None: +def inject_duplicates( + conn: Connection, + junction: str, + fk1_col: str, + fk2_col: str, + pct: float, + dry_run: bool, +) -> None: + """Insert duplicate ``(fk1, fk2)`` rows on a non-UNIQUE junction table. + + Used to stress-test the migration's ``_dedupe_by_min_id`` phase, which + is otherwise a no-op on cleanly-seeded data. Computes ``count = + current_rows * pct / 100`` and inserts that many rows by re-sampling + existing ``(fk1, fk2)`` pairs in row-major order. The synthetic + duplicates land on top of distinct existing pairs (one duplicate per + distinct pair, then wraps), so the migration's dedupe finds and + deletes them. + + **Pre-condition: the table must NOT have UNIQUE on (fk1, fk2)**, i.e., + the schema must be the pre-migration shape (after running + ``superset db downgrade``). On the post-migration schema the composite + PK rejects duplicates and this function will error. 
+ """ + if pct == 0: + return + current = count_rows(conn, junction) + count = int(current * pct / 100) + if count == 0: + logger.info( + "%s: 0 duplicates to inject (current=%d, pct=%g)", + junction, + current, + pct, + ) + return + logger.info( + "%s: injecting %d duplicate rows (%g%% of %d existing)", + junction, + count, + pct, + current, + ) + if dry_run: + return + + select_sql = sa.text( + f"SELECT {fk1_col}, {fk2_col} FROM {junction} ORDER BY id LIMIT :n" # noqa: S608 + ) + sample = conn.execute(select_sql, {"n": count}).fetchall() + if not sample: + logger.warning("%s: no rows to duplicate (table is empty)", junction) + return + + insert_sql = sa.text( + f"INSERT INTO {junction} ({fk1_col}, {fk2_col}) " # noqa: S608 + f"VALUES (:fk1, :fk2)" + ) + inserted = 0 + while inserted < count: + batch: list[dict[str, int]] = [] + while len(batch) < BATCH and inserted < count: + row = sample[inserted % len(sample)] + batch.append({"fk1": row[0], "fk2": row[1]}) + inserted += 1 + conn.execute(insert_sql, batch) + logger.info(" %s: injected %d / %d duplicates", junction, inserted, count) + + +def _inject_dirty_data(conn: Connection, dirty_pct: float, dry_run: bool) -> None: + """Inject duplicate rows on every non-UNIQUE seeded junction. + + The two tables that originally carried ``UNIQUE(fk1, fk2)`` are + skipped because their composite-PK successor (and their pre-migration + UNIQUE constraint) both reject duplicate inserts. 
+ """ + if dirty_pct == 0: + return + for junction, fk1, fk2, _, _ in JUNCTIONS: + if junction in JUNCTIONS_WITH_UNIQUE: + logger.info( + "%s: skipping duplicate injection (table has UNIQUE on FK pair)", + junction, + ) + continue + with time_phase(f"dirty:{junction}"): + inject_duplicates(conn, junction, fk1, fk2, dirty_pct, dry_run) + + +def run(targets: dict[str, int], dry_run: bool, dirty_duplicates_pct: float) -> None: engine = build_engine() with engine.begin() as conn: parent_req = _compute_parent_requirements(targets) @@ -516,6 +609,10 @@ def run(targets: dict[str, int], dry_run: bool) -> None: with time_phase("junctions"): _seed_all_junctions(conn, targets, dry_run) + if dirty_duplicates_pct > 0: + with time_phase("dirty-duplicates"): + _inject_dirty_data(conn, dirty_duplicates_pct, dry_run) + # ---------------------------------------------------------------------- # CLI @@ -540,6 +637,19 @@ def main() -> None: action="store_true", help="print planned inserts without writing to the DB", ) + parser.add_argument( + "--dirty-duplicates-pct", + type=float, + default=0, + help=( + "after seeding distinct pairs, inject this percentage of duplicate " + "rows on each non-UNIQUE junction (slice_user, dashboard_user, " + "dashboard_roles). Stress-tests the migration's _dedupe_by_min_id " + "phase. Requires the DB to be at the pre-migration revision " + "(33d7e0e21daa) — the post-migration composite PK rejects " + "duplicates and this will error. Default: 0 (no duplicates)." + ), + ) parser.add_argument( "--verbose", "-v", @@ -558,9 +668,14 @@ def main() -> None: logger.info("Targets: %s", targets) logger.info("Dry run: %s", args.dry_run) + logger.info("Dirty duplicates pct: %g", args.dirty_duplicates_pct) with time_phase("total"): - run(targets, dry_run=args.dry_run) + run( + targets, + dry_run=args.dry_run, + dirty_duplicates_pct=args.dirty_duplicates_pct, + ) if __name__ == "__main__":