From 8e6c7b74f00db3e80f0fd55acdcdd743b99f308b Mon Sep 17 00:00:00 2001 From: Dmitrii Andreev Date: Fri, 15 May 2026 13:05:30 -0500 Subject: [PATCH 1/2] HYPERFLEET-862 - feat: add Tier 2 E2E tests for deletion edge cases --- e2e/cluster/stuck_deletion.go | 228 ++++++++++++ pkg/helper/adapter.go | 5 + test-design/README.md | 3 +- test-design/testcases/delete-cluster.md | 336 +----------------- .../testcases/update-delete-test-matrix.md | 14 +- .../cl-stuck/adapter-config.yaml | 21 ++ .../cl-stuck/adapter-task-config.yaml | 112 ++++++ testdata/adapter-configs/cl-stuck/values.yaml | 31 ++ 8 files changed, 407 insertions(+), 343 deletions(-) create mode 100644 e2e/cluster/stuck_deletion.go create mode 100644 testdata/adapter-configs/cl-stuck/adapter-config.yaml create mode 100644 testdata/adapter-configs/cl-stuck/adapter-task-config.yaml create mode 100644 testdata/adapter-configs/cl-stuck/values.yaml diff --git a/e2e/cluster/stuck_deletion.go b/e2e/cluster/stuck_deletion.go new file mode 100644 index 0000000..305a0b5 --- /dev/null +++ b/e2e/cluster/stuck_deletion.go @@ -0,0 +1,228 @@ +package cluster + +import ( + "context" + "net/http" + "os" + + "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" //nolint:staticcheck // dot import for test readability + + "github.com/openshift-hyperfleet/hyperfleet-e2e/pkg/api/openapi" + "github.com/openshift-hyperfleet/hyperfleet-e2e/pkg/client" + "github.com/openshift-hyperfleet/hyperfleet-e2e/pkg/helper" + "github.com/openshift-hyperfleet/hyperfleet-e2e/pkg/labels" +) + +var _ = ginkgo.Describe("[Suite: cluster][negative] Stuck Deletion -- Adapter Unable to Finalize Prevents Hard-Delete", + ginkgo.Serial, + ginkgo.Label(labels.Tier2, labels.Negative), + func() { + var ( + h *helper.Helper + adapterChartPath string + apiChartPath string + baseDeployOpts helper.AdapterDeploymentOptions + ) + + ginkgo.BeforeEach(func(ctx context.Context) { + h = helper.New() + + ginkgo.By("Clone adapter Helm chart repository") + var cleanupAdapterChart func() error + var err error + adapterChartPath, cleanupAdapterChart, err = h.CloneHelmChart(ctx, helper.HelmChartCloneOptions{ + Component: "adapter", + RepoURL: h.Cfg.AdapterDeployment.ChartRepo, + Ref: h.Cfg.AdapterDeployment.ChartRef, + ChartPath: h.Cfg.AdapterDeployment.ChartPath, + WorkDir: helper.TestWorkDir, + }) + Expect(err).NotTo(HaveOccurred(), "failed to clone adapter Helm chart") + + ginkgo.DeferCleanup(func(ctx context.Context) { + ginkgo.By("Cleanup cloned adapter Helm chart") + if err := cleanupAdapterChart(); err != nil { + ginkgo.GinkgoWriter.Printf("Warning: failed to cleanup adapter chart: %v\n", err) + } + }) + + ginkgo.By("Clone API Helm chart repository") + var cleanupAPIChart func() error + apiChartPath, cleanupAPIChart, err = h.CloneHelmChart(ctx, helper.HelmChartCloneOptions{ + Component: "api", + RepoURL: h.Cfg.APIDeployment.ChartRepo, + Ref: h.Cfg.APIDeployment.ChartRef, + ChartPath: h.Cfg.APIDeployment.ChartPath, + WorkDir: helper.TestWorkDir, + }) + Expect(err).NotTo(HaveOccurred(), "failed to clone API Helm chart") + + ginkgo.DeferCleanup(func(ctx context.Context) { + ginkgo.By("Cleanup cloned API Helm chart") + if err := cleanupAPIChart(); err != nil { + ginkgo.GinkgoWriter.Printf("Warning: failed to cleanup API chart: %v\n", err) + } + }) + + baseDeployOpts = helper.AdapterDeploymentOptions{ + Namespace: h.Cfg.Namespace, + ChartPath: adapterChartPath, + } + }) + + ginkgo.It("should prevent hard-delete when an adapter cannot finalize", + func(ctx context.Context) { + adapterName := 
"cl-stuck" + + err := os.Setenv("ADAPTER_NAME", adapterName) + Expect(err).NotTo(HaveOccurred(), "failed to set ADAPTER_NAME environment variable") + ginkgo.DeferCleanup(func() { + _ = os.Unsetenv("ADAPTER_NAME") + }) + + releaseName := helper.GenerateAdapterReleaseName(helper.ResourceTypeClusters, adapterName) + + ginkgo.By("Deploy dedicated stuck-adapter") + deployOpts := baseDeployOpts + deployOpts.ReleaseName = releaseName + deployOpts.AdapterName = adapterName + + err = h.DeployAdapter(ctx, deployOpts) + ginkgo.DeferCleanup(func(ctx context.Context) { + ginkgo.By("Uninstall stuck-adapter") + if err := h.UninstallAdapter(ctx, releaseName, h.Cfg.Namespace); err != nil { + ginkgo.GinkgoWriter.Printf("Warning: failed to uninstall adapter %s: %v\n", releaseName, err) + } + + ginkgo.By("Clean up Pub/Sub subscription") + subscriptionID := h.Cfg.Namespace + "-" + helper.ResourceTypeClusters + "-" + adapterName + if err := h.DeletePubSubSubscription(ctx, subscriptionID); err != nil { + ginkgo.GinkgoWriter.Printf("Warning: failed to delete Pub/Sub subscription %s: %v\n", subscriptionID, err) + } + }) + Expect(err).NotTo(HaveOccurred(), "failed to deploy stuck-adapter") + ginkgo.GinkgoWriter.Printf("Deployed stuck-adapter: release=%s\n", releaseName) + + ginkgo.By("Upgrade API to add stuck-adapter to required adapters") + originalAdapters := h.GetAPIRequiredClusterAdapters() + updatedAdapters := append(append([]string{}, originalAdapters...), adapterName) + + // Register API config restore AFTER adapter cleanup registration (LIFO → executes FIRST) + ginkgo.DeferCleanup(func(ctx context.Context) { + ginkgo.By("Restore API required adapters to original config") + if err := h.RestoreAPIRequiredAdaptersWithRetry(ctx, apiChartPath, h.Cfg.Namespace, originalAdapters, 3); err != nil { + ginkgo.GinkgoWriter.Printf("CRITICAL: %v\n", err) + } + }) + + err = h.UpgradeAPIRequiredAdapters(ctx, apiChartPath, h.Cfg.Namespace, updatedAdapters) + Expect(err).NotTo(HaveOccurred(), "failed to upgrade API with stuck-adapter in required adapters") + + deploymentName, err := h.GetDeploymentName(ctx, h.Cfg.Namespace, releaseName) + Expect(err).NotTo(HaveOccurred(), "failed to find stuck-adapter deployment name") + + ginkgo.By("Create cluster and wait for Reconciled with all adapters including stuck-adapter") + cluster, err := h.Client.CreateClusterFromPayload(ctx, h.TestDataPath("payloads/clusters/cluster-request.json")) + Expect(err).NotTo(HaveOccurred(), "failed to create cluster") + Expect(cluster.Id).NotTo(BeNil(), "cluster ID should be generated") + clusterID := *cluster.Id + ginkgo.GinkgoWriter.Printf("Created cluster ID: %s, Name: %s\n", clusterID, cluster.Name) + + ginkgo.DeferCleanup(func(ctx context.Context) { + ginkgo.By("Cleanup test cluster " + clusterID) + if err := h.CleanupTestCluster(ctx, clusterID); err != nil { + ginkgo.GinkgoWriter.Printf("Warning: failed to cleanup cluster %s: %v\n", clusterID, err) + } + }) + + Eventually(h.PollCluster(ctx, clusterID), h.Cfg.Timeouts.Cluster.Reconciled, h.Cfg.Polling.Interval). 
+ Should(helper.HaveResourceCondition(client.ConditionTypeReconciled, openapi.ResourceConditionStatusTrue)) + + ginkgo.By("Verify stuck-adapter reported Applied=True") + Eventually(func(g Gomega) { + statuses, err := h.Client.GetClusterStatuses(ctx, clusterID) + g.Expect(err).NotTo(HaveOccurred(), "failed to get cluster statuses") + + var found bool + for _, s := range statuses.Items { + if s.Adapter == adapterName { + found = true + g.Expect(h.HasAdapterCondition(s.Conditions, + client.ConditionTypeApplied, openapi.AdapterConditionStatusTrue)).To(BeTrue(), + "stuck-adapter should have Applied=True before scale-down") + break + } + } + g.Expect(found).To(BeTrue(), "stuck-adapter should be present in statuses") + }, h.Cfg.Timeouts.Adapter.Processing, h.Cfg.Polling.Interval).Should(Succeed()) + + ginkgo.By("Scale down stuck-adapter to simulate unavailability") + err = h.ScaleDeployment(ctx, h.Cfg.Namespace, deploymentName, 0) + Expect(err).NotTo(HaveOccurred(), "failed to scale down stuck-adapter") + + ginkgo.By("Soft-delete the cluster") + deletedCluster, err := h.Client.DeleteCluster(ctx, clusterID) + Expect(err).NotTo(HaveOccurred(), "DELETE request should succeed with 202") + Expect(deletedCluster.DeletedTime).NotTo(BeNil(), "soft-deleted cluster should have deleted_time set") + + ginkgo.By("Wait for healthy adapters to report Finalized=True") + Eventually(func(g Gomega) { + statuses, err := h.Client.GetClusterStatuses(ctx, clusterID) + g.Expect(err).NotTo(HaveOccurred(), "failed to get cluster statuses") + + adapterMap := make(map[string]openapi.AdapterStatus, len(statuses.Items)) + for _, s := range statuses.Items { + adapterMap[s.Adapter] = s + } + + for _, name := range originalAdapters { + adapter, exists := adapterMap[name] + g.Expect(exists).To(BeTrue(), "adapter %s should be present", name) + g.Expect(h.HasAdapterCondition(adapter.Conditions, + client.ConditionTypeFinalized, openapi.AdapterConditionStatusTrue)).To(BeTrue(), + "adapter %s should have Finalized=True", name) + } + }, h.Cfg.Timeouts.Adapter.Processing, h.Cfg.Polling.Interval).Should(Succeed()) + + ginkgo.By("Verify cluster remains stuck in soft-deleted state (not hard-deleted)") + Consistently(func(g Gomega) { + cl, err := h.Client.GetCluster(ctx, clusterID) + g.Expect(err).NotTo(HaveOccurred(), "cluster should still be accessible (not hard-deleted)") + g.Expect(cl.DeletedTime).NotTo(BeNil(), "cluster should still be soft-deleted") + + g.Expect(h.HasResourceCondition(cl.Status.Conditions, + client.ConditionTypeReconciled, openapi.ResourceConditionStatusFalse)).To(BeTrue(), + "cluster Reconciled should remain False while stuck-adapter is unavailable") + + statuses, err := h.Client.GetClusterStatuses(ctx, clusterID) + g.Expect(err).NotTo(HaveOccurred(), "failed to get cluster statuses") + + for _, s := range statuses.Items { + if s.Adapter == adapterName { + g.Expect(h.HasAdapterCondition(s.Conditions, + client.ConditionTypeFinalized, openapi.AdapterConditionStatusTrue)).To(BeFalse(), + "stuck-adapter should NOT have Finalized=True while scaled to 0") + break + } + } + }, h.Cfg.Timeouts.Adapter.Processing/2, h.Cfg.Polling.Interval).Should(Succeed()) + + ginkgo.GinkgoWriter.Printf("Verified: cluster stuck in soft-deleted state, healthy adapters finalized but stuck-adapter has not\n") + + ginkgo.By("Restore stuck-adapter by scaling up") + err = h.ScaleDeployment(ctx, h.Cfg.Namespace, deploymentName, 1) + Expect(err).NotTo(HaveOccurred(), "failed to scale up stuck-adapter") + + ginkgo.By("Verify cluster is hard-deleted 
after stuck-adapter recovery") + Eventually(h.PollClusterHTTPStatus(ctx, clusterID), h.Cfg.Timeouts.Cluster.Reconciled, h.Cfg.Polling.Interval). + Should(Equal(http.StatusNotFound)) + + ginkgo.By("Verify downstream K8s namespace is cleaned up") + Eventually(h.PollNamespacesByPrefix(ctx, clusterID), h.Cfg.Timeouts.Adapter.Processing, h.Cfg.Polling.Interval). + Should(BeEmpty()) + + ginkgo.GinkgoWriter.Printf("Verified: stuck-adapter recovered, cluster hard-deleted\n") + }) + }, +) diff --git a/pkg/helper/adapter.go b/pkg/helper/adapter.go index a783199..de0121f 100644 --- a/pkg/helper/adapter.go +++ b/pkg/helper/adapter.go @@ -147,6 +147,11 @@ func (h *Helper) DeployAdapter(ctx context.Context, opts AdapterDeploymentOption "--set", fmt.Sprintf("fullnameOverride=%s", releaseName), ) + // Override image pull policy if set (e.g. IfNotPresent for local kind clusters) + if policy := os.Getenv("IMAGE_PULL_POLICY"); policy != "" { + helmArgs = append(helmArgs, "--set", fmt.Sprintf("image.pullPolicy=%s", policy)) + } + // Add additional --set values if provided for key, value := range opts.SetValues { helmArgs = append(helmArgs, "--set", fmt.Sprintf("%s=%s", key, value)) diff --git a/test-design/README.md b/test-design/README.md index 368f9d8..a2cc101 100644 --- a/test-design/README.md +++ b/test-design/README.md @@ -98,8 +98,7 @@ A Tier2 failure means **edge cases or rare scenarios don't work as expected**. T |-----------|-----------| | [Cluster reaches correct status after adapter crash and recovery](testcases/cluster.md#test-title-cluster-can-reach-correct-status-after-adapter-crash-and-recovery) | Infrastructure recovery -- involves killing adapter pods and verifying self-healing. Crashes are rare; operators can restart pods manually | | [Maestro server unavailability graceful handling](testcases/adapter-with-maestro-transport.md#test-title-adapter-can-handle-maestro-server-unavailability-gracefully) | Infrastructure recovery -- simulates Maestro outage. Server failures are rare and recoverable | -| [DELETE during initial creation before Reconciled](testcases/delete-cluster.md#test-title-delete-during-initial-creation-before-cluster-reaches-reconciled) | Race condition -- user deletes a cluster before it finishes creating. Unusual timing scenario | -| [Cascade DELETE while child nodepool is mid-update](testcases/delete-cluster.md#test-title-cascade-delete-on-cluster-while-child-nodepool-is-mid-update-reconciliation) | Race condition -- cluster deletion while nodepool is being updated. Requires specific timing overlap | +| [Stuck deletion -- adapter unable to finalize prevents hard-delete](testcases/delete-cluster.md#test-title-stuck-deletion----adapter-unable-to-finalize-prevents-hard-delete) | Infrastructure recovery -- simulates adapter unavailability during deletion. Verifies system won't hard-delete prematurely and recovers when adapter returns | ### Tier Decision Flowchart diff --git a/test-design/testcases/delete-cluster.md b/test-design/testcases/delete-cluster.md index 6a2faaf..2bbaa2d 100644 --- a/test-design/testcases/delete-cluster.md +++ b/test-design/testcases/delete-cluster.md @@ -10,14 +10,11 @@ 6. [Create nodepool under soft-deleted cluster returns 409 Conflict](#test-title-create-nodepool-under-soft-deleted-cluster-returns-409-conflict) 7. [DELETE non-existent cluster returns 404](#test-title-delete-non-existent-cluster-returns-404) 8. 
[Stuck deletion -- adapter unable to finalize prevents hard-delete](#test-title-stuck-deletion----adapter-unable-to-finalize-prevents-hard-delete) -9. [DELETE during initial creation before cluster reaches Reconciled](#test-title-delete-during-initial-creation-before-cluster-reaches-reconciled) -10. [Simultaneous DELETE requests produce a single soft-delete record](#test-title-simultaneous-delete-requests-produce-a-single-soft-delete-record) +9. [Simultaneous DELETE requests produce a single soft-delete record](#test-title-simultaneous-delete-requests-produce-a-single-soft-delete-record) 11. [Adapter treats externally-deleted K8s resources as finalized](#test-title-adapter-treats-externally-deleted-k8s-resources-as-finalized) 12. [DELETE during update reconciliation before adapters converge](#test-title-delete-during-update-reconciliation-before-adapters-converge) 13. [Recreate cluster with same name after hard-delete](#test-title-recreate-cluster-with-same-name-after-hard-delete) 14. [LIST returns soft-deleted clusters alongside active clusters](#test-title-list-returns-soft-deleted-clusters-alongside-active-clusters) -15. [Cascade DELETE on cluster while a child nodepool is already deleting](#test-title-cascade-delete-on-cluster-while-a-child-nodepool-is-already-deleting) -16. [Cascade DELETE on cluster while child nodepool is mid-update-reconciliation](#test-title-cascade-delete-on-cluster-while-child-nodepool-is-mid-update-reconciliation) --- @@ -818,106 +815,6 @@ kubectl delete namespace {cluster_id} --ignore-not-found --- -## Test Title: DELETE during initial creation before cluster reaches Reconciled - -### Description - -This test validates deletion behavior when a cluster is still mid-reconciliation (adapters have not yet reported `Applied=True`). The cluster is created and immediately deleted without waiting for Reconciled state. Adapters should detect `deleted_time` regardless of their pre-deletion state and finalize cleanup. The system must not get stuck due to adapters having stale or incomplete status from the initial creation. - ---- - -| **Field** | **Value** | -|-----------|-----------| -| **Pos/Neg** | Positive | -| **Priority** | Tier2 | -| **Status** | Draft | -| **Automation** | Not Automated | -| **Version** | Post-MVP | -| **Created** | 2026-04-16 | -| **Updated** | 2026-04-16 | - ---- - -### Preconditions - -1. Environment is prepared using [hyperfleet-infra](https://github.com/openshift-hyperfleet/hyperfleet-infra) with all required platform resources -2. HyperFleet API and HyperFleet Sentinel services are deployed and running successfully -3. 
The adapters defined in testdata/adapter-configs are all deployed successfully - ---- - -### Test Steps - -#### Step 1: Create a cluster and immediately send DELETE without waiting for Reconciled - -**Action:** -- Create a cluster: -```bash -curl -X POST ${API_URL}/api/hyperfleet/v1/clusters \ - -H "Content-Type: application/json" \ - -d @testdata/payloads/clusters/cluster-request.json -``` -- Immediately send DELETE (do NOT wait for Reconciled or any adapter status): -```bash -curl -X DELETE ${API_URL}/api/hyperfleet/v1/clusters/{cluster_id} -``` - -**Expected Result:** -- POST returns HTTP 201 with cluster created at `generation: 1` -- DELETE returns HTTP 202 (Accepted) with `deleted_time` set, `generation: 2` - -#### Step 1a: Capture adapter statuses at the moment of DELETE (optional validation) - -**Action:** -- Immediately after the DELETE response, capture adapter statuses to verify the edge case was exercised: -```bash -curl -X GET ${API_URL}/api/hyperfleet/v1/clusters/{cluster_id}/statuses -``` - -**Expected Result:** -- At least one adapter should have no status entry yet or report `Applied=False` (still mid-reconciliation from initial creation) -- If all adapters already report `Applied=True`, log a warning: the edge case was not exercised and this run is equivalent to a happy-path deletion test. The test still passes but the stale-state scenario was not validated. - -#### Step 2: Verify adapters finalize despite incomplete initial reconciliation - -**Action:** -- Poll adapter statuses until all adapters report `Finalized=True`: -```bash -curl -X GET ${API_URL}/api/hyperfleet/v1/clusters/{cluster_id}/statuses -``` - -**Expected Result:** -- All required adapters eventually report `Finalized` condition `status: "True"` -- Adapters that had not yet reported `Applied=True` (stale `Applied=False` or no status at all) still detect the soft-delete and finalize -- `observed_generation: 2` on all adapter statuses - -**Note:** Some adapters may have partially applied K8s resources from the initial creation before detecting `deleted_time`. The adapter's `lifecycle.delete.when` check runs before apply on subsequent reconciliation, so these partial resources should be cleaned up during finalization. - -#### Step 3: Verify cluster is hard-deleted - -**Action:** -- Poll until cluster record is removed (hard-delete executes automatically when `Reconciled=True`): -```bash -curl -X GET ${API_URL}/api/hyperfleet/v1/clusters/{cluster_id} -``` - -**Expected Result:** -- Cluster `Reconciled` condition transitions to `status: "True"` (all adapters confirmed finalization) -- Hard-delete executes: GET returns HTTP 404 (Not Found) - -#### Step 4: Cleanup resources - -**Action:** -- If the cluster was not hard-deleted (test failed), fall back to namespace deletion: -```bash -kubectl delete namespace {cluster_id} --ignore-not-found -``` - -**Expected Result:** -- All test resources are cleaned up - ---- - ## Test Title: Simultaneous DELETE requests produce a single soft-delete record ### Description @@ -1143,7 +1040,7 @@ kubectl delete namespace {cluster_id} --ignore-not-found ### Description -This test validates the interaction between update and delete workflows. When a cluster is updated via PATCH and immediately deleted before adapters finish reconciling the update, the deletion workflow must take priority. Adapters receive the next event, detect `deleted_time`, and switch to cleanup mode instead of continuing update reconciliation. 
This is distinct from [DELETE during initial creation](#test-title-delete-during-initial-creation-before-cluster-reaches-reconciled) (matrix #18) because adapters already have `Applied=True` from the previous generation and are mid-reconciliation for the new generation — a different code path in the adapter's lifecycle handler. +This test validates the interaction between update and delete workflows. When a cluster is updated via PATCH and immediately deleted before adapters finish reconciling the update, the deletion workflow must take priority. Adapters receive the next event, detect `deleted_time`, and switch to cleanup mode instead of continuing update reconciliation. Adapters already have `Applied=True` from the previous generation and are mid-reconciliation for the new generation. --- @@ -1490,232 +1387,3 @@ curl -X DELETE ${API_URL}/api/hyperfleet/v1/clusters/{active_cluster_id} - Both clusters are eventually hard-deleted (GET returns HTTP 404) --- - -## Test Title: Cascade DELETE on cluster while a child nodepool is already deleting - -### Description - -This test validates the interaction between individual nodepool deletion and parent cluster cascade deletion. When a nodepool is already soft-deleted (has `deleted_time` set) and the parent cluster is subsequently deleted, the cascade must not overwrite the nodepool's existing `deleted_time`. The nodepool's original deletion timestamp and lifecycle must be preserved. Both the nodepool and cluster must eventually complete their deletion lifecycles. - ---- - -| **Field** | **Value** | -|-----------|-----------| -| **Pos/Neg** | Positive | -| **Priority** | Tier2 | -| **Status** | Draft | -| **Automation** | Not Automated | -| **Version** | Post-MVP | -| **Created** | 2026-04-17 | -| **Updated** | 2026-04-17 | - ---- - -### Preconditions - -1. Environment is prepared using [hyperfleet-infra](https://github.com/openshift-hyperfleet/hyperfleet-infra) with all required platform resources -2. HyperFleet API and HyperFleet Sentinel services are deployed and running successfully -3. 
The adapters defined in testdata/adapter-configs are all deployed successfully - ---- - -### Test Steps - -#### Step 1: Create a cluster with a nodepool and wait for Reconciled state - -**Action:** -- Create a cluster and one nodepool: -```bash -curl -X POST ${API_URL}/api/hyperfleet/v1/clusters \ - -H "Content-Type: application/json" \ - -d @testdata/payloads/clusters/cluster-request.json -``` -```bash -curl -X POST ${API_URL}/api/hyperfleet/v1/clusters/{cluster_id}/nodepools \ - -H "Content-Type: application/json" \ - -d @testdata/payloads/nodepools/nodepool-request.json -``` -- Wait for both to reach Reconciled state - -**Expected Result:** -- Cluster and nodepool reach `Reconciled: True` - -#### Step 2: Soft-delete the nodepool first (do NOT wait for hard-delete) - -**Action:** -```bash -curl -X DELETE ${API_URL}/api/hyperfleet/v1/clusters/{cluster_id}/nodepools/{nodepool_id} -``` -- Record the nodepool's `deleted_time` as `{nodepool_original_deleted_time}` - -**Expected Result:** -- Response returns HTTP 202 (Accepted) with `deleted_time` set on the nodepool - -#### Step 3: Immediately delete the parent cluster - -**Action:** -- Without waiting for the nodepool deletion to complete: -```bash -curl -X DELETE ${API_URL}/api/hyperfleet/v1/clusters/{cluster_id} -``` - -**Expected Result:** -- Response returns HTTP 202 (Accepted) with `deleted_time` set on the cluster - -#### Step 4: Verify nodepool's deleted_time is preserved (not overwritten by cascade) - -**Action:** -```bash -curl -X GET ${API_URL}/api/hyperfleet/v1/clusters/{cluster_id}/nodepools/{nodepool_id} -``` - -**Expected Result:** -- Nodepool `deleted_time` equals `{nodepool_original_deleted_time}` (preserved from Step 2, not overwritten by the cluster cascade) -- The cascade's `WHERE deleted_time IS NULL` guard should have skipped the already-deleted nodepool - -#### Step 5: Verify both resources complete deletion lifecycle - -**Action:** -- Poll until both are hard-deleted (hard-delete executes automatically when `Reconciled=True`): -```bash -curl -X GET ${API_URL}/api/hyperfleet/v1/clusters/{cluster_id}/nodepools/{nodepool_id} -curl -X GET ${API_URL}/api/hyperfleet/v1/clusters/{cluster_id} -``` - -**Expected Result:** -- Nodepool adapters report `Finalized: True` -- Cluster adapters report `Finalized: True` -- Both return HTTP 404 after hard-delete completes - -#### Step 6: Cleanup resources - -**Action:** -- If the test failed before hard-delete, fall back to namespace deletion: -```bash -kubectl delete namespace {cluster_id} --ignore-not-found -``` - -**Expected Result:** -- All test resources are cleaned up - ---- - -## Test Title: Cascade DELETE on cluster while child nodepool is mid-update-reconciliation - -### Description - -This test validates the interaction between nodepool update reconciliation and parent cluster cascade deletion. When a nodepool has been updated via PATCH (generation incremented, adapters not yet reconciled) and the parent cluster is subsequently deleted, the cascade must set `deleted_time` on the nodepool. Nodepool adapters must detect the soft-delete and switch to deletion mode, abandoning the in-flight update reconciliation. 
This is distinct from [Cascade DELETE on cluster while a child nodepool is already deleting](#test-title-cascade-delete-on-cluster-while-a-child-nodepool-is-already-deleting) (matrix #26) because here the nodepool has a pending spec update — the adapter must prioritize deletion over update reconciliation at a generation that has been bumped by both the PATCH and the cascade DELETE. - ---- - -| **Field** | **Value** | -|-----------|-----------| -| **Pos/Neg** | Positive | -| **Priority** | Tier2 | -| **Status** | Draft | -| **Automation** | Not Automated | -| **Version** | Post-MVP | -| **Created** | 2026-04-17 | -| **Updated** | 2026-04-17 | - ---- - -### Preconditions - -1. Environment is prepared using [hyperfleet-infra](https://github.com/openshift-hyperfleet/hyperfleet-infra) with all required platform resources -2. HyperFleet API and HyperFleet Sentinel services are deployed and running successfully -3. The adapters defined in testdata/adapter-configs are all deployed successfully - ---- - -### Test Steps - -#### Step 1: Create a cluster with a nodepool and wait for Reconciled state - -**Action:** -- Create a cluster and one nodepool: -```bash -curl -X POST ${API_URL}/api/hyperfleet/v1/clusters \ - -H "Content-Type: application/json" \ - -d @testdata/payloads/clusters/cluster-request.json -``` -```bash -curl -X POST ${API_URL}/api/hyperfleet/v1/clusters/{cluster_id}/nodepools \ - -H "Content-Type: application/json" \ - -d @testdata/payloads/nodepools/nodepool-request.json -``` -- Wait for both to reach Reconciled state at `generation: 1` - -**Expected Result:** -- Cluster and nodepool reach `Reconciled: True` at `generation: 1` -- All adapters report `observed_generation: 1` - -#### Step 2: PATCH the nodepool to trigger update reconciliation (do NOT wait for reconciliation) - -**Action:** -- Send a PATCH to the nodepool to bump its generation: -```bash -curl -X PATCH ${API_URL}/api/hyperfleet/v1/clusters/{cluster_id}/nodepools/{nodepool_id} \ - -H "Content-Type: application/json" \ - -d '{"spec": {"trigger-update": "true"}}' -``` - -**Expected Result:** -- Response returns HTTP 200 with nodepool `generation: 2` -- Nodepool adapters have not yet reconciled to generation 2 (update in flight) - -#### Step 3: Immediately DELETE the parent cluster before nodepool reconciliation completes - -**Action:** -- Without waiting for nodepool adapters to reconcile: -```bash -curl -X DELETE ${API_URL}/api/hyperfleet/v1/clusters/{cluster_id} -``` - -**Expected Result:** -- Response returns HTTP 202 (Accepted) with `deleted_time` set on the cluster -- Cluster `generation` incremented - -#### Step 4: Verify cascade sets deleted_time on the nodepool - -**Action:** -```bash -curl -X GET ${API_URL}/api/hyperfleet/v1/clusters/{cluster_id}/nodepools/{nodepool_id} -``` - -**Expected Result:** -- Nodepool has `deleted_time` set (cascaded from parent cluster) -- Nodepool `generation` is incremented beyond 2 (bumped by both the PATCH and the cascade DELETE) - -#### Step 5: Verify all adapters finalize and both resources are hard-deleted - -**Action:** -- Poll adapter statuses for both nodepool and cluster: -```bash -curl -X GET ${API_URL}/api/hyperfleet/v1/clusters/{cluster_id}/nodepools/{nodepool_id}/statuses -curl -X GET ${API_URL}/api/hyperfleet/v1/clusters/{cluster_id}/statuses -``` -- Poll until hard-delete completes (hard-delete executes automatically when `Reconciled=True`): -```bash -curl -X GET ${API_URL}/api/hyperfleet/v1/clusters/{cluster_id}/nodepools/{nodepool_id} -curl -X GET 
${API_URL}/api/hyperfleet/v1/clusters/{cluster_id} -``` - -**Expected Result:** -- Nodepool adapters report `Finalized: True` at the final generation (not the update generation) -- Nodepool adapters did not complete update reconciliation for generation 2 — they detected `deleted_time` and switched to cleanup mode -- Cluster adapters report `Finalized: True` -- Both nodepool and cluster return HTTP 404 after hard-delete - -#### Step 6: Cleanup resources - -**Action:** -- If the test failed before hard-delete, fall back to namespace deletion: -```bash -kubectl delete namespace {cluster_id} --ignore-not-found -``` - -**Expected Result:** -- All test resources are cleaned up - ---- diff --git a/test-design/testcases/update-delete-test-matrix.md b/test-design/testcases/update-delete-test-matrix.md index f48149e..5074f5b 100644 --- a/test-design/testcases/update-delete-test-matrix.md +++ b/test-design/testcases/update-delete-test-matrix.md @@ -34,7 +34,6 @@ Consolidated test matrix covering positive, negative, and edge case scenarios fo | 15 | Adapter statuses transition during update reconciliation | Cluster | Positive | Tier1 | [update-cluster.md](update-cluster.md#test-title-adapter-statuses-transition-during-update-reconciliation) | UPDATE happy path | | 16 | Multiple rapid updates coalesce to latest generation | Cluster | Positive | Tier1 | [update-cluster.md](update-cluster.md#test-title-multiple-rapid-updates-coalesce-to-latest-generation) | UPDATE edge cases | | 17 | Stuck deletion -- adapter unable to finalize prevents hard-delete | Cluster | Negative | Tier2 | [delete-cluster.md](delete-cluster.md#test-title-stuck-deletion----adapter-unable-to-finalize-prevents-hard-delete) | DELETE error cases | -| 18 | DELETE during initial creation before cluster reaches Reconciled | Cluster | Positive | Tier2 | [delete-cluster.md](delete-cluster.md#test-title-delete-during-initial-creation-before-cluster-reaches-reconciled) | DELETE edge cases | | 19 | Simultaneous DELETE requests produce a single soft-delete record | Cluster | Positive | Tier1 | [delete-cluster.md](delete-cluster.md#test-title-simultaneous-delete-requests-produce-a-single-soft-delete-record) | DELETE edge cases | | 20 | Adapter treats externally-deleted K8s resources as finalized | Cluster | Positive | Tier1 | [delete-cluster.md](delete-cluster.md#test-title-adapter-treats-externally-deleted-k8s-resources-as-finalized) | DELETE edge cases | | 21 | DELETE during update reconciliation before adapters converge | Cluster | Positive | Tier1 | [delete-cluster.md](delete-cluster.md#test-title-delete-during-update-reconciliation-before-adapters-converge) | DELETE edge cases | @@ -42,8 +41,6 @@ Consolidated test matrix covering positive, negative, and edge case scenarios fo | 23 | Labels-only PATCH bumps generation and triggers reconciliation (cluster) | Cluster | Positive | Tier1 | [update-cluster.md](update-cluster.md#test-title-labels-only-patch-bumps-generation-and-triggers-reconciliation) | UPDATE edge cases | | 24 | Labels-only PATCH bumps generation and triggers reconciliation (nodepool) | Nodepool | Positive | Tier1 | [update-nodepool.md](update-nodepool.md#test-title-labels-only-patch-bumps-generation-and-triggers-reconciliation) | UPDATE edge cases | | 25 | LIST returns soft-deleted clusters alongside active clusters | Cluster | Positive | Tier1 | [delete-cluster.md](delete-cluster.md#test-title-list-returns-soft-deleted-clusters-alongside-active-clusters) | DELETE API behavior | -| 26 | Cascade DELETE on cluster while a child 
nodepool is already deleting | Cluster + Nodepool | Positive | Tier2 | [delete-cluster.md](delete-cluster.md#test-title-cascade-delete-on-cluster-while-a-child-nodepool-is-already-deleting) | DELETE hierarchical | -| 27 | Cascade DELETE on cluster while child nodepool is mid-update-reconciliation | Cluster + Nodepool | Positive | Tier2 | [delete-cluster.md](delete-cluster.md#test-title-cascade-delete-on-cluster-while-child-nodepool-is-mid-update-reconciliation) | DELETE hierarchical | | 28 | Soft-deleted nodepool remains visible via GET and LIST | Nodepool | Positive | Tier1 | [delete-nodepool.md](delete-nodepool.md#test-title-soft-deleted-nodepool-remains-visible-via-get-and-list) | DELETE API behavior | | 29 | No-op PATCH does not increment generation | Cluster | Positive | Tier1 | [update-cluster.md](update-cluster.md#test-title-no-op-patch-does-not-increment-generation) | UPDATE edge cases | @@ -51,17 +48,17 @@ Consolidated test matrix covering positive, negative, and edge case scenarios fo | Category | Tier0 | Tier1 | Tier2 | Total | |----------|-------|-------|-------|-------| -| Positive | 5 | 15 | 3 | 23 | +| Positive | 5 | 15 | 0 | 20 | | Negative | 0 | 5 | 1 | 6 | -| **Total** | **5** | **20** | **4** | **29** | +| **Total** | **5** | **20** | **1** | **26** | ## Coverage by Ticket Area | Ticket Area | Test Cases | Status | |-------------|-----------|--------| | DELETE happy path (soft-delete -> Finalized -> Reconciled -> hard-delete) | #1, #3 | Covered | -| DELETE hierarchical (subresource cleanup before parent hard-delete) | #2, #12, #26, #27 | Covered | -| DELETE edge cases (idempotent re-DELETE, concurrent DELETEs, non-existent resource, stale pre-deletion state, NotFound-as-success, DELETE during update, name reuse after hard-delete) | #9, #11, #13, #14, #18, #19, #20, #21, #22 | Covered | +| DELETE hierarchical (subresource cleanup before parent hard-delete) | #2, #12 | Covered | +| DELETE edge cases (idempotent re-DELETE, concurrent DELETEs, non-existent resource, NotFound-as-success, DELETE during update, name reuse after hard-delete) | #9, #11, #13, #14, #19, #20, #21, #22 | Covered | | DELETE error cases (stuck adapter, unable to finalize) | #17 | Covered | | DELETE API behavior (409 on mutations, GET/LIST still allowed) | #4, #5, #8, #10, #25, #28 | Covered | | UPDATE happy path (PATCH -> generation -> reconciliation -> Reconciled) | #6, #7, #15 | Covered | @@ -77,3 +74,6 @@ Items considered for this matrix but deliberately not covered as standalone test | RBAC denied on DELETE | N/A | No RBAC implementation exists in the API. Authentication is bearer-token only with no role/permission model. Revisit when RBAC is added. | | Concurrent PATCH + DELETE race condition | Deferred | Non-deterministic test — both outcomes (PATCH-first or DELETE-first) are acceptable. Hard to assert on reliably in E2E. Revisit if the team adds a deterministic ordering guarantee. | | PATCH payload validation errors (malformed JSON, schema/type violations) | Out of E2E scope | API-boundary validation happens before lifecycle business logic; cover in API integration tests rather than cross-component E2E. | +| Cascade DELETE while child nodepool is already deleting (deleted_time preservation) | Out of E2E scope | `deleted_time` preservation is a service-layer invariant (`if np.DeletedTime == nil` guard in `CascadeSoftDelete`). Already covered by unit test (`cluster_test.go:1472`), nodepool idempotency test (`node_pool_test.go:941`), and integration test (`clusters_test.go:1077`). 
E2E assertion is structurally unable to verify itself due to 404 race — adapter may hard-delete the nodepool before the first poll. | +| Cascade DELETE while child nodepool is mid-update-reconciliation (generation handoff) | Out of E2E scope | Adapter generation handling is trivial — precondition phase always re-fetches the resource from the API (`GET /clusters/{id}`) and overwrites the event's `generationId` with the latest value. `CompareGenerations()` is a simple three-way branch (not exists → create, equal → skip, different → update) with no special handling for generation gaps. The adapter mechanically processes at the correct generation regardless of concurrent PATCH + cascade DELETE. Same 404 race weakness as the already-deleting cascade test — E2E assertion structurally unable to verify itself. | +| DELETE during initial creation before cluster reaches Reconciled | Out of E2E scope | Test acknowledges edge case is best-effort (warning when all adapters already Applied=True). Adapter behavior is mechanical — re-fetch always gets latest state including `deleted_time`, no branching on prior Applied state. Same adapter code path covered by Tier1 test #21 (DELETE during update reconciliation) on a more deterministic scenario. | diff --git a/testdata/adapter-configs/cl-stuck/adapter-config.yaml b/testdata/adapter-configs/cl-stuck/adapter-config.yaml new file mode 100644 index 0000000..8977fa7 --- /dev/null +++ b/testdata/adapter-configs/cl-stuck/adapter-config.yaml @@ -0,0 +1,21 @@ +adapter: + name: cl-stuck + +debug_config: false +log: + level: debug + +clients: + hyperfleet_api: + base_url: CHANGE_ME + version: v1 + timeout: 2s + retry_attempts: 3 + retry_backoff: exponential + + broker: + subscription_id: CHANGE_ME + topic: CHANGE_ME + + kubernetes: + api_version: "v1" diff --git a/testdata/adapter-configs/cl-stuck/adapter-task-config.yaml b/testdata/adapter-configs/cl-stuck/adapter-task-config.yaml new file mode 100644 index 0000000..3e844f7 --- /dev/null +++ b/testdata/adapter-configs/cl-stuck/adapter-task-config.yaml @@ -0,0 +1,112 @@ +# Minimal adapter task config for stuck deletion testing +# Creates a namespace as the only resource - simple and fast to verify + +params: + - name: "clusterId" + source: "event.id" + type: "string" + required: true + +preconditions: + - name: "clusterStatus" + api_call: + method: "GET" + url: "/clusters/{{ .clusterId }}" + timeout: 10s + retry_attempts: 3 + retry_backoff: "exponential" + capture: + - name: "clusterName" + field: "name" + - name: "generationSpec" + field: "generation" + - name: "clusterNotReconciled" + expression: | + status.conditions.filter(c, c.type == "Reconciled").size() > 0 + ? status.conditions.filter(c, c.type == "Reconciled")[0].status != "True" + : true + - name: "clusterReconciledTTL" + expression: | + (timestamp(now()) - timestamp( + status.conditions.filter(c, c.type == "Reconciled").size() > 0 + ? 
status.conditions.filter(c, c.type == "Reconciled")[0].last_transition_time + : now() + )).getSeconds() > 300 + + - name: "validationCheck" + expression: | + clusterNotReconciled || clusterReconciledTTL + +resources: + - name: "clusterNamespace" + transport: + client: "kubernetes" + manifest: + apiVersion: v1 + kind: Namespace + metadata: + name: "{{ .clusterId }}-cl-stuck" + labels: + hyperfleet.io/cluster-id: "{{ .clusterId }}" + hyperfleet.io/cluster-name: "{{ .clusterName }}" + annotations: + hyperfleet.io/generation: "{{ .generationSpec }}" + discovery: + namespace: "*" + by_selectors: + label_selector: + hyperfleet.io/cluster-id: "{{ .clusterId }}" + +post: + payloads: + - name: "clusterStatusPayload" + build: + adapter: "{{ .adapter.name }}" + conditions: + - type: "Applied" + status: + expression: | + resources.?clusterNamespace.?status.?phase.orValue("") == "Active" ? "True" : "False" + reason: + expression: | + resources.?clusterNamespace.?status.?phase.orValue("") == "Active" + ? "NamespaceCreated" + : "NamespacePending" + message: + expression: | + resources.?clusterNamespace.?status.?phase.orValue("") == "Active" + ? "Namespace created successfully" + : "Namespace creation in progress" + - type: "Available" + status: + expression: | + resources.?clusterNamespace.?status.?phase.orValue("") == "Active" ? "True" : "False" + reason: + expression: | + resources.?clusterNamespace.?status.?phase.orValue("") == "Active" ? "NamespaceReady" : "NamespaceNotReady" + message: + expression: | + resources.?clusterNamespace.?status.?phase.orValue("") == "Active" ? "Namespace is active and ready" : "Namespace is not active and ready" + - type: "Health" + status: + expression: | + adapter.?executionStatus.orValue("") == "success" ? "True" : "False" + reason: + expression: | + adapter.?errorReason.orValue("") != "" ? adapter.?errorReason.orValue("") : "Healthy" + message: + expression: | + adapter.?errorMessage.orValue("") != "" ? 
adapter.?errorMessage.orValue("") : "All adapter operations in progress or completed successfully" + observed_generation: + expression: "generationSpec" + observed_time: "{{ now | date \"2006-01-02T15:04:05Z07:00\" }}" + + post_actions: + - name: "reportClusterStatus" + api_call: + method: "PUT" + url: "/clusters/{{ .clusterId }}/statuses" + headers: + - name: "Content-Type" + value: "application/json" + body: "{{ .clusterStatusPayload }}" diff --git a/testdata/adapter-configs/cl-stuck/values.yaml b/testdata/adapter-configs/cl-stuck/values.yaml new file mode 100644 index 0000000..3128696 --- /dev/null +++ b/testdata/adapter-configs/cl-stuck/values.yaml @@ -0,0 +1,31 @@ +adapterConfig: + create: true + files: + adapter-config.yaml: cl-stuck/adapter-config.yaml + log: + level: debug + +adapterTaskConfig: + create: true + files: + task-config.yaml: cl-stuck/adapter-task-config.yaml + +broker: + create: true + googlepubsub: + projectId: ${GCP_PROJECT_ID} + subscriptionId: ${NAMESPACE}-clusters-${ADAPTER_NAME} + topic: ${NAMESPACE}-clusters + deadLetterTopic: ${NAMESPACE}-clusters-dlq + createTopicIfMissing: ${ADAPTER_GOOGLEPUBSUB_CREATE_TOPIC_IF_MISSING} + createSubscriptionIfMissing: ${ADAPTER_GOOGLEPUBSUB_CREATE_SUBSCRIPTION_IF_MISSING} + +image: + registry: ${IMAGE_REGISTRY} + repository: ${ADAPTER_IMAGE_REPO} + pullPolicy: Always + tag: ${ADAPTER_IMAGE_TAG} + +rbac: + resources: + - namespaces From 36dacbf061ac7e01c3abcb5c808453bbb2554ec9 Mon Sep 17 00:00:00 2001 From: Dmitrii Andreev Date: Fri, 15 May 2026 13:05:55 -0500 Subject: [PATCH 2/2] HYPERFLEET-862 - fix: generate schema like in other repos --- CLAUDE.md | 2 +- Makefile | 3 ++- hack/extract-schema/main.go | 18 ------------------ hack/tools.go | 5 +++++ 4 files changed, 8 insertions(+), 20 deletions(-) delete mode 100644 hack/extract-schema/main.go create mode 100644 hack/tools.go diff --git a/CLAUDE.md b/CLAUDE.md index 98614b9..64045e2 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -42,7 +42,7 @@ Required after OpenAPI schema updates: make generate ``` -Extracts schema from `hyperfleet-api-spec` Go module via `hack/extract-schema/` (uses `embed.FS`) and regenerates `pkg/api/openapi/`. +Extracts schema from `hyperfleet-api-spec` Go module (pinned via `hack/tools.go`) and regenerates `pkg/api/openapi/`. 
### Run E2E Tests diff --git a/Makefile b/Makefile index 4bc4ca7..912ce9c 100644 --- a/Makefile +++ b/Makefile @@ -45,10 +45,11 @@ help: ## Display this help .PHONY: generate generate: $(OAPI_CODEGEN) ## Generate API client code from OpenAPI schema + $(GO) mod download rm -rf pkg/api/openapi mkdir -p pkg/api/openapi openapi @rm -f openapi/openapi.yaml - @$(GO) run -mod=mod github.com/openshift-hyperfleet/hyperfleet-e2e/hack/extract-schema + @cp "$$($(GO) list -m -f '{{.Dir}}' github.com/openshift-hyperfleet/hyperfleet-api-spec)/schemas/core/openapi.yaml" openapi/openapi.yaml $(OAPI_CODEGEN) --config openapi/oapi-codegen.yaml openapi/openapi.yaml @echo "✓ API client code generated in pkg/api/openapi/" diff --git a/hack/extract-schema/main.go b/hack/extract-schema/main.go deleted file mode 100644 index 8c99276..0000000 --- a/hack/extract-schema/main.go +++ /dev/null @@ -1,18 +0,0 @@ -package main - -import ( - "log" - "os" - - specschemas "github.com/openshift-hyperfleet/hyperfleet-api-spec/schemas" -) - -func main() { - data, err := specschemas.FS.ReadFile("core/openapi.yaml") - if err != nil { - log.Fatalf("failed to read embedded schema: %v", err) - } - if err := os.WriteFile("openapi/openapi.yaml", data, 0600); err != nil { - log.Fatalf("failed to write schema: %v", err) - } -} diff --git a/hack/tools.go b/hack/tools.go new file mode 100644 index 0000000..0101075 --- /dev/null +++ b/hack/tools.go @@ -0,0 +1,5 @@ +//go:build tools + +package hack + +import _ "github.com/openshift-hyperfleet/hyperfleet-api-spec/schemas"
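
The cleanup ordering that stuck_deletion.go depends on (the "LIFO → executes FIRST" comment) comes from ginkgo.DeferCleanup running callbacks in reverse registration order. Below is a minimal, self-contained sketch of that behavior; the file, suite, and cleanup names are illustrative only and are not part of this change:

```go
// lifo_cleanup_test.go -- illustrative sketch, not part of this patch.
package cluster_test

import (
	"fmt"
	"testing"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
)

func TestDeferCleanupOrdering(t *testing.T) {
	gomega.RegisterFailHandler(ginkgo.Fail)
	ginkgo.RunSpecs(t, "DeferCleanup ordering sketch")
}

var _ = ginkgo.Describe("DeferCleanup ordering", func() {
	ginkgo.It("runs cleanups last-in, first-out", func() {
		// Registered first, so it runs LAST during teardown -- mirrors the
		// stuck-adapter uninstall registered right after DeployAdapter.
		ginkgo.DeferCleanup(func() {
			fmt.Fprintln(ginkgo.GinkgoWriter, "runs second: uninstall stuck-adapter")
		})
		// Registered last, so it runs FIRST -- mirrors the API required-adapters
		// restore, which the test registers after the uninstall so it executes first.
		ginkgo.DeferCleanup(func() {
			fmt.Fprintln(ginkgo.GinkgoWriter, "runs first: restore API required adapters")
		})
	})
})
```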