2 changes: 1 addition & 1 deletion CLAUDE.md
@@ -42,7 +42,7 @@ Required after OpenAPI schema updates:
make generate
```

Extracts schema from `hyperfleet-api-spec` Go module via `hack/extract-schema/` (uses `embed.FS`) and regenerates `pkg/api/openapi/`.
Extracts schema from `hyperfleet-api-spec` Go module (pinned via `hack/tools.go`) and regenerates `pkg/api/openapi/`.

### Run E2E Tests

3 changes: 2 additions & 1 deletion Makefile
@@ -45,10 +45,11 @@ help: ## Display this help

.PHONY: generate
generate: $(OAPI_CODEGEN) ## Generate API client code from OpenAPI schema
$(GO) mod download
rm -rf pkg/api/openapi
mkdir -p pkg/api/openapi openapi
@rm -f openapi/openapi.yaml
@$(GO) run -mod=mod github.com/openshift-hyperfleet/hyperfleet-e2e/hack/extract-schema
@cp "$$($(GO) list -m -f '{{.Dir}}' github.com/openshift-hyperfleet/hyperfleet-api-spec)/schemas/core/openapi.yaml" openapi/openapi.yaml
$(OAPI_CODEGEN) --config openapi/oapi-codegen.yaml openapi/openapi.yaml
@echo "✓ API client code generated in pkg/api/openapi/"

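The `cp` step works because `go list -m -f '{{.Dir}}'` prints the on-disk directory of the pinned `hyperfleet-api-spec` module, which the preceding `$(GO) mod download` ensures is present in the module cache. A minimal Go sketch of the same resolve-and-copy flow, assuming only the module path and schema location visible in the recipe (everything else is illustrative):

```go
// Sketch: resolve the pinned spec module's directory and copy the schema,
// mirroring the Makefile recipe. Error handling is abbreviated.
package main

import (
	"log"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
)

func main() {
	out, err := exec.Command("go", "list", "-m", "-f", "{{.Dir}}",
		"github.com/openshift-hyperfleet/hyperfleet-api-spec").Output()
	if err != nil {
		log.Fatalf("resolve module dir: %v", err)
	}
	src := filepath.Join(strings.TrimSpace(string(out)),
		"schemas", "core", "openapi.yaml")
	data, err := os.ReadFile(src)
	if err != nil {
		log.Fatalf("read schema: %v", err)
	}
	if err := os.WriteFile(filepath.Join("openapi", "openapi.yaml"),
		data, 0o644); err != nil {
		log.Fatalf("copy schema: %v", err)
	}
}
```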
228 changes: 228 additions & 0 deletions e2e/cluster/stuck_deletion.go
@@ -0,0 +1,228 @@
package cluster

import (
"context"
"net/http"
"os"

"github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega" //nolint:staticcheck // dot import for test readability

"github.com/openshift-hyperfleet/hyperfleet-e2e/pkg/api/openapi"
"github.com/openshift-hyperfleet/hyperfleet-e2e/pkg/client"
"github.com/openshift-hyperfleet/hyperfleet-e2e/pkg/helper"
"github.com/openshift-hyperfleet/hyperfleet-e2e/pkg/labels"
)

var _ = ginkgo.Describe("[Suite: cluster][negative] Stuck Deletion -- Adapter Unable to Finalize Prevents Hard-Delete",
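// Serial: this spec mutates process-wide state (the ADAPTER_NAME env var
// and the API deployment's required-adapters list), so it cannot safely
// run alongside other specs.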
ginkgo.Serial,
ginkgo.Label(labels.Tier2, labels.Negative),
func() {
var (
h *helper.Helper
adapterChartPath string
apiChartPath string
baseDeployOpts helper.AdapterDeploymentOptions
)

ginkgo.BeforeEach(func(ctx context.Context) {
h = helper.New()

ginkgo.By("Clone adapter Helm chart repository")
var cleanupAdapterChart func() error
var err error
adapterChartPath, cleanupAdapterChart, err = h.CloneHelmChart(ctx, helper.HelmChartCloneOptions{
Component: "adapter",
RepoURL: h.Cfg.AdapterDeployment.ChartRepo,
Ref: h.Cfg.AdapterDeployment.ChartRef,
ChartPath: h.Cfg.AdapterDeployment.ChartPath,
WorkDir: helper.TestWorkDir,
})
Expect(err).NotTo(HaveOccurred(), "failed to clone adapter Helm chart")

ginkgo.DeferCleanup(func(ctx context.Context) {
ginkgo.By("Cleanup cloned adapter Helm chart")
if err := cleanupAdapterChart(); err != nil {
ginkgo.GinkgoWriter.Printf("Warning: failed to cleanup adapter chart: %v\n", err)
}
})

ginkgo.By("Clone API Helm chart repository")
var cleanupAPIChart func() error
apiChartPath, cleanupAPIChart, err = h.CloneHelmChart(ctx, helper.HelmChartCloneOptions{
Component: "api",
RepoURL: h.Cfg.APIDeployment.ChartRepo,
Ref: h.Cfg.APIDeployment.ChartRef,
ChartPath: h.Cfg.APIDeployment.ChartPath,
WorkDir: helper.TestWorkDir,
})
Expect(err).NotTo(HaveOccurred(), "failed to clone API Helm chart")

ginkgo.DeferCleanup(func(ctx context.Context) {
ginkgo.By("Cleanup cloned API Helm chart")
if err := cleanupAPIChart(); err != nil {
ginkgo.GinkgoWriter.Printf("Warning: failed to cleanup API chart: %v\n", err)
}
})

baseDeployOpts = helper.AdapterDeploymentOptions{
Namespace: h.Cfg.Namespace,
ChartPath: adapterChartPath,
}
})

ginkgo.It("should prevent hard-delete when an adapter cannot finalize",
func(ctx context.Context) {
adapterName := "cl-stuck"

err := os.Setenv("ADAPTER_NAME", adapterName)
Expect(err).NotTo(HaveOccurred(), "failed to set ADAPTER_NAME environment variable")
ginkgo.DeferCleanup(func() {
_ = os.Unsetenv("ADAPTER_NAME")
})

releaseName := helper.GenerateAdapterReleaseName(helper.ResourceTypeClusters, adapterName)

ginkgo.By("Deploy dedicated stuck-adapter")
deployOpts := baseDeployOpts
deployOpts.ReleaseName = releaseName
deployOpts.AdapterName = adapterName

err = h.DeployAdapter(ctx, deployOpts)
ginkgo.DeferCleanup(func(ctx context.Context) {
ginkgo.By("Uninstall stuck-adapter")
if err := h.UninstallAdapter(ctx, releaseName, h.Cfg.Namespace); err != nil {
ginkgo.GinkgoWriter.Printf("Warning: failed to uninstall adapter %s: %v\n", releaseName, err)
}

ginkgo.By("Clean up Pub/Sub subscription")
subscriptionID := h.Cfg.Namespace + "-" + helper.ResourceTypeClusters + "-" + adapterName
if err := h.DeletePubSubSubscription(ctx, subscriptionID); err != nil {
ginkgo.GinkgoWriter.Printf("Warning: failed to delete Pub/Sub subscription %s: %v\n", subscriptionID, err)
}
})
Expect(err).NotTo(HaveOccurred(), "failed to deploy stuck-adapter")
ginkgo.GinkgoWriter.Printf("Deployed stuck-adapter: release=%s\n", releaseName)

ginkgo.By("Upgrade API to add stuck-adapter to required adapters")
originalAdapters := h.GetAPIRequiredClusterAdapters()
updatedAdapters := append(append([]string{}, originalAdapters...), adapterName)

// Register API config restore AFTER adapter cleanup registration (LIFO → executes FIRST)
ginkgo.DeferCleanup(func(ctx context.Context) {
ginkgo.By("Restore API required adapters to original config")
if err := h.RestoreAPIRequiredAdaptersWithRetry(ctx, apiChartPath, h.Cfg.Namespace, originalAdapters, 3); err != nil {
ginkgo.GinkgoWriter.Printf("CRITICAL: %v\n", err)
}
})
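// DeferCleanup runs callbacks last-in-first-out, so this restore fires
// before the adapter uninstall registered above -- the API stops requiring
// the stuck adapter before its Helm release is removed.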

err = h.UpgradeAPIRequiredAdapters(ctx, apiChartPath, h.Cfg.Namespace, updatedAdapters)
Expect(err).NotTo(HaveOccurred(), "failed to upgrade API with stuck-adapter in required adapters")

deploymentName, err := h.GetDeploymentName(ctx, h.Cfg.Namespace, releaseName)
Expect(err).NotTo(HaveOccurred(), "failed to find stuck-adapter deployment name")

ginkgo.By("Create cluster and wait for Reconciled with all adapters including stuck-adapter")
cluster, err := h.Client.CreateClusterFromPayload(ctx, h.TestDataPath("payloads/clusters/cluster-request.json"))
Expect(err).NotTo(HaveOccurred(), "failed to create cluster")
Expect(cluster.Id).NotTo(BeNil(), "cluster ID should be generated")
clusterID := *cluster.Id
ginkgo.GinkgoWriter.Printf("Created cluster ID: %s, Name: %s\n", clusterID, cluster.Name)

ginkgo.DeferCleanup(func(ctx context.Context) {
ginkgo.By("Cleanup test cluster " + clusterID)
if err := h.CleanupTestCluster(ctx, clusterID); err != nil {
ginkgo.GinkgoWriter.Printf("Warning: failed to cleanup cluster %s: %v\n", clusterID, err)
}
})

Eventually(h.PollCluster(ctx, clusterID), h.Cfg.Timeouts.Cluster.Reconciled, h.Cfg.Polling.Interval).
Should(helper.HaveResourceCondition(client.ConditionTypeReconciled, openapi.ResourceConditionStatusTrue))

ginkgo.By("Verify stuck-adapter reported Applied=True")
Eventually(func(g Gomega) {
statuses, err := h.Client.GetClusterStatuses(ctx, clusterID)
g.Expect(err).NotTo(HaveOccurred(), "failed to get cluster statuses")

var found bool
for _, s := range statuses.Items {
if s.Adapter == adapterName {
found = true
g.Expect(h.HasAdapterCondition(s.Conditions,
client.ConditionTypeApplied, openapi.AdapterConditionStatusTrue)).To(BeTrue(),
"stuck-adapter should have Applied=True before scale-down")
break
}
}
g.Expect(found).To(BeTrue(), "stuck-adapter should be present in statuses")
}, h.Cfg.Timeouts.Adapter.Processing, h.Cfg.Polling.Interval).Should(Succeed())

ginkgo.By("Scale down stuck-adapter to simulate unavailability")
err = h.ScaleDeployment(ctx, h.Cfg.Namespace, deploymentName, 0)
Expect(err).NotTo(HaveOccurred(), "failed to scale down stuck-adapter")

ginkgo.By("Soft-delete the cluster")
deletedCluster, err := h.Client.DeleteCluster(ctx, clusterID)
Expect(err).NotTo(HaveOccurred(), "DELETE request should succeed with 202")
Expect(deletedCluster.DeletedTime).NotTo(BeNil(), "soft-deleted cluster should have deleted_time set")

ginkgo.By("Wait for healthy adapters to report Finalized=True")
Eventually(func(g Gomega) {
statuses, err := h.Client.GetClusterStatuses(ctx, clusterID)
g.Expect(err).NotTo(HaveOccurred(), "failed to get cluster statuses")

adapterMap := make(map[string]openapi.AdapterStatus, len(statuses.Items))
for _, s := range statuses.Items {
adapterMap[s.Adapter] = s
}

for _, name := range originalAdapters {
adapter, exists := adapterMap[name]
g.Expect(exists).To(BeTrue(), "adapter %s should be present", name)
g.Expect(h.HasAdapterCondition(adapter.Conditions,
client.ConditionTypeFinalized, openapi.AdapterConditionStatusTrue)).To(BeTrue(),
"adapter %s should have Finalized=True", name)
}
}, h.Cfg.Timeouts.Adapter.Processing, h.Cfg.Polling.Interval).Should(Succeed())

ginkgo.By("Verify cluster remains stuck in soft-deleted state (not hard-deleted)")
Consistently(func(g Gomega) {
cl, err := h.Client.GetCluster(ctx, clusterID)
g.Expect(err).NotTo(HaveOccurred(), "cluster should still be accessible (not hard-deleted)")
g.Expect(cl.DeletedTime).NotTo(BeNil(), "cluster should still be soft-deleted")

g.Expect(h.HasResourceCondition(cl.Status.Conditions,
client.ConditionTypeReconciled, openapi.ResourceConditionStatusFalse)).To(BeTrue(),
"cluster Reconciled should remain False while stuck-adapter is unavailable")

statuses, err := h.Client.GetClusterStatuses(ctx, clusterID)
g.Expect(err).NotTo(HaveOccurred(), "failed to get cluster statuses")

for _, s := range statuses.Items {
if s.Adapter == adapterName {
g.Expect(h.HasAdapterCondition(s.Conditions,
client.ConditionTypeFinalized, openapi.AdapterConditionStatusTrue)).To(BeFalse(),
"stuck-adapter should NOT have Finalized=True while scaled to 0")
break
}
}
}, h.Cfg.Timeouts.Adapter.Processing/2, h.Cfg.Polling.Interval).Should(Succeed())

ginkgo.GinkgoWriter.Printf("Verified: cluster stuck in soft-deleted state, healthy adapters finalized but stuck-adapter has not\n")

ginkgo.By("Restore stuck-adapter by scaling up")
err = h.ScaleDeployment(ctx, h.Cfg.Namespace, deploymentName, 1)
Expect(err).NotTo(HaveOccurred(), "failed to scale up stuck-adapter")

ginkgo.By("Verify cluster is hard-deleted after stuck-adapter recovery")
Eventually(h.PollClusterHTTPStatus(ctx, clusterID), h.Cfg.Timeouts.Cluster.Reconciled, h.Cfg.Polling.Interval).
Should(Equal(http.StatusNotFound))

ginkgo.By("Verify downstream K8s namespace is cleaned up")
Eventually(h.PollNamespacesByPrefix(ctx, clusterID), h.Cfg.Timeouts.Adapter.Processing, h.Cfg.Polling.Interval).
Should(BeEmpty())

ginkgo.GinkgoWriter.Printf("Verified: stuck-adapter recovered, cluster hard-deleted\n")
})
},
)
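For local runs, this spec can be targeted by its labels with standard Ginkgo filtering, e.g. `ginkgo --label-filter='Tier2 && Negative' ./e2e/cluster` (the actual label strings depend on what the `labels` package constants expand to).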
18 changes: 0 additions & 18 deletions hack/extract-schema/main.go

This file was deleted.
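The extractor's job is now handled by the `hack/tools.go` pin plus the Makefile `cp` above. For reference, a minimal sketch of what an `embed.FS`-based extractor like this typically looks like -- purely hypothetical: the exported `FS` variable, embedded path, and build tag are all assumptions, not the deleted code:

```go
//go:build ignore

// Hypothetical sketch of the deleted extractor: read the schema out of
// an embed.FS exported by the spec module and write it to openapi/.
// The exported name "FS" and the embedded path are assumptions.
package main

import (
	"log"
	"os"

	schemas "github.com/openshift-hyperfleet/hyperfleet-api-spec/schemas"
)

func main() {
	data, err := schemas.FS.ReadFile("core/openapi.yaml")
	if err != nil {
		log.Fatalf("read embedded schema: %v", err)
	}
	if err := os.MkdirAll("openapi", 0o755); err != nil {
		log.Fatalf("create output dir: %v", err)
	}
	if err := os.WriteFile("openapi/openapi.yaml", data, 0o644); err != nil {
		log.Fatalf("write schema: %v", err)
	}
}
```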

5 changes: 5 additions & 0 deletions hack/tools.go
@@ -0,0 +1,5 @@
//go:build tools

package hack

import _ "github.com/openshift-hyperfleet/hyperfleet-api-spec/schemas"
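The blank import under the `tools` build constraint is the standard Go tools-dependency pattern: it keeps `hyperfleet-api-spec` pinned in `go.mod` (so `go mod download` and the Makefile's `go list -m` lookup can find it) without compiling the schemas package into any binary.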
5 changes: 5 additions & 0 deletions pkg/helper/adapter.go
@@ -147,6 +147,11 @@ func (h *Helper) DeployAdapter(ctx context.Context, opts AdapterDeploymentOption
"--set", fmt.Sprintf("fullnameOverride=%s", releaseName),
)

// Override image pull policy if set (e.g. IfNotPresent for local kind clusters)
if policy := os.Getenv("IMAGE_PULL_POLICY"); policy != "" {
helmArgs = append(helmArgs, "--set", fmt.Sprintf("image.pullPolicy=%s", policy))
}

// Add additional --set values if provided
for key, value := range opts.SetValues {
helmArgs = append(helmArgs, "--set", fmt.Sprintf("%s=%s", key, value))
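With this override set -- e.g. `IMAGE_PULL_POLICY=IfNotPresent` exported before invoking the suite against a local kind cluster -- Helm installs the adapter with the locally cached image instead of pulling; left unset, the chart's default pull policy applies.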
3 changes: 1 addition & 2 deletions test-design/README.md
@@ -98,8 +98,7 @@ A Tier2 failure means **edge cases or rare scenarios don't work as expected**. T
|-----------|-----------|
| [Cluster reaches correct status after adapter crash and recovery](testcases/cluster.md#test-title-cluster-can-reach-correct-status-after-adapter-crash-and-recovery) | Infrastructure recovery -- involves killing adapter pods and verifying self-healing. Crashes are rare; operators can restart pods manually |
| [Maestro server unavailability graceful handling](testcases/adapter-with-maestro-transport.md#test-title-adapter-can-handle-maestro-server-unavailability-gracefully) | Infrastructure recovery -- simulates Maestro outage. Server failures are rare and recoverable |
| [DELETE during initial creation before Reconciled](testcases/delete-cluster.md#test-title-delete-during-initial-creation-before-cluster-reaches-reconciled) | Race condition -- user deletes a cluster before it finishes creating. Unusual timing scenario |
| [Cascade DELETE while child nodepool is mid-update](testcases/delete-cluster.md#test-title-cascade-delete-on-cluster-while-child-nodepool-is-mid-update-reconciliation) | Race condition -- cluster deletion while nodepool is being updated. Requires specific timing overlap |
| [Stuck deletion -- adapter unable to finalize prevents hard-delete](testcases/delete-cluster.md#test-title-stuck-deletion----adapter-unable-to-finalize-prevents-hard-delete) | Infrastructure recovery -- simulates adapter unavailability during deletion. Verifies the system won't hard-delete prematurely and recovers once the adapter returns |

### Tier Decision Flowchart
