From 9eb82e15d48c5fa51666b53b64993bf812a93e15 Mon Sep 17 00:00:00 2001 From: Cristiano Veiga Date: Thu, 2 Apr 2026 16:30:43 -0400 Subject: [PATCH] fix(hypershift/gcp): reconstruct resource names when SHARED_DIR is empty When the provision step is aborted (SIGTERM), the SHARED_DIR Kubernetes Secret may not be updated, leaving post steps with no project IDs to clean up. Since resource names are deterministic (derived from NAMESPACE and UNIQUE_HASH env vars), the deprovision step can reconstruct them as a fallback. Co-Authored-By: Claude Opus 4.6 --- ...hypershift-gcp-gke-deprovision-commands.sh | 41 ++++++++++++------- .../hypershift-gcp-gke-deprovision-ref.yaml | 14 +++++-- 2 files changed, 37 insertions(+), 18 deletions(-) diff --git a/ci-operator/step-registry/hypershift/gcp/gke/deprovision/hypershift-gcp-gke-deprovision-commands.sh b/ci-operator/step-registry/hypershift/gcp/gke/deprovision/hypershift-gcp-gke-deprovision-commands.sh index 80b4e09a84f13..a717d45b9f5ea 100644 --- a/ci-operator/step-registry/hypershift/gcp/gke/deprovision/hypershift-gcp-gke-deprovision-commands.sh +++ b/ci-operator/step-registry/hypershift/gcp/gke/deprovision/hypershift-gcp-gke-deprovision-commands.sh @@ -6,18 +6,28 @@ set -euo pipefail GCP_CREDS_FILE="${CLUSTER_PROFILE_DIR}/credentials.json" gcloud auth activate-service-account --key-file="${GCP_CREDS_FILE}" -# Check if provision completed - if not, nothing to clean up -if [[ ! -f "${SHARED_DIR}/control-plane-project-id" ]]; then - echo "No control-plane-project-id file found - provision may not have completed" - echo "Nothing to deprovision, exiting successfully" - exit 0 +# Load cluster info from SHARED_DIR (written by provision step). +# If SHARED_DIR files are missing (provision was aborted before the Secret was synced), +# reconstruct resource names from env vars since they are deterministic. 
+if [[ -f "${SHARED_DIR}/control-plane-project-id" ]]; then + CP_PROJECT_ID="$(<"${SHARED_DIR}/control-plane-project-id")" + CP_CLUSTER_NAME="$(<"${SHARED_DIR}/control-plane-cluster-name")" + GCP_REGION="$(<"${SHARED_DIR}/gcp-region")" +else + # SHARED_DIR is backed by a Kubernetes Secret that is updated after the step exits. + # If the provision step is aborted (SIGTERM), the Secret update may not complete, + # leaving SHARED_DIR empty for post steps. Reconstruct from env vars. + RESOURCE_NAME_PREFIX="${NAMESPACE}-${UNIQUE_HASH}" + INFRA_ID="${RESOURCE_NAME_PREFIX}" + CP_PROJECT_ID="${INFRA_ID:0:14}-control-plane" + CP_CLUSTER_NAME="${RESOURCE_NAME_PREFIX}-gke" + GCP_REGION="${GKE_REGION}" + echo "WARNING: SHARED_DIR files missing - reconstructed resource names from env vars" + echo " CP_PROJECT_ID=${CP_PROJECT_ID}" + echo " CP_CLUSTER_NAME=${CP_CLUSTER_NAME}" + echo " GCP_REGION=${GCP_REGION}" fi -# Load cluster info from provision and hosted-cluster-setup steps -CP_PROJECT_ID="$(<"${SHARED_DIR}/control-plane-project-id")" -CP_CLUSTER_NAME="$(<"${SHARED_DIR}/control-plane-cluster-name")" -GCP_REGION="$(<"${SHARED_DIR}/gcp-region")" - # hosted-cluster-name may not exist if job was aborted before hosted-cluster-setup ran if [[ -f "${SHARED_DIR}/hosted-cluster-name" ]]; then HC_CLUSTER_NAME="$(<"${SHARED_DIR}/hosted-cluster-name")" @@ -27,12 +37,13 @@ else echo "Will skip DNS cleanup but still deprovision GKE cluster and projects" fi -# Hosted Cluster project file path (may not exist if provision failed early) -HC_PROJECT_FILE="${SHARED_DIR}/hosted-cluster-project-id" -if [[ -f "${HC_PROJECT_FILE}" ]]; then - HC_PROJECT_ID="$(<"${HC_PROJECT_FILE}")" +# Hosted Cluster project - read from SHARED_DIR or reconstruct +if [[ -f "${SHARED_DIR}/hosted-cluster-project-id" ]]; then + HC_PROJECT_ID="$(<"${SHARED_DIR}/hosted-cluster-project-id")" else - HC_PROJECT_ID="" + INFRA_ID="${INFRA_ID:-${NAMESPACE}-${UNIQUE_HASH}}" + HC_PROJECT_ID="${INFRA_ID:0:14}-hosted-cluster" + 
echo "WARNING: Reconstructed HC_PROJECT_ID=${HC_PROJECT_ID}" fi set -x diff --git a/ci-operator/step-registry/hypershift/gcp/gke/deprovision/hypershift-gcp-gke-deprovision-ref.yaml b/ci-operator/step-registry/hypershift/gcp/gke/deprovision/hypershift-gcp-gke-deprovision-ref.yaml index 1e8c0291b654f..6a2ff96f6b7dc 100644 --- a/ci-operator/step-registry/hypershift/gcp/gke/deprovision/hypershift-gcp-gke-deprovision-ref.yaml +++ b/ci-operator/step-registry/hypershift/gcp/gke/deprovision/hypershift-gcp-gke-deprovision-ref.yaml @@ -15,6 +15,9 @@ ref: - name: HYPERSHIFT_GCP_CI_DNS_DOMAIN default: "" documentation: "DNS domain for HyperShift CI hosted clusters" + - name: GKE_REGION + default: "us-central1" + documentation: "GCP region for the GKE cluster (used as fallback when SHARED_DIR is unavailable)" resources: requests: cpu: 100m @@ -34,8 +37,13 @@ ref: deleting the cluster first and waiting for completion, we ensure the Control Plane project can be deleted cleanly. - Reads from ${SHARED_DIR}: + Reads from ${SHARED_DIR} (when available): - control-plane-project-id: Control Plane project ID - hosted-cluster-project-id: Hosted Cluster project ID (optional) - - cluster-name: GKE cluster name - - gcp-region: GCP region \ No newline at end of file + - control-plane-cluster-name: GKE cluster name + - gcp-region: GCP region + + If SHARED_DIR files are missing (e.g. provision step was aborted before + the Kubernetes Secret backing SHARED_DIR was synced), resource names are + reconstructed from NAMESPACE and UNIQUE_HASH env vars using the same + naming logic as hypershift-gcp-gke-provision; the region falls back to GKE_REGION. \ No newline at end of file