From 4a3076fdf2d8ab8e718b6f94dac00294360d7f14 Mon Sep 17 00:00:00 2001 From: Federico Bonfigli Date: Wed, 15 Apr 2026 16:18:31 +0200 Subject: [PATCH 1/2] Add end to end tests for BYO security group for AWS NLBs Adds end to end tests for BYO security groups for AWS NLBs. These tests are defined here in downstream due to limitations in the upstream test setup. The tests validate the basic flows of provisioning a NLB with a BYO security group and switching back and forth between Managed and BYO security groups. --- .../e2e/helper.go | 272 ++++++++++++ .../e2e/loadbalancer.go | 414 +++++++++++++++++- 2 files changed, 685 insertions(+), 1 deletion(-) diff --git a/cmd/cloud-controller-manager-aws-tests-ext/e2e/helper.go b/cmd/cloud-controller-manager-aws-tests-ext/e2e/helper.go index b1bd409e2..207af47a2 100644 --- a/cmd/cloud-controller-manager-aws-tests-ext/e2e/helper.go +++ b/cmd/cloud-controller-manager-aws-tests-ext/e2e/helper.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "strings" + "time" "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/config" @@ -12,7 +13,10 @@ import ( elbv2 "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2" elbv2types "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2/types" configv1client "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1" + v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + clientset "k8s.io/client-go/kubernetes" "k8s.io/kubernetes/test/e2e/framework" ) @@ -194,3 +198,271 @@ func ec2IsNotFoundError(err error) bool { strings.Contains(errMsg, "InvalidGroupId.NotFound") || strings.Contains(errMsg, "InvalidGroup.Malformed") } + +// isAWSThrottlingError checks if an error is an AWS throttling/rate limit error. +func isAWSThrottlingError(err error) bool { + if err == nil { + return false + } + errMsg := err.Error() + return strings.Contains(errMsg, "Throttling") || + strings.Contains(errMsg, "RequestLimitExceeded") || + strings.Contains(errMsg, "TooManyRequests") || + strings.Contains(errMsg, "RequestThrottled") +} + +// createAWSSecurityGroup creates a test security group. +func createAWSSecurityGroup(ctx context.Context, ec2Client *ec2.Client, name, description, vpcID string) (string, error) { + framework.Logf("creating security group %s in VPC %s", name, vpcID) + + result, err := ec2Client.CreateSecurityGroup(ctx, &ec2.CreateSecurityGroupInput{ + GroupName: &name, + Description: &description, + VpcId: &vpcID, + TagSpecifications: []ec2types.TagSpecification{ + { + ResourceType: ec2types.ResourceTypeSecurityGroup, + Tags: []ec2types.Tag{ + { + Key: aws.String("Name"), + Value: &name, + }, + }, + }, + }, + }) + if err != nil { + return "", fmt.Errorf("failed to create security group %s: %v", name, err) + } + + sgID := aws.ToString(result.GroupId) + framework.Logf("created security group %s with ID %s", name, sgID) + return sgID, nil +} + +// isSecurityGroupManaged checks if a security group is managed by the controller. +// It checks for the cluster ownership tag to determine if the controller owns this security group. +// Managed SGs have tag kubernetes.io/cluster/ = "owned" +func isSecurityGroupManaged(ctx context.Context, ec2Client *ec2.Client, sgID, clusterName string) (bool, error) { + sg, err := getAWSSecurityGroup(ctx, ec2Client, sgID) + if err != nil { + return false, err + } + + // Check for cluster ownership tag + clusterTagKey := fmt.Sprintf("kubernetes.io/cluster/%s", clusterName) + for _, tag := range sg.Tags { + if aws.ToString(tag.Key) == clusterTagKey && aws.ToString(tag.Value) == "owned" { + return true, nil + } + } + return false, nil +} + +// authorizeSecurityGroupIngress adds ingress rules to a security group for the given service ports. +func authorizeSecurityGroupIngress(ctx context.Context, ec2Client *ec2.Client, sgID string, ports []v1.ServicePort) error { + if len(ports) == 0 { + return nil + } + + framework.Logf("authorizing ingress rules for security group %s", sgID) + + ingressRules := make([]ec2types.IpPermission, 0, len(ports)) + for _, port := range ports { + protocol := strings.ToLower(string(port.Protocol)) + rule := ec2types.IpPermission{ + FromPort: aws.Int32(port.Port), + ToPort: aws.Int32(port.Port), + IpProtocol: &protocol, + IpRanges: []ec2types.IpRange{ + { + CidrIp: aws.String("0.0.0.0/0"), + Description: aws.String(fmt.Sprintf("E2E test access for port %d", port.Port)), + }, + }, + } + ingressRules = append(ingressRules, rule) + } + + _, err := ec2Client.AuthorizeSecurityGroupIngress(ctx, &ec2.AuthorizeSecurityGroupIngressInput{ + GroupId: &sgID, + IpPermissions: ingressRules, + }) + if err != nil { + // Check if error is due to duplicate rules (which is acceptable) + if strings.Contains(err.Error(), "InvalidPermission.Duplicate") { + framework.Logf("some rules already exist in security group %s (this is okay)", sgID) + return nil + } + return fmt.Errorf("failed to authorize ingress for security group %s: %v", sgID, err) + } + + framework.Logf("successfully authorized %d ingress rule(s) for security group %s", len(ingressRules), sgID) + return nil +} + +// deleteAWSSecurityGroup deletes a security group. +func deleteAWSSecurityGroup(ctx context.Context, ec2Client *ec2.Client, sgID string) error { + framework.Logf("deleting security group %s", sgID) + + _, err := ec2Client.DeleteSecurityGroup(ctx, &ec2.DeleteSecurityGroupInput{ + GroupId: &sgID, + }) + if err != nil { + // If already deleted, that's okay + if ec2IsNotFoundError(err) { + framework.Logf("security group %s already deleted", sgID) + return nil + } + return fmt.Errorf("failed to delete security group %s: %v", sgID, err) + } + + framework.Logf("successfully deleted security group %s", sgID) + return nil +} + +// waitForSecurityGroupDeletion attempts to delete a security group and waits for it to be deleted. +// It handles dependency violations when the SG is still attached to resources like load balancers. +func waitForSecurityGroupDeletion(ctx context.Context, ec2Client *ec2.Client, sgID string, timeout time.Duration) error { + framework.Logf("waiting for security group %s deletion (timeout: %v)", sgID, timeout) + + return wait.PollUntilContextTimeout(ctx, 5*time.Second, timeout, true, func(pollCtx context.Context) (bool, error) { + // First check if SG still exists + exists, err := securityGroupExists(pollCtx, ec2Client, sgID) + if err != nil { + // Handle throttling errors by continuing to poll + if isAWSThrottlingError(err) { + framework.Logf("AWS throttling encountered while checking security group %s, retrying...", sgID) + return false, nil + } + return false, fmt.Errorf("error checking if security group exists: %v", err) + } + + if !exists { + framework.Logf("security group %s successfully deleted", sgID) + return true, nil + } + + // Try to delete it + err = deleteAWSSecurityGroup(pollCtx, ec2Client, sgID) + if err != nil { + // Handle throttling errors by continuing to poll + if isAWSThrottlingError(err) { + framework.Logf("AWS throttling encountered while deleting security group %s, retrying...", sgID) + return false, nil + } + + // Check for dependency violation errors - keep retrying + if strings.Contains(err.Error(), "DependencyViolation") || + strings.Contains(err.Error(), "InvalidGroup.InUse") || + strings.Contains(err.Error(), "resource has a dependent object") { + framework.Logf("security group %s still has dependencies, retrying...", sgID) + return false, nil // Keep waiting + } + + // Check if it's already deleted + if ec2IsNotFoundError(err) { + framework.Logf("security group %s deleted", sgID) + return true, nil + } + + // For other errors, return the error + return false, err + } + + // Deletion succeeded + return true, nil + }) +} + +// getClusterInstanceID extracts an EC2 instance ID from a cluster node's provider ID. +func getClusterInstanceID(ctx context.Context, cs clientset.Interface) (string, error) { + nodes, err := cs.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if err != nil { + return "", fmt.Errorf("failed to list nodes: %v", err) + } + + if len(nodes.Items) == 0 { + return "", fmt.Errorf("no nodes found in cluster") + } + + // Get instance ID from first node + for _, node := range nodes.Items { + providerID := node.Spec.ProviderID + if providerID == "" { + continue + } + // providerID format: aws:///us-east-1a/i-1234567890abcdef0 + providerID = strings.Replace(providerID, "aws:///", "", 1) + parts := strings.Split(providerID, "/") + if len(parts) < 2 { + continue + } + instanceID := parts[1] + if strings.HasPrefix(instanceID, "i-") { + return instanceID, nil + } + } + + return "", fmt.Errorf("could not find valid instance ID from cluster nodes") +} + +// getClusterVPCID discovers the VPC ID from a cluster node's network interface. +func getClusterVPCID(ctx context.Context, cs clientset.Interface, ec2Client *ec2.Client) (string, error) { + instanceID, err := getClusterInstanceID(ctx, cs) + if err != nil { + return "", err + } + + // Describe instance to get VPC ID + result, err := ec2Client.DescribeInstances(ctx, &ec2.DescribeInstancesInput{ + InstanceIds: []string{instanceID}, + }) + if err != nil { + return "", fmt.Errorf("failed to describe instance %s: %v", instanceID, err) + } + + if len(result.Reservations) == 0 || len(result.Reservations[0].Instances) == 0 { + return "", fmt.Errorf("instance %s not found", instanceID) + } + + vpcID := aws.ToString(result.Reservations[0].Instances[0].VpcId) + if vpcID == "" { + return "", fmt.Errorf("VPC ID not found for instance %s", instanceID) + } + + framework.Logf("discovered cluster VPC ID: %s", vpcID) + return vpcID, nil +} + +// getClusterName discovers the cluster name from a cluster node's tags. +func getClusterName(ctx context.Context, cs clientset.Interface, ec2Client *ec2.Client) (string, error) { + instanceID, err := getClusterInstanceID(ctx, cs) + if err != nil { + return "", err + } + + // Describe instance to get cluster tag + result, err := ec2Client.DescribeInstances(ctx, &ec2.DescribeInstancesInput{ + InstanceIds: []string{instanceID}, + }) + if err != nil { + return "", fmt.Errorf("failed to describe instance %s: %v", instanceID, err) + } + + if len(result.Reservations) == 0 || len(result.Reservations[0].Instances) == 0 { + return "", fmt.Errorf("instance %s not found", instanceID) + } + + // Find cluster tag (kubernetes.io/cluster/) + for _, tag := range result.Reservations[0].Instances[0].Tags { + key := aws.ToString(tag.Key) + if after, ok := strings.CutPrefix(key, "kubernetes.io/cluster/"); ok { + clusterName := after + framework.Logf("discovered cluster name: %s", clusterName) + return clusterName, nil + } + } + + return "", fmt.Errorf("cluster tag not found on instance %s", instanceID) +} diff --git a/cmd/cloud-controller-manager-aws-tests-ext/e2e/loadbalancer.go b/cmd/cloud-controller-manager-aws-tests-ext/e2e/loadbalancer.go index 0bdfc8120..26c0d6207 100644 --- a/cmd/cloud-controller-manager-aws-tests-ext/e2e/loadbalancer.go +++ b/cmd/cloud-controller-manager-aws-tests-ext/e2e/loadbalancer.go @@ -3,9 +3,11 @@ package e2e import ( "context" "fmt" + "slices" "strings" "time" + "github.com/aws/aws-sdk-go-v2/aws" elbv2 "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2" elbv2types "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2/types" . "github.com/onsi/ginkgo/v2" @@ -30,7 +32,8 @@ const ( // when available: features.FeatureGateAWSServiceLBNetworkSecurityGroup featureGateAWSServiceLBNetworkSecurityGroup = "AWSServiceLBNetworkSecurityGroup" - annotationLBType = "service.beta.kubernetes.io/aws-load-balancer-type" + annotationLBType = "service.beta.kubernetes.io/aws-load-balancer-type" + annotationLBSecurityGroups = "service.beta.kubernetes.io/aws-load-balancer-security-groups" cloudConfigNamespace = "openshift-cloud-controller-manager" cloudConfigName = "cloud-conf" @@ -496,6 +499,415 @@ var _ = Describe(fmt.Sprintf("%s NLB [OCPFeatureGate:%s]", e2eTestPrefixLoadBala Expect(expectedPorts).To(ContainElements(int32(80), int32(443)), "security groups should include rules for ports 80 and 443") }) + + // Test: [cloud-provider-aws-e2e-openshift] loadbalancer NLB [OCPFeatureGate:AWSServiceLBNetworkSecurityGroup] should create NLB service with BYO security group and preserve it after deletion + // + // Creates a new Service type loadBalancer Network Load Balancer (NLB) with a user-provided (BYO) + // security group annotation, validates that the specified security group is attached to the NLB, + // then deletes the service and validates that the BYO security group is preserved (not deleted). + // + // Prerequisites: + // - AWSServiceLBNetworkSecurityGroup feature gate is enabled + // + // Expected Results: + // - BYO security group is created successfully + // - Service type loadBalancer Network Load Balancer (NLB) is created with BYO SG annotation + // - Backend pods start and become ready + // - Load balancer has the BYO security group attached (not managed SG) + // - BYO security group has no cluster tag (not "owned") + // - Service and load balancer are deleted successfully + // - BYO security group is NOT deleted (user retains ownership) + // - BYO security group still exists in AWS after service deletion + // - The test must fail if BYO security group is not attached or is deleted + // - The test must skip if the feature gate is not enabled + It("should create NLB with BYO SG and preserve it after deletion", func(ctx context.Context) { + isNLBFeatureEnabled(ctx) + + By("creating required AWS clients") + ec2Client, err := createAWSClientEC2(ctx) + framework.ExpectNoError(err, "failed to create AWS EC2 client") + + elbClient, err := createAWSClientLoadBalancer(ctx) + framework.ExpectNoError(err, "failed to create AWS ELB client") + + By("discovering cluster VPC and name for BYO security group creation") + vpcID, err := getClusterVPCID(ctx, cs, ec2Client) + framework.ExpectNoError(err, "failed to get cluster VPC ID") + + clusterName, err := getClusterName(ctx, cs, ec2Client) + framework.ExpectNoError(err, "failed to get cluster name") + + By("creating BYO security group for testing") + sgName := fmt.Sprintf("e2e-nlb-byo-sg-create-%s", ns.Name) + sgDescription := fmt.Sprintf("BYO Security Group for e2e test %s", ns.Name) + byoSGID, err := createAWSSecurityGroup(ctx, ec2Client, sgName, sgDescription, vpcID) + framework.ExpectNoError(err, "failed to create BYO security group") + framework.Logf("created BYO security group: %s", byoSGID) + + // Add cleanup for BYO security group + DeferCleanup(func(cleanupCtx context.Context) { + By("cleaning up BYO security group") + err := waitForSecurityGroupDeletion(cleanupCtx, ec2Client, byoSGID, 5*time.Minute) + if err != nil { + framework.Logf("warning: failed to delete BYO security group %s: %v", byoSGID, err) + } + }) + + By("adding ingress rules to BYO security group") + ports := []v1.ServicePort{{Port: 80, Protocol: v1.ProtocolTCP}} + err = authorizeSecurityGroupIngress(ctx, ec2Client, byoSGID, ports) + framework.ExpectNoError(err, "failed to authorize ingress for BYO security group") + + By("creating test service with BYO security group annotation") + serviceName := "nlb-byo-sg-create" + jig := e2eservice.NewTestJig(cs, ns.Name, serviceName) + + svc := &v1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: jig.Namespace, + Name: jig.Name, + Labels: jig.Labels, + Annotations: map[string]string{ + annotationLBType: "nlb", + annotationLBSecurityGroups: byoSGID, + }, + }, + Spec: v1.ServiceSpec{ + Type: v1.ServiceTypeLoadBalancer, + SessionAffinity: v1.ServiceAffinityNone, + Selector: jig.Labels, + Ports: []v1.ServicePort{ + { + Name: "http", + Protocol: v1.ProtocolTCP, + Port: 80, + TargetPort: intstr.FromInt(8080), + }, + }, + }, + } + + _, err = jig.Client.CoreV1().Services(jig.Namespace).Create(ctx, svc, metav1.CreateOptions{}) + framework.ExpectNoError(err, "failed to create LoadBalancer Service") + + By("waiting for AWS load balancer provisioning") + loadBalancerCreateTimeout := e2eservice.GetServiceLoadBalancerCreationTimeout(ctx, cs) + svc, err = jig.WaitForLoadBalancer(ctx, loadBalancerCreateTimeout) + framework.ExpectNoError(err, "LoadBalancer provisioning failed") + + By("extracting load balancer DNS name") + Expect(len(svc.Status.LoadBalancer.Ingress)).To(BeNumerically(">", 0), + "no ingress entry found in LoadBalancer status") + lbDNS := svc.Status.LoadBalancer.Ingress[0].Hostname + framework.Logf("load balancer DNS: %s", lbDNS) + + By("getting NLB from AWS API") + foundLB, err := getAWSLoadBalancerFromDNSName(ctx, elbClient, lbDNS) + framework.ExpectNoError(err, "failed to find load balancer with DNS name %s", lbDNS) + Expect(foundLB).NotTo(BeNil(), "found load balancer is nil") + + By("verifying BYO security group is attached to the NLB") + Expect(len(foundLB.SecurityGroups)).To(BeNumerically(">", 0), + "load balancer should have security groups attached") + + foundBYOSG := slices.Contains(foundLB.SecurityGroups, byoSGID) + Expect(foundBYOSG).To(BeTrue(), + "load balancer should have the BYO security group %s attached, but found: %v", byoSGID, foundLB.SecurityGroups) + + By("verifying BYO security group is not cluster owned") + isManaged, err := isSecurityGroupManaged(ctx, ec2Client, byoSGID, clusterName) + framework.ExpectNoError(err, "failed to check if BYO security group is managed") + Expect(isManaged).To(BeFalse(), + "BYO security group should NOT be managed by the controller (should not have 'owned' tag)") + + framework.Logf("successfully validated NLB with BYO security group %s", byoSGID) + + By("verifying BYO security group exists before service deletion") + exists, err := securityGroupExists(ctx, ec2Client, byoSGID) + framework.ExpectNoError(err, "failed to check if BYO security group exists") + Expect(exists).To(BeTrue(), "BYO security group %s should exist before service deletion", byoSGID) + + By("deleting the service") + err = deleteServiceAndWaitForLoadBalancerDeletion(ctx, jig, lbDNS) + framework.ExpectNoError(err, "failed to delete service and wait for load balancer deletion") + framework.Logf("service and load balancer deleted successfully") + + By("verifying BYO security group STILL EXISTS after service deletion") + exists, err = securityGroupExists(ctx, ec2Client, byoSGID) + framework.ExpectNoError(err, "failed to check if BYO security group exists after deletion") + Expect(exists).To(BeTrue(), + "BYO security group %s should NOT be deleted by the controller (user retains ownership)", byoSGID) + + By("verifying BYO security group properties are unchanged") + sg, err := getAWSSecurityGroup(ctx, ec2Client, byoSGID) + framework.ExpectNoError(err, "failed to get BYO security group after service deletion") + Expect(sg).NotTo(BeNil(), "BYO security group should be retrievable after service deletion") + Expect(aws.ToString(sg.GroupName)).To(Equal(sgName), + "BYO security group name should be unchanged") + + // Verify it's still marked as not cluster owned + isManaged, err = isSecurityGroupManaged(ctx, ec2Client, byoSGID, clusterName) + framework.ExpectNoError(err, "failed to check if BYO security group is managed") + Expect(isManaged).To(BeFalse(), + "BYO security group should still be user-managed (not controller-owned) after service deletion") + + framework.Logf("successfully validated that BYO security group %s was preserved after service deletion", byoSGID) + }) + + // Test: [cloud-provider-aws-e2e-openshift] loadbalancer NLB [OCPFeatureGate:AWSServiceLBNetworkSecurityGroup] should transition NLB between managed and BYO security groups + // + // Creates a Service type loadBalancer Network Load Balancer (NLB) with managed security groups, + // transitions to a user-provided (BYO) security group, then transitions back to managed security groups, + // validating the full round-trip and verifying BYO security group preservation. + // + // Prerequisites: + // - AWSServiceLBNetworkSecurityGroup feature gate is enabled + // + // Expected Results: + // - Service type loadBalancer Network Load Balancer (NLB) is created with managed SG initially + // - Managed security group is attached and has cluster ownership tag ("owned") + // - Service is updated with BYO security group annotation + // - Controller transitions from managed SG to BYO SG + // - Load balancer has BYO security group attached after first transition + // - Old managed security groups are deleted by the controller + // - Service is updated to remove BYO security group annotation + // - Controller transitions back from BYO SG to managed SG + // - Load balancer has new managed security group attached after second transition + // - BYO security group is preserved (not deleted) throughout + // - The test must fail if any transition doesn't occur correctly + // - The test must skip if the feature gate is not enabled + It("should transition NLB between managed and BYO security groups", func(ctx context.Context) { + isNLBFeatureEnabled(ctx) + + By("creating required AWS clients") + ec2Client, err := createAWSClientEC2(ctx) + framework.ExpectNoError(err, "failed to create AWS EC2 client") + + elbClient, err := createAWSClientLoadBalancer(ctx) + framework.ExpectNoError(err, "failed to create AWS ELB client") + + By("discovering cluster name for security group management") + clusterName, err := getClusterName(ctx, cs, ec2Client) + framework.ExpectNoError(err, "failed to get cluster name") + + By("creating test service with managed security groups (no BYO annotation)") + serviceName := "nlb-sg-update" + _, jig, err := createServiceNLB(ctx, cs, ns, serviceName, map[int32]int32{80: 8080}) + framework.ExpectNoError(err, "failed to create NLB service load balancer") + + foundLB, lbDNS, err := getNLBMetaFromName(ctx, cs, ns, serviceName, elbClient) + framework.ExpectNoError(err, "failed to get NLB metadata") + Expect(foundLB).NotTo(BeNil(), "found load balancer is nil") + + DeferCleanup(func(cleanupCtx context.Context) { + err := deleteServiceAndWaitForLoadBalancerDeletion(cleanupCtx, jig, lbDNS) + framework.ExpectNoError(err, "failed to delete service and wait for load balancer deletion") + }) + + By("verifying managed security groups are attached initially") + Expect(len(foundLB.SecurityGroups)).To(BeNumerically(">", 0), + "load balancer should have managed security groups attached initially") + initialManagedSGs := foundLB.SecurityGroups + framework.Logf("initial managed security groups: %v", initialManagedSGs) + + By("verifying initial security groups are managed by the controller") + for _, sgID := range initialManagedSGs { + isManaged, err := isSecurityGroupManaged(ctx, ec2Client, sgID, clusterName) + framework.ExpectNoError(err, "failed to check if security group %s is managed", sgID) + Expect(isManaged).To(BeTrue(), + "initial security group %s should be managed by the controller", sgID) + } + + By("discovering cluster VPC for BYO security group creation") + vpcID, err := getClusterVPCID(ctx, cs, ec2Client) + framework.ExpectNoError(err, "failed to get cluster VPC ID") + + By("creating BYO security group for transition") + sgName := fmt.Sprintf("e2e-nlb-byo-sg-update-%s", ns.Name) + sgDescription := fmt.Sprintf("BYO Security Group for e2e test %s", ns.Name) + byoSGID, err := createAWSSecurityGroup(ctx, ec2Client, sgName, sgDescription, vpcID) + framework.ExpectNoError(err, "failed to create BYO security group") + framework.Logf("created BYO security group: %s", byoSGID) + + // Add cleanup for BYO security group + DeferCleanup(func(cleanupCtx context.Context) { + By("cleaning up BYO security group") + err := waitForSecurityGroupDeletion(cleanupCtx, ec2Client, byoSGID, 5*time.Minute) + if err != nil { + framework.Logf("warning: failed to delete BYO security group %s: %v", byoSGID, err) + } + }) + + By("adding ingress rules to BYO security group") + ports := []v1.ServicePort{{Port: 80, Protocol: v1.ProtocolTCP}} + err = authorizeSecurityGroupIngress(ctx, ec2Client, byoSGID, ports) + framework.ExpectNoError(err, "failed to authorize ingress for BYO security group") + + By("updating service to use BYO security group annotation") + _, err = jig.UpdateService(ctx, func(s *v1.Service) { + if s.Annotations == nil { + s.Annotations = make(map[string]string) + } + s.Annotations[annotationLBSecurityGroups] = byoSGID + }) + framework.ExpectNoError(err, "failed to update service with BYO security group annotation") + framework.Logf("service updated with BYO security group annotation: %s", byoSGID) + + By("waiting for controller to reconcile and attach BYO security group") + var updatedLB *elbv2types.LoadBalancer + err = wait.PollUntilContextTimeout(ctx, 10*time.Second, 3*time.Minute, true, func(pollCtx context.Context) (bool, error) { + select { + case <-pollCtx.Done(): + return false, pollCtx.Err() + default: + } + + lb, err := getAWSLoadBalancerFromDNSName(pollCtx, elbClient, lbDNS) + if err != nil { + framework.Logf("error getting load balancer: %v", err) + return false, nil + } + if lb == nil { + framework.Logf("load balancer not found yet") + return false, nil + } + + // Check if BYO SG is attached + if slices.Contains(lb.SecurityGroups, byoSGID) { + updatedLB = lb + framework.Logf("BYO security group %s is now attached to load balancer", byoSGID) + return true, nil + } + + framework.Logf("BYO security group not yet attached, current SGs: %v", lb.SecurityGroups) + return false, nil + }) + framework.ExpectNoError(err, "BYO security group should be attached to load balancer after service update") + Expect(updatedLB).NotTo(BeNil(), "updated load balancer should not be nil") + + By("verifying BYO security group is attached to the NLB") + foundBYOSG := slices.Contains(updatedLB.SecurityGroups, byoSGID) + Expect(foundBYOSG).To(BeTrue(), + "load balancer should have BYO security group %s attached after update", byoSGID) + + By("verifying old managed security groups are deleted") + err = wait.PollUntilContextTimeout(ctx, 10*time.Second, 3*time.Minute, true, func(pollCtx context.Context) (bool, error) { + select { + case <-pollCtx.Done(): + return false, pollCtx.Err() + default: + } + + allDeleted := true + for _, sgID := range initialManagedSGs { + exists, err := securityGroupExists(pollCtx, ec2Client, sgID) + if err != nil { + framework.Logf("error checking if managed SG %s exists: %v", sgID, err) + return false, nil + } + if exists { + framework.Logf("managed security group %s still exists, waiting for cleanup...", sgID) + allDeleted = false + } else { + framework.Logf("managed security group %s was successfully deleted", sgID) + } + } + return allDeleted, nil + }) + framework.ExpectNoError(err, "old managed security groups should be deleted after BYO SG is attached") + + framework.Logf("successfully validated transition from managed SG to BYO SG %s", byoSGID) + + // Round-trip: Transition back from BYO to managed + By("updating service to remove BYO security group annotation (transition back to managed)") + _, err = jig.UpdateService(ctx, func(s *v1.Service) { + delete(s.Annotations, annotationLBSecurityGroups) + }) + framework.ExpectNoError(err, "failed to update service to remove BYO security group annotation") + framework.Logf("service updated to remove BYO security group annotation") + + By("waiting for controller to reconcile and attach new managed security group") + var finalLB *elbv2types.LoadBalancer + var newManagedSGIDs []string + err = wait.PollUntilContextTimeout(ctx, 10*time.Second, 3*time.Minute, true, func(pollCtx context.Context) (bool, error) { + select { + case <-pollCtx.Done(): + return false, pollCtx.Err() + default: + } + + lb, err := getAWSLoadBalancerFromDNSName(pollCtx, elbClient, lbDNS) + if err != nil { + framework.Logf("error getting load balancer: %v", err) + return false, nil + } + if lb == nil { + framework.Logf("load balancer not found yet") + return false, nil + } + + // Check if BYO SG is removed + hasBYO := slices.Contains(lb.SecurityGroups, byoSGID) + if hasBYO { + framework.Logf("BYO security group still attached, waiting for transition back to managed...") + return false, nil + } + + // Must have at least one security group + if len(lb.SecurityGroups) == 0 { + framework.Logf("no security groups attached yet") + return false, nil + } + + // Check if the new SG is managed + for _, sgID := range lb.SecurityGroups { + managed, err := isSecurityGroupManaged(pollCtx, ec2Client, sgID, clusterName) + if err != nil { + framework.Logf("error checking if SG %s is managed: %v", sgID, err) + return false, nil + } + if managed { + finalLB = lb + newManagedSGIDs = lb.SecurityGroups + framework.Logf("new managed security groups attached: %v", newManagedSGIDs) + return true, nil + } + } + + framework.Logf("waiting for managed security groups to be created and attached") + return false, nil + }) + framework.ExpectNoError(err, "new managed security group should be created and attached after removing BYO annotation") + Expect(finalLB).NotTo(BeNil(), "final load balancer should not be nil") + Expect(len(newManagedSGIDs)).To(BeNumerically(">", 0), "should have new managed security groups attached") + + By("verifying BYO security group is no longer attached after transition to managed") + hasBYO := slices.Contains(finalLB.SecurityGroups, byoSGID) + Expect(hasBYO).To(BeFalse(), + "BYO security group %s should no longer be attached to load balancer after transition", byoSGID) + + By("verifying new managed security groups are controller-owned") + for _, sgID := range newManagedSGIDs { + isManaged, err := isSecurityGroupManaged(ctx, ec2Client, sgID, clusterName) + framework.ExpectNoError(err, "failed to check if security group %s is managed", sgID) + Expect(isManaged).To(BeTrue(), + "security group %s should be managed by the controller (have 'owned' tag)", sgID) + } + + By("verifying BYO security group still exists (preserved after round-trip)") + exists, err := securityGroupExists(ctx, ec2Client, byoSGID) + framework.ExpectNoError(err, "failed to check if BYO security group exists") + Expect(exists).To(BeTrue(), + "BYO security group %s should NOT be deleted when transitioning back to managed (user retains ownership)", byoSGID) + + By("verifying BYO security group is not managed by controller") + isManaged, err := isSecurityGroupManaged(ctx, ec2Client, byoSGID, clusterName) + framework.ExpectNoError(err, "failed to check if BYO security group is managed") + Expect(isManaged).To(BeFalse(), + "BYO security group should still be user-managed after round-trip") + + framework.Logf("successfully validated round-trip: managed → BYO SG %s → managed, with BYO SG preserved", byoSGID) + }) }) // createServiceNLB creates a Service type loadBalancer Network Load Balancer (NLB) with the given port mapping. From a2b197dafecc43f29292a148a810a02b7eee4d55 Mon Sep 17 00:00:00 2001 From: Federico Bonfigli Date: Wed, 22 Apr 2026 11:48:03 +0200 Subject: [PATCH 2/2] Use AWS SDK logic to detect retryable errors Uses the AWS SDK logic to detect API operations that fail due to retryable errors, like throttling or network issues. --- .../e2e/helper.go | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/cmd/cloud-controller-manager-aws-tests-ext/e2e/helper.go b/cmd/cloud-controller-manager-aws-tests-ext/e2e/helper.go index 207af47a2..69f85b0da 100644 --- a/cmd/cloud-controller-manager-aws-tests-ext/e2e/helper.go +++ b/cmd/cloud-controller-manager-aws-tests-ext/e2e/helper.go @@ -7,6 +7,7 @@ import ( "time" "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/aws/retry" "github.com/aws/aws-sdk-go-v2/config" ec2 "github.com/aws/aws-sdk-go-v2/service/ec2" ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" @@ -199,16 +200,13 @@ func ec2IsNotFoundError(err error) bool { strings.Contains(errMsg, "InvalidGroup.Malformed") } -// isAWSThrottlingError checks if an error is an AWS throttling/rate limit error. -func isAWSThrottlingError(err error) bool { +// isAWSRetryableError checks if an error is retryable using AWS SDK's standard retry logic. +func isAWSRetryableError(err error) bool { if err == nil { return false } - errMsg := err.Error() - return strings.Contains(errMsg, "Throttling") || - strings.Contains(errMsg, "RequestLimitExceeded") || - strings.Contains(errMsg, "TooManyRequests") || - strings.Contains(errMsg, "RequestThrottled") + result := retry.IsErrorRetryables(retry.DefaultRetryables).IsErrorRetryable(err) + return result == aws.TrueTernary } // createAWSSecurityGroup creates a test security group. @@ -330,8 +328,8 @@ func waitForSecurityGroupDeletion(ctx context.Context, ec2Client *ec2.Client, sg // First check if SG still exists exists, err := securityGroupExists(pollCtx, ec2Client, sgID) if err != nil { - // Handle throttling errors by continuing to poll - if isAWSThrottlingError(err) { + // Handle retryable errors by continuing to poll + if isAWSRetryableError(err) { framework.Logf("AWS throttling encountered while checking security group %s, retrying...", sgID) return false, nil } @@ -346,8 +344,8 @@ func waitForSecurityGroupDeletion(ctx context.Context, ec2Client *ec2.Client, sg // Try to delete it err = deleteAWSSecurityGroup(pollCtx, ec2Client, sgID) if err != nil { - // Handle throttling errors by continuing to poll - if isAWSThrottlingError(err) { + // Handle retryable errors by continuing to poll + if isAWSRetryableError(err) { framework.Logf("AWS throttling encountered while deleting security group %s, retrying...", sgID) return false, nil }