From 7fc37a0c16b58762e7c88f482531a27d6c6842a5 Mon Sep 17 00:00:00 2001 From: Arjun Date: Mon, 2 Feb 2026 08:20:18 +0000 Subject: [PATCH] Fix flaky failures in the GPU Operator CI e2e tests Signed-off-by: Arjun --- tests/scripts/update-nvidiadriver.sh | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/scripts/update-nvidiadriver.sh b/tests/scripts/update-nvidiadriver.sh index c60db6560..70a49ff5b 100755 --- a/tests/scripts/update-nvidiadriver.sh +++ b/tests/scripts/update-nvidiadriver.sh @@ -50,7 +50,19 @@ test_custom_labels_override() { exit 1 fi - # The labels override triggers a rollout of all gpu-operator operands, so we wait for the driver upgrade to transition to "upgrade-done" state. + # Wait for the operator to update the pod template with new labels + echo "Waiting for DaemonSet pod template to be updated with new labels..." + kubectl wait daemonset \ -l "app.kubernetes.io/component=nvidia-driver" \ -n "$TEST_NAMESPACE" \ --for=jsonpath='{.spec.template.metadata.labels.cloudprovider}'=aws \ --timeout=120s + + # Delete driver pod to force recreation with updated labels. Existing pods are not automatically restarted due to the DaemonSet's `OnDelete` updateStrategy. + echo "Deleting driver pod to trigger recreation with updated labels..." + kubectl delete pod -l "app.kubernetes.io/component=nvidia-driver" -n "$TEST_NAMESPACE" + + # Wait for the driver upgrade to transition to "upgrade-done" state wait_for_driver_upgrade_done check_nvidia_driver_pods_ready