From 604fe39b0fc31747e9e31f593755603f8aa8a917 Mon Sep 17 00:00:00 2001 From: Jian Zhang Date: Thu, 4 Dec 2025 11:06:51 +0800 Subject: [PATCH] Update e2e test cases to support replicas: 2 Co-authored-by: Todd Short --- Makefile | 6 +- hack/test/pre-upgrade-setup.sh | 4 +- ...mv1-system-catalogd-controller-manager.yml | 4 +- ...operator-controller-controller-manager.yml | 4 +- helm/olmv1/values.yaml | 2 + helm/replicas-2.yaml | 9 ++ manifests/experimental-e2e.yaml | 8 +- manifests/experimental.yaml | 4 +- manifests/standard-e2e.yaml | 4 +- manifests/standard.yaml | 4 +- test/e2e/cluster_extension_install_test.go | 24 ++- test/e2e/single_namespace_support_test.go | 6 +- test/e2e/webhook_support_test.go | 5 +- test/helpers/helpers.go | 18 ++- test/upgrade-e2e/post_upgrade_test.go | 150 +++++++++++++++--- testdata/build-test-registry.sh | 4 +- 16 files changed, 197 insertions(+), 59 deletions(-) create mode 100644 helm/replicas-2.yaml diff --git a/Makefile b/Makefile index c7d9ea5cfe..b6b45c4e13 100644 --- a/Makefile +++ b/Makefile @@ -158,7 +158,7 @@ MANIFESTS ?= $(STANDARD_MANIFEST) $(STANDARD_E2E_MANIFEST) $(EXPERIMENTAL_MANIFE $(STANDARD_MANIFEST) ?= helm/cert-manager.yaml $(STANDARD_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/e2e.yaml $(EXPERIMENTAL_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml -$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml +$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml helm/replicas-2.yaml HELM_SETTINGS ?= .PHONY: $(MANIFESTS) $(MANIFESTS): $(HELM) @@ -484,8 +484,8 @@ run-experimental: run-internal #HELP Build the operator-controller then deploy i CATD_NAMESPACE := olmv1-system .PHONY: wait wait: - kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=60s - kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert # Avoid upgrade test flakes when reissuing cert + kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=3m + kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert --timeout=3m # Avoid upgrade test flakes when reissuing cert .PHONY: docker-build docker-build: build-linux #EXHELP Build docker image for operator-controller and catalog with GOOS=linux and local GOARCH.
diff --git a/hack/test/pre-upgrade-setup.sh b/hack/test/pre-upgrade-setup.sh index 669f9da37f..e52c88b467 100755 --- a/hack/test/pre-upgrade-setup.sh +++ b/hack/test/pre-upgrade-setup.sh @@ -155,5 +155,5 @@ spec: version: 1.0.0 EOF -kubectl wait --for=condition=Serving --timeout=60s ClusterCatalog $TEST_CLUSTER_CATALOG_NAME -kubectl wait --for=condition=Installed --timeout=60s ClusterExtension $TEST_CLUSTER_EXTENSION_NAME +kubectl wait --for=condition=Serving --timeout=5m ClusterCatalog $TEST_CLUSTER_CATALOG_NAME +kubectl wait --for=condition=Installed --timeout=5m ClusterExtension $TEST_CLUSTER_EXTENSION_NAME diff --git a/helm/olmv1/templates/deployment-olmv1-system-catalogd-controller-manager.yml b/helm/olmv1/templates/deployment-olmv1-system-catalogd-controller-manager.yml index 092cb7a24e..ac69bce6a4 100644 --- a/helm/olmv1/templates/deployment-olmv1-system-catalogd-controller-manager.yml +++ b/helm/olmv1/templates/deployment-olmv1-system-catalogd-controller-manager.yml @@ -12,11 +12,11 @@ metadata: namespace: {{ .Values.namespaces.olmv1.name }} spec: minReadySeconds: 5 - replicas: 1 + replicas: {{ .Values.options.catalogd.deployment.replicas }} strategy: type: RollingUpdate rollingUpdate: - maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates + maxSurge: 1 # Allow temporary extra pod for zero-downtime updates maxUnavailable: 0 # Never allow pods to be unavailable during updates selector: matchLabels: diff --git a/helm/olmv1/templates/deployment-olmv1-system-operator-controller-controller-manager.yml b/helm/olmv1/templates/deployment-olmv1-system-operator-controller-controller-manager.yml index 249610244d..bea8f0404f 100644 --- a/helm/olmv1/templates/deployment-olmv1-system-operator-controller-controller-manager.yml +++ b/helm/olmv1/templates/deployment-olmv1-system-operator-controller-controller-manager.yml @@ -11,11 +11,11 @@ metadata: name: operator-controller-controller-manager namespace: {{ .Values.namespaces.olmv1.name }} spec: - replicas: 1 + replicas: {{ .Values.options.operatorController.deployment.replicas }} strategy: type: RollingUpdate rollingUpdate: - maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates + maxSurge: 1 # Allow temporary extra pod for zero-downtime updates maxUnavailable: 0 # Never allow pods to be unavailable during updates selector: matchLabels: diff --git a/helm/olmv1/values.yaml b/helm/olmv1/values.yaml index cb454f6250..c5845b9a1b 100644 --- a/helm/olmv1/values.yaml +++ b/helm/olmv1/values.yaml @@ -8,6 +8,7 @@ options: enabled: true deployment: image: quay.io/operator-framework/operator-controller:devel + replicas: 1 extraArguments: [] features: enabled: [] @@ -19,6 +20,7 @@ options: enabled: true deployment: image: quay.io/operator-framework/catalogd:devel + replicas: 1 extraArguments: [] features: enabled: [] diff --git a/helm/replicas-2.yaml b/helm/replicas-2.yaml new file mode 100644 index 0000000000..4f50e3dfb6 --- /dev/null +++ b/helm/replicas-2.yaml @@ -0,0 +1,9 @@ +# Override values to set replicas to 2 for both operator-controller and catalogd +# This is used in experimental-e2e.yaml to test multi-replica deployments +options: + operatorController: + deployment: + replicas: 2 + catalogd: + deployment: + replicas: 2 diff --git a/manifests/experimental-e2e.yaml b/manifests/experimental-e2e.yaml index e536cd72a6..fbfa2df82b 100644 --- a/manifests/experimental-e2e.yaml +++ b/manifests/experimental-e2e.yaml @@ -2107,11 +2107,11 @@ metadata: namespace: olmv1-system spec: minReadySeconds: 5 - replicas: 1 + replicas: 
2 strategy: type: RollingUpdate rollingUpdate: - maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates + maxSurge: 1 # Allow temporary extra pod for zero-downtime updates maxUnavailable: 0 # Never allow pods to be unavailable during updates selector: matchLabels: @@ -2258,11 +2258,11 @@ metadata: name: operator-controller-controller-manager namespace: olmv1-system spec: - replicas: 1 + replicas: 2 strategy: type: RollingUpdate rollingUpdate: - maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates + maxSurge: 1 # Allow temporary extra pod for zero-downtime updates maxUnavailable: 0 # Never allow pods to be unavailable during updates selector: matchLabels: diff --git a/manifests/experimental.yaml b/manifests/experimental.yaml index f88debab0f..9e9d611f05 100644 --- a/manifests/experimental.yaml +++ b/manifests/experimental.yaml @@ -2036,7 +2036,7 @@ spec: strategy: type: RollingUpdate rollingUpdate: - maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates + maxSurge: 1 # Allow temporary extra pod for zero-downtime updates maxUnavailable: 0 # Never allow pods to be unavailable during updates selector: matchLabels: @@ -2174,7 +2174,7 @@ spec: strategy: type: RollingUpdate rollingUpdate: - maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates + maxSurge: 1 # Allow temporary extra pod for zero-downtime updates maxUnavailable: 0 # Never allow pods to be unavailable during updates selector: matchLabels: diff --git a/manifests/standard-e2e.yaml b/manifests/standard-e2e.yaml index 1aed38ba96..36cb9971cf 100644 --- a/manifests/standard-e2e.yaml +++ b/manifests/standard-e2e.yaml @@ -1799,7 +1799,7 @@ spec: strategy: type: RollingUpdate rollingUpdate: - maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates + maxSurge: 1 # Allow temporary extra pod for zero-downtime updates maxUnavailable: 0 # Never allow pods to be unavailable during updates selector: matchLabels: @@ -1949,7 +1949,7 @@ spec: strategy: type: RollingUpdate rollingUpdate: - maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates + maxSurge: 1 # Allow temporary extra pod for zero-downtime updates maxUnavailable: 0 # Never allow pods to be unavailable during updates selector: matchLabels: diff --git a/manifests/standard.yaml b/manifests/standard.yaml index 34cc579181..933ff5d819 100644 --- a/manifests/standard.yaml +++ b/manifests/standard.yaml @@ -1724,7 +1724,7 @@ spec: strategy: type: RollingUpdate rollingUpdate: - maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates + maxSurge: 1 # Allow temporary extra pod for zero-downtime updates maxUnavailable: 0 # Never allow pods to be unavailable during updates selector: matchLabels: @@ -1861,7 +1861,7 @@ spec: strategy: type: RollingUpdate rollingUpdate: - maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates + maxSurge: 1 # Allow temporary extra pod for zero-downtime updates maxUnavailable: 0 # Never allow pods to be unavailable during updates selector: matchLabels: diff --git a/test/e2e/cluster_extension_install_test.go b/test/e2e/cluster_extension_install_test.go index b3380ff0f5..17b6fb9e34 100644 --- a/test/e2e/cluster_extension_install_test.go +++ b/test/e2e/cluster_extension_install_test.go @@ -27,8 +27,11 @@ import ( ) const ( - artifactName = "operator-controller-e2e" - pollDuration = time.Minute + artifactName = "operator-controller-e2e" + // pollDuration is set to 3 minutes to account for leader election time in multi-replica deployments. 
+ // In the worst case (previous leader crashed), leader election can take up to 163 seconds + // (LeaseDuration: 137s + RetryPeriod: 26s). Adding buffer for reconciliation time. + pollDuration = 3 * time.Minute pollInterval = time.Second testCatalogRefEnvVar = "CATALOG_IMG" testCatalogName = "test-catalog" @@ -169,10 +172,11 @@ location = "docker-registry.operator-controller-e2e.svc.cluster.local:5000"`, require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension)) }, 2*time.Minute, pollInterval) - // Give the check 2 minutes instead of the typical 1 for the pod's - // files to update from the configmap change. + // Give the check extra time for the pod's files to update from the configmap change. // The theoretical max time is the kubelet sync period of 1 minute + - // ConfigMap cache TTL of 1 minute = 2 minutes + // ConfigMap cache TTL of 1 minute = 2 minutes. + // With multi-replica deployments, add leader election time (up to 163s in worst case). + // Total: 2 min (ConfigMap) + 2.7 min (leader election) + buffer = 5 minutes t.Log("By eventually reporting progressing as True") require.EventuallyWithT(t, func(ct *assert.CollectT) { require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension)) @@ -180,7 +184,7 @@ location = "docker-registry.operator-controller-e2e.svc.cluster.local:5000"`, require.NotNil(ct, cond) require.Equal(ct, metav1.ConditionTrue, cond.Status) require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason) - }, 2*time.Minute, pollInterval) + }, 5*time.Minute, pollInterval) t.Log("By eventually installing the package successfully") require.EventuallyWithT(t, func(ct *assert.CollectT) { @@ -655,6 +659,8 @@ func TestClusterExtensionRecoversFromNoNamespaceWhenFailureFixed(t *testing.T) { // backoff of this eventually check we MUST ensure we do not touch the ClusterExtension // after creating int the Namespace and ServiceAccount. t.Log("By eventually installing the package successfully") + // Use 5 minutes for recovery tests to account for exponential backoff after repeated failures + // plus leader election time (up to 163s in worst case) require.EventuallyWithT(t, func(ct *assert.CollectT) { require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension)) cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled) @@ -663,7 +669,7 @@ func TestClusterExtensionRecoversFromNoNamespaceWhenFailureFixed(t *testing.T) { require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason) require.Contains(ct, cond.Message, "Installed bundle") require.NotEmpty(ct, clusterExtension.Status.Install) - }, pollDuration, pollInterval) + }, 5*time.Minute, pollInterval) t.Log("By eventually reporting Progressing == True with Reason Success") require.EventuallyWithT(t, func(ct *assert.CollectT) { @@ -777,6 +783,8 @@ func TestClusterExtensionRecoversFromExistingDeploymentWhenFailureFixed(t *testi // backoff of this eventually check we MUST ensure we do not touch the ClusterExtension // after deleting the Deployment. 
t.Log("By eventually installing the package successfully") + // Use 5 minutes for recovery tests to account for exponential backoff after repeated failures + // plus leader election time (up to 163s in worst case) require.EventuallyWithT(t, func(ct *assert.CollectT) { require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension)) cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled) @@ -785,7 +793,7 @@ func TestClusterExtensionRecoversFromExistingDeploymentWhenFailureFixed(t *testi require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason) require.Contains(ct, cond.Message, "Installed bundle") require.NotEmpty(ct, clusterExtension.Status.Install) - }, pollDuration, pollInterval) + }, 5*time.Minute, pollInterval) t.Log("By eventually reporting Progressing == True with Reason Success") require.EventuallyWithT(t, func(ct *assert.CollectT) { diff --git a/test/e2e/single_namespace_support_test.go b/test/e2e/single_namespace_support_test.go index 2c3b825a1a..7cfe782fbd 100644 --- a/test/e2e/single_namespace_support_test.go +++ b/test/e2e/single_namespace_support_test.go @@ -5,6 +5,7 @@ import ( "fmt" "os" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -404,9 +405,12 @@ func TestClusterExtensionVersionUpdate(t *testing.T) { require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason) }, pollDuration, pollInterval) t.Log("We should have two ClusterExtensionRevision resources") + // Use 5 minutes for checking ClusterExtensionRevision creation after upgrade. + // In multi-replica deployments, revision creation happens after the upgrade completes, + // which includes leader election time (up to 163s) plus reconciliation overhead. require.EventuallyWithT(t, func(ct *assert.CollectT) { cerList := &ocv1.ClusterExtensionRevisionList{} require.NoError(ct, c.List(context.Background(), cerList)) require.Len(ct, cerList.Items, 2) - }, pollDuration, pollInterval) + }, 5*time.Minute, pollInterval) } diff --git a/test/e2e/webhook_support_test.go b/test/e2e/webhook_support_test.go index 1c80c615be..f04c16fca8 100644 --- a/test/e2e/webhook_support_test.go +++ b/test/e2e/webhook_support_test.go @@ -5,6 +5,7 @@ import ( "fmt" "os" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -138,6 +139,8 @@ func TestWebhookSupport(t *testing.T) { }) t.Log("By waiting for webhook-operator extension to be installed successfully") + // Use 5 minutes for webhook installation as it requires webhook cert generation via cert-manager, + // which adds additional time on top of leader election and reconciliation in multi-replica deployments. 
require.EventuallyWithT(t, func(ct *assert.CollectT) { require.NoError(ct, c.Get(t.Context(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension)) cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled) @@ -147,7 +150,7 @@ func TestWebhookSupport(t *testing.T) { require.Contains(ct, cond.Message, "Installed bundle") require.NotNil(ct, clusterExtension.Status.Install) require.NotEmpty(ct, clusterExtension.Status.Install.Bundle) - }, pollDuration, pollInterval) + }, 5*time.Minute, pollInterval) t.Log("By waiting for webhook-operator deployment to be available") require.EventuallyWithT(t, func(ct *assert.CollectT) { diff --git a/test/helpers/helpers.go b/test/helpers/helpers.go index af142c6e34..2d99854fa2 100644 --- a/test/helpers/helpers.go +++ b/test/helpers/helpers.go @@ -34,7 +34,10 @@ var ( ) const ( - pollDuration = time.Minute + // pollDuration is set to 3 minutes to account for leader election time in multi-replica deployments. + // In the worst case (previous leader crashed), leader election can take up to 163 seconds + // (LeaseDuration: 137s + RetryPeriod: 26s). Adding buffer for reconciliation time. + pollDuration = 3 * time.Minute pollInterval = time.Second testCatalogName = "test-catalog" testCatalogRefEnvVar = "CATALOG_IMG" @@ -289,23 +292,26 @@ func ValidateCatalogUnpackWithName(t *testing.T, catalogName string) { func EnsureNoExtensionResources(t *testing.T, clusterExtensionName string) { ls := labels.Set{"olm.operatorframework.io/owner-name": clusterExtensionName} - // CRDs may take an extra long time to be deleted, and may run into the following error: - // Condition=Terminating Status=True Reason=InstanceDeletionFailed Message="could not list instances: storage is (re)initializing" + // CRDs may take extra time to be deleted in multi-replica deployments due to leader election + // and finalizer processing. Use 3 minutes which accounts for leader election (up to 163s) + // plus buffer for reconciliation overhead. t.Logf("By waiting for CustomResourceDefinitions of %q to be deleted", clusterExtensionName) require.EventuallyWithT(t, func(ct *assert.CollectT) { list := &apiextensionsv1.CustomResourceDefinitionList{} err := c.List(context.Background(), list, client.MatchingLabelsSelector{Selector: ls.AsSelector()}) require.NoError(ct, err) require.Empty(ct, list.Items) - }, 5*pollDuration, pollInterval) + }, pollDuration, pollInterval) + // ClusterRoleBindings and ClusterRoles deletion is typically faster than CRDs. + // Use pollDuration (3 minutes) to account for leader election in multi-replica deployments. 
t.Logf("By waiting for ClusterRoleBindings of %q to be deleted", clusterExtensionName) require.EventuallyWithT(t, func(ct *assert.CollectT) { list := &rbacv1.ClusterRoleBindingList{} err := c.List(context.Background(), list, client.MatchingLabelsSelector{Selector: ls.AsSelector()}) require.NoError(ct, err) require.Empty(ct, list.Items) - }, 2*pollDuration, pollInterval) + }, pollDuration, pollInterval) t.Logf("By waiting for ClusterRoles of %q to be deleted", clusterExtensionName) require.EventuallyWithT(t, func(ct *assert.CollectT) { @@ -313,7 +319,7 @@ func EnsureNoExtensionResources(t *testing.T, clusterExtensionName string) { err := c.List(context.Background(), list, client.MatchingLabelsSelector{Selector: ls.AsSelector()}) require.NoError(ct, err) require.Empty(ct, list.Items) - }, 2*pollDuration, pollInterval) + }, pollDuration, pollInterval) } func TestCleanup(t *testing.T, cat *ocv1.ClusterCatalog, clusterExtension *ocv1.ClusterExtension, sa *corev1.ServiceAccount, ns *corev1.Namespace) { diff --git a/test/upgrade-e2e/post_upgrade_test.go b/test/upgrade-e2e/post_upgrade_test.go index 785d91ea3b..a8cd8fc810 100644 --- a/test/upgrade-e2e/post_upgrade_test.go +++ b/test/upgrade-e2e/post_upgrade_test.go @@ -25,6 +25,11 @@ import ( const ( artifactName = "operator-controller-upgrade-e2e" container = "manager" + // pollDuration is set to 3 minutes to account for leader election time in multi-replica deployments. + // In the worst case (previous leader crashed), leader election can take up to 163 seconds + // (LeaseDuration: 137s + RetryPeriod: 26s). Adding buffer for reconciliation time. + pollDuration = 3 * time.Minute + pollInterval = time.Second ) func TestClusterCatalogUnpacking(t *testing.T) { @@ -45,23 +50,22 @@ func TestClusterCatalogUnpacking(t *testing.T) { require.Equal(ct, *managerDeployment.Spec.Replicas, managerDeployment.Status.ReadyReplicas) }, time.Minute, time.Second) - var managerPod corev1.Pod - t.Log("Waiting for only one controller-manager pod to remain") + t.Log("Waiting for controller-manager pods to match the desired replica count") require.EventuallyWithT(t, func(ct *assert.CollectT) { var managerPods corev1.PodList err := c.List(ctx, &managerPods, client.MatchingLabels(managerLabelSelector)) require.NoError(ct, err) - require.Len(ct, managerPods.Items, 1) - managerPod = managerPods.Items[0] + require.Len(ct, managerPods.Items, int(*managerDeployment.Spec.Replicas)) }, time.Minute, time.Second) t.Log("Waiting for acquired leader election") leaderCtx, leaderCancel := context.WithTimeout(ctx, 3*time.Minute) defer leaderCancel() - leaderSubstrings := []string{"successfully acquired lease"} - leaderElected, err := watchPodLogsForSubstring(leaderCtx, &managerPod, leaderSubstrings...) + + // When there are multiple replicas, find the leader pod + managerPod, err := findLeaderPod(leaderCtx, "catalogd") require.NoError(t, err) - require.True(t, leaderElected) + require.NotNil(t, managerPod) t.Log("Reading logs to make sure that ClusterCatalog was reconciled by catalogdv1") logCtx, cancel := context.WithTimeout(ctx, time.Minute) @@ -70,7 +74,7 @@ func TestClusterCatalogUnpacking(t *testing.T) { "reconcile ending", fmt.Sprintf(`ClusterCatalog=%q`, testClusterCatalogName), } - found, err := watchPodLogsForSubstring(logCtx, &managerPod, substrings...) + found, err := watchPodLogsForSubstring(logCtx, managerPod, substrings...) 
require.NoError(t, err) require.True(t, found) @@ -103,11 +107,18 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) { // wait for catalogd deployment to finish t.Log("Wait for catalogd deployment to be ready") - catalogdManagerPod := waitForDeployment(t, ctx, "catalogd") + waitForDeployment(t, ctx, "catalogd") + + // Find the catalogd leader pod + catalogdLeaderCtx, catalogdLeaderCancel := context.WithTimeout(ctx, 3*time.Minute) + defer catalogdLeaderCancel() + catalogdManagerPod, err := findLeaderPod(catalogdLeaderCtx, "catalogd") + require.NoError(t, err) + require.NotNil(t, catalogdManagerPod) // wait for operator-controller deployment to finish t.Log("Wait for operator-controller deployment to be ready") - managerPod := waitForDeployment(t, ctx, "operator-controller") + waitForDeployment(t, ctx, "operator-controller") t.Log("Wait for acquired leader election") // Average case is under 1 minute but in the worst case: (previous leader crashed) @@ -115,10 +126,11 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) { leaderCtx, leaderCancel := context.WithTimeout(ctx, 3*time.Minute) defer leaderCancel() - leaderSubstrings := []string{"successfully acquired lease"} - leaderElected, err := watchPodLogsForSubstring(leaderCtx, managerPod, leaderSubstrings...) + // When there are multiple replicas, find the leader pod + var managerPod *corev1.Pod + managerPod, err = findLeaderPod(leaderCtx, "operator-controller") require.NoError(t, err) - require.True(t, leaderElected) + require.NotNil(t, managerPod) t.Log("Reading logs to make sure that ClusterExtension was reconciled by operator-controller before we update it") // Make sure that after we upgrade OLM itself we can still reconcile old objects without any changes @@ -154,7 +166,7 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) { require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason) require.True(ct, clusterCatalog.Status.LastUnpacked.After(catalogdManagerPod.CreationTimestamp.Time)) - }, time.Minute, time.Second) + }, pollDuration, pollInterval) // TODO: if we change the underlying revision storage mechanism, the new version // will not detect any installed versions, we need to make sure that the upgrade @@ -172,7 +184,7 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) { require.Contains(ct, cond.Message, "Installed bundle") require.NotNil(ct, clusterExtension.Status.Install) require.NotEmpty(ct, clusterExtension.Status.Install.Bundle.Version) - }, time.Minute, time.Second) + }, pollDuration, pollInterval) previousVersion := clusterExtension.Status.Install.Bundle.Version @@ -182,6 +194,13 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) { require.NoError(t, c.Update(ctx, &clusterExtension)) t.Log("Checking that the ClusterExtension installs successfully") + // Use 10 minutes for post-OLM-upgrade extension upgrade operations. 
+ // After upgrading OLM itself, the system needs time to: + // - Stabilize after operator-controller pods restart (leader election: up to 163s) + // - Process the ClusterExtension spec change + // - Resolve and unpack the new bundle (1.0.1) + // - Apply manifests and wait for rollout + // In multi-replica deployments with recent OLM upgrade, this can take significant time require.EventuallyWithT(t, func(ct *assert.CollectT) { require.NoError(ct, c.Get(ctx, types.NamespacedName{Name: testClusterExtensionName}, &clusterExtension)) cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled) @@ -190,14 +209,13 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) { require.Contains(ct, cond.Message, "Installed bundle") require.Equal(ct, ocv1.BundleMetadata{Name: "test-operator.1.0.1", Version: "1.0.1"}, clusterExtension.Status.Install.Bundle) require.NotEqual(ct, previousVersion, clusterExtension.Status.Install.Bundle.Version) - }, time.Minute, time.Second) + }, 10*time.Minute, pollInterval) } // waitForDeployment checks that the updated deployment with the given app.kubernetes.io/name label // has reached the desired number of replicas and that the number pods matches that number -// i.e. no old pods remain. It will return a pointer to the first pod. This is only necessary -// to facilitate the mitigation put in place for https://github.com/operator-framework/operator-controller/issues/1626 -func waitForDeployment(t *testing.T, ctx context.Context, controlPlaneLabel string) *corev1.Pod { +// i.e. no old pods remain. +func waitForDeployment(t *testing.T, ctx context.Context, controlPlaneLabel string) { deploymentLabelSelector := labels.Set{"app.kubernetes.io/name": controlPlaneLabel}.AsSelector() t.Log("Checking that the deployment is updated") @@ -217,13 +235,101 @@ func waitForDeployment(t *testing.T, ctx context.Context, controlPlaneLabel stri desiredNumReplicas = *managerDeployment.Spec.Replicas }, time.Minute, time.Second) - var managerPods corev1.PodList t.Logf("Ensure the number of remaining pods equal the desired number of replicas (%d)", desiredNumReplicas) require.EventuallyWithT(t, func(ct *assert.CollectT) { + var managerPods corev1.PodList require.NoError(ct, c.List(ctx, &managerPods, client.MatchingLabelsSelector{Selector: deploymentLabelSelector})) - require.Len(ct, managerPods.Items, 1) + require.Len(ct, managerPods.Items, int(desiredNumReplicas)) }, time.Minute, time.Second) - return &managerPods.Items[0] +} + +// findLeaderPod finds the pod that has acquired the leader lease by checking logs of all pods +func findLeaderPod(ctx context.Context, controlPlaneLabel string) (*corev1.Pod, error) { + deploymentLabelSelector := labels.Set{"app.kubernetes.io/name": controlPlaneLabel}.AsSelector() + + // If there's only one pod, it must be the leader + // First, do a quick check to see if we can find the leader in existing logs + var leaderPod *corev1.Pod + leaderSubstrings := []string{"successfully acquired lease"} + + // Retry logic: keep checking until we find a leader or context times out + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return nil, fmt.Errorf("timeout waiting for leader election: %w", ctx.Err()) + case <-ticker.C: + var managerPods corev1.PodList + if err := c.List(ctx, &managerPods, client.MatchingLabelsSelector{Selector: deploymentLabelSelector}); err != nil { + continue + } + + if len(managerPods.Items) == 0 { + continue + } + + // If there's only one pod, it must 
be the leader + if len(managerPods.Items) == 1 { + return &managerPods.Items[0], nil + } + + // Check each pod's existing logs (without following) for leader election message + for i := range managerPods.Items { + pod := &managerPods.Items[i] + + // Check existing logs only (don't follow) + isLeader, err := checkPodLogsForSubstring(ctx, pod, leaderSubstrings...) + if err != nil { + // If we can't read logs from this pod, try the next one + continue + } + if isLeader { + leaderPod = pod + break + } + } + + if leaderPod != nil { + return leaderPod, nil + } + // No leader found yet, retry after ticker interval + } + } +} + +// checkPodLogsForSubstring checks existing pod logs (without following) for the given substrings +func checkPodLogsForSubstring(ctx context.Context, pod *corev1.Pod, substrings ...string) (bool, error) { + podLogOpts := corev1.PodLogOptions{ + Follow: false, // Don't follow, just read existing logs + Container: container, + } + + req := kclientset.CoreV1().Pods(pod.Namespace).GetLogs(pod.Name, &podLogOpts) + podLogs, err := req.Stream(ctx) + if err != nil { + return false, err + } + defer podLogs.Close() + + scanner := bufio.NewScanner(podLogs) + for scanner.Scan() { + line := scanner.Text() + + foundCount := 0 + for _, substring := range substrings { + if strings.Contains(line, substring) { + foundCount++ + } + } + if foundCount == len(substrings) { + return true, nil + } + } + + // If we didn't find the substrings, return false (not an error) + return false, nil } func watchPodLogsForSubstring(ctx context.Context, pod *corev1.Pod, substrings ...string) (bool, error) { diff --git a/testdata/build-test-registry.sh b/testdata/build-test-registry.sh index e2dcc09148..7dee9e3c11 100755 --- a/testdata/build-test-registry.sh +++ b/testdata/build-test-registry.sh @@ -103,7 +103,7 @@ spec: type: NodePort EOF -kubectl wait --for=condition=Available -n "${namespace}" "deploy/${name}" --timeout=60s +kubectl wait --for=condition=Available -n "${namespace}" "deploy/${name}" --timeout=3m kubectl apply -f - << EOF apiVersion: batch/v1 @@ -135,4 +135,4 @@ spec: secretName: ${namespace}-registry EOF -kubectl wait --for=condition=Complete -n "${namespace}" "job/${name}-push" --timeout=60s +kubectl wait --for=condition=Complete -n "${namespace}" "job/${name}-push" --timeout=3m
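A note on the 163-second figure cited in the new test comments: it is the sum of the leader-election LeaseDuration (137s) and RetryPeriod (26s), since a crashed leader's lease must first expire and a standby must then win a retry before reconciliation resumes. The sketch below shows how such durations would be wired into a controller-runtime manager. It is illustrative only, not this repository's actual manager setup; the RenewDeadline value and the lock name are assumptions that do not appear in this patch.

    package main

    import (
        "log"
        "time"

        ctrl "sigs.k8s.io/controller-runtime"
    )

    func main() {
        // Durations cited in the test comments of this patch.
        leaseDuration := 137 * time.Second // how long a crashed leader's lease stays valid
        retryPeriod := 26 * time.Second    // how often a standby retries acquisition
        // Assumed value, not stated in this patch.
        renewDeadline := 107 * time.Second

        mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
            LeaderElection:   true,
            LeaderElectionID: "example.operatorframework.io", // hypothetical lock name
            LeaseDuration:    &leaseDuration,
            RenewDeadline:    &renewDeadline,
            RetryPeriod:      &retryPeriod,
        })
        if err != nil {
            log.Fatal(err)
        }

        // Worst case after a leader crash: LeaseDuration + RetryPeriod = 163s
        // before another replica starts reconciling.
        if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
            log.Fatal(err)
        }
    }

With two replicas, a clean rollout hands the lease off almost immediately, but an unclean leader exit can leave the standby idle for LeaseDuration plus one RetryPeriod, which is the worst case behind the 3-minute pollDuration and the 5- to 10-minute bounds on the slower checks.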