Commit 89b96d0

sets a sensible default of **2 replicas** and updates upgrade e2e test cases
1 parent c85bc55 commit 89b96d0

8 files changed (+73 −32 lines)


helm/olmv1/templates/deployment-olmv1-system-catalogd-controller-manager.yml

Lines changed: 2 additions & 2 deletions
@@ -12,11 +12,11 @@ metadata:
   namespace: {{ .Values.namespaces.olmv1.name }}
 spec:
   minReadySeconds: 5
-  replicas: 1
+  replicas: {{ .Values.options.catalogd.deployment.replicas }}
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod (e.g., 3 pods with 2 replicas) for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:

helm/olmv1/templates/deployment-olmv1-system-operator-controller-controller-manager.yml

Lines changed: 2 additions & 2 deletions
@@ -11,11 +11,11 @@ metadata:
   name: operator-controller-controller-manager
   namespace: {{ .Values.namespaces.olmv1.name }}
 spec:
-  replicas: 1
+  replicas: {{ .Values.options.operatorController.deployment.replicas }}
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod (e.g., 3 pods with 2 replicas) for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:

helm/olmv1/values.yaml

Lines changed: 2 additions & 0 deletions
@@ -8,6 +8,7 @@ options:
     enabled: true
     deployment:
       image: quay.io/operator-framework/operator-controller:devel
+      replicas: 2
       extraArguments: []
     features:
       enabled: []
@@ -19,6 +20,7 @@ options:
     enabled: true
     deployment:
       image: quay.io/operator-framework/catalogd:devel
+      replicas: 2
       extraArguments: []
     features:
       enabled: []
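
With the replica counts surfaced in values.yaml, consumers of the chart can tune them per environment instead of patching rendered manifests. A minimal sketch of an override file (the file name and the single-replica values are illustrative, not part of this commit):

```yaml
# my-values.yaml (hypothetical override file)
options:
  operatorController:
    deployment:
      replicas: 1   # e.g., scale back down for a resource-constrained dev cluster
  catalogd:
    deployment:
      replicas: 1
```

Rendering the chart with something like `helm template ./helm/olmv1 -f my-values.yaml` would then emit `replicas: 1` for both controller-manager Deployments instead of the new default of 2.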

manifests/experimental-e2e.yaml

Lines changed: 4 additions & 4 deletions
@@ -2107,11 +2107,11 @@ metadata:
   namespace: olmv1-system
 spec:
   minReadySeconds: 5
-  replicas: 1
+  replicas: 2
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod (e.g., 3 pods with 2 replicas) for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
@@ -2258,11 +2258,11 @@ metadata:
   name: operator-controller-controller-manager
   namespace: olmv1-system
 spec:
-  replicas: 1
+  replicas: 2
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod (e.g., 3 pods with 2 replicas) for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:

manifests/experimental.yaml

Lines changed: 4 additions & 4 deletions
@@ -2032,11 +2032,11 @@ metadata:
   namespace: olmv1-system
 spec:
   minReadySeconds: 5
-  replicas: 1
+  replicas: 2
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod (e.g., 3 pods with 2 replicas) for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
@@ -2170,11 +2170,11 @@ metadata:
   name: operator-controller-controller-manager
   namespace: olmv1-system
 spec:
-  replicas: 1
+  replicas: 2
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod (e.g., 3 pods with 2 replicas) for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:

manifests/standard-e2e.yaml

Lines changed: 4 additions & 4 deletions
@@ -1795,11 +1795,11 @@ metadata:
   namespace: olmv1-system
 spec:
   minReadySeconds: 5
-  replicas: 1
+  replicas: 2
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod (e.g., 3 pods with 2 replicas) for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
@@ -1945,11 +1945,11 @@ metadata:
   name: operator-controller-controller-manager
   namespace: olmv1-system
 spec:
-  replicas: 1
+  replicas: 2
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod (e.g., 3 pods with 2 replicas) for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:

manifests/standard.yaml

Lines changed: 4 additions & 4 deletions
@@ -1720,11 +1720,11 @@ metadata:
   namespace: olmv1-system
 spec:
   minReadySeconds: 5
-  replicas: 1
+  replicas: 2
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod (e.g., 3 pods with 2 replicas) for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
@@ -1857,11 +1857,11 @@ metadata:
   name: operator-controller-controller-manager
   namespace: olmv1-system
 spec:
-  replicas: 1
+  replicas: 2
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod (e.g., 3 pods with 2 replicas) for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:

test/upgrade-e2e/post_upgrade_test.go

Lines changed: 51 additions & 12 deletions
@@ -45,23 +45,22 @@ func TestClusterCatalogUnpacking(t *testing.T) {
 		require.Equal(ct, *managerDeployment.Spec.Replicas, managerDeployment.Status.ReadyReplicas)
 	}, time.Minute, time.Second)
 
-	var managerPod corev1.Pod
-	t.Log("Waiting for only one controller-manager pod to remain")
+	t.Log("Waiting for controller-manager pods to match the desired replica count")
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
 		var managerPods corev1.PodList
 		err := c.List(ctx, &managerPods, client.MatchingLabels(managerLabelSelector))
 		require.NoError(ct, err)
-		require.Len(ct, managerPods.Items, 1)
-		managerPod = managerPods.Items[0]
+		require.Len(ct, managerPods.Items, int(*managerDeployment.Spec.Replicas))
 	}, time.Minute, time.Second)
 
 	t.Log("Waiting for acquired leader election")
 	leaderCtx, leaderCancel := context.WithTimeout(ctx, 3*time.Minute)
 	defer leaderCancel()
-	leaderSubstrings := []string{"successfully acquired lease"}
-	leaderElected, err := watchPodLogsForSubstring(leaderCtx, &managerPod, leaderSubstrings...)
+
+	// When there are multiple replicas, find the leader pod
+	managerPod, err := findLeaderPod(leaderCtx, "catalogd")
 	require.NoError(t, err)
-	require.True(t, leaderElected)
+	require.NotNil(t, managerPod)
 
 	t.Log("Reading logs to make sure that ClusterCatalog was reconciled by catalogdv1")
 	logCtx, cancel := context.WithTimeout(ctx, time.Minute)
@@ -70,7 +69,7 @@ func TestClusterCatalogUnpacking(t *testing.T) {
 		"reconcile ending",
 		fmt.Sprintf(`ClusterCatalog=%q`, testClusterCatalogName),
 	}
-	found, err := watchPodLogsForSubstring(logCtx, &managerPod, substrings...)
+	found, err := watchPodLogsForSubstring(logCtx, managerPod, substrings...)
 	require.NoError(t, err)
 	require.True(t, found)
 
@@ -115,10 +114,13 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) {
 	leaderCtx, leaderCancel := context.WithTimeout(ctx, 3*time.Minute)
 	defer leaderCancel()
 
-	leaderSubstrings := []string{"successfully acquired lease"}
-	leaderElected, err := watchPodLogsForSubstring(leaderCtx, managerPod, leaderSubstrings...)
+	// When there are multiple replicas, find the leader pod
+	leaderPod, err := findLeaderPod(leaderCtx, "operator-controller")
 	require.NoError(t, err)
-	require.True(t, leaderElected)
+	require.NotNil(t, leaderPod)
+
+	// Use the leader pod for subsequent operations
+	managerPod = leaderPod
 
 	t.Log("Reading logs to make sure that ClusterExtension was reconciled by operator-controller before we update it")
 	// Make sure that after we upgrade OLM itself we can still reconcile old objects without any changes
@@ -221,11 +223,48 @@ func waitForDeployment(t *testing.T, ctx context.Context, controlPlaneLabel stri
 	t.Logf("Ensure the number of remaining pods equal the desired number of replicas (%d)", desiredNumReplicas)
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
 		require.NoError(ct, c.List(ctx, &managerPods, client.MatchingLabelsSelector{Selector: deploymentLabelSelector}))
-		require.Len(ct, managerPods.Items, 1)
+		require.Len(ct, managerPods.Items, int(desiredNumReplicas))
 	}, time.Minute, time.Second)
 	return &managerPods.Items[0]
 }
 
+// findLeaderPod finds the pod that has acquired the leader lease by checking logs of all pods
+func findLeaderPod(ctx context.Context, controlPlaneLabel string) (*corev1.Pod, error) {
+	deploymentLabelSelector := labels.Set{"app.kubernetes.io/name": controlPlaneLabel}.AsSelector()
+
+	var managerPods corev1.PodList
+	if err := c.List(ctx, &managerPods, client.MatchingLabelsSelector{Selector: deploymentLabelSelector}); err != nil {
+		return nil, fmt.Errorf("failed to list pods: %w", err)
+	}
+
+	if len(managerPods.Items) == 0 {
+		return nil, fmt.Errorf("no pods found for label %s", controlPlaneLabel)
+	}
+
+	// If there's only one pod, it must be the leader
+	if len(managerPods.Items) == 1 {
+		return &managerPods.Items[0], nil
+	}
+
+	// Check each pod's logs for leader election message
+	leaderSubstrings := []string{"successfully acquired lease"}
+	for i := range managerPods.Items {
+		pod := &managerPods.Items[i]
+
+		// Check if this pod has acquired the lease
+		isLeader, err := watchPodLogsForSubstring(ctx, pod, leaderSubstrings...)
+		if err != nil {
+			// If we can't read logs from this pod, try the next one
+			continue
+		}
+		if isLeader {
+			return pod, nil
+		}
+	}
+
+	return nil, fmt.Errorf("no leader pod found among %d pods", len(managerPods.Items))
+}
+
 func watchPodLogsForSubstring(ctx context.Context, pod *corev1.Pod, substrings ...string) (bool, error) {
 	podLogOpts := corev1.PodLogOptions{
 		Follow: true,
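
The new findLeaderPod helper identifies the leader by scanning each pod's logs for the "successfully acquired lease" message. An alternative, not part of this commit, is to read the holder of the leader-election Lease object directly; below is a minimal sketch under assumed names (the package name, lease name, and namespace are placeholders, and it assumes a controller-runtime client with coordination/v1 in its scheme):

```go
// Sketch only: lease name/namespace and package name are assumptions, not taken from this commit.
package upgradee2e

import (
	"context"
	"fmt"
	"strings"

	coordinationv1 "k8s.io/api/coordination/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// leaderPodNameFromLease returns the name of the pod currently holding the given
// leader-election Lease. controller-runtime typically sets holderIdentity to
// "<pod-name>_<uuid>", so the pod name is the portion before the first "_".
func leaderPodNameFromLease(ctx context.Context, c client.Client, namespace, leaseName string) (string, error) {
	var lease coordinationv1.Lease
	if err := c.Get(ctx, client.ObjectKey{Namespace: namespace, Name: leaseName}, &lease); err != nil {
		return "", fmt.Errorf("failed to get lease %q: %w", leaseName, err)
	}
	if lease.Spec.HolderIdentity == nil || *lease.Spec.HolderIdentity == "" {
		return "", fmt.Errorf("lease %q currently has no holder", leaseName)
	}
	return strings.SplitN(*lease.Spec.HolderIdentity, "_", 2)[0], nil
}
```

Compared with log scanning, this avoids depending on a specific log line, at the cost of having to know the lease's name up front.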
