From 604fe39b0fc31747e9e31f593755603f8aa8a917 Mon Sep 17 00:00:00 2001 From: Jian Zhang Date: Thu, 4 Dec 2025 11:06:51 +0800 Subject: [PATCH] Update e2e test cases to support replicas: 2 Co-authored-by: Todd Short --- Makefile | 6 +- hack/test/pre-upgrade-setup.sh | 4 +- ...mv1-system-catalogd-controller-manager.yml | 4 +- ...operator-controller-controller-manager.yml | 4 +- helm/olmv1/values.yaml | 2 + helm/replicas-2.yaml | 9 ++ manifests/experimental-e2e.yaml | 8 +- manifests/experimental.yaml | 4 +- manifests/standard-e2e.yaml | 4 +- manifests/standard.yaml | 4 +- test/e2e/cluster_extension_install_test.go | 24 ++- test/e2e/single_namespace_support_test.go | 6 +- test/e2e/webhook_support_test.go | 5 +- test/helpers/helpers.go | 18 ++- test/upgrade-e2e/post_upgrade_test.go | 150 +++++++++++++++--- testdata/build-test-registry.sh | 4 +- 16 files changed, 197 insertions(+), 59 deletions(-) create mode 100644 helm/replicas-2.yaml diff --git a/Makefile b/Makefile index c7d9ea5cfe..b6b45c4e13 100644 --- a/Makefile +++ b/Makefile @@ -158,7 +158,7 @@ MANIFESTS ?= $(STANDARD_MANIFEST) $(STANDARD_E2E_MANIFEST) $(EXPERIMENTAL_MANIFE $(STANDARD_MANIFEST) ?= helm/cert-manager.yaml $(STANDARD_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/e2e.yaml $(EXPERIMENTAL_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml -$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml +$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml helm/replicas-2.yaml HELM_SETTINGS ?= .PHONY: $(MANIFESTS) $(MANIFESTS): $(HELM) @@ -484,8 +484,8 @@ run-experimental: run-internal #HELP Build the operator-controller then deploy i CATD_NAMESPACE := olmv1-system .PHONY: wait wait: - kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=60s - kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert # Avoid upgrade test flakes when reissuing cert + kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=3m + kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert --timeout=3m # Avoid upgrade test flakes when reissuing cert .PHONY: docker-build docker-build: build-linux #EXHELP Build docker image for operator-controller and catalog with GOOS=linux and local GOARCH.
diff --git a/hack/test/pre-upgrade-setup.sh b/hack/test/pre-upgrade-setup.sh index 669f9da37f..e52c88b467 100755 --- a/hack/test/pre-upgrade-setup.sh +++ b/hack/test/pre-upgrade-setup.sh @@ -155,5 +155,5 @@ spec: version: 1.0.0 EOF -kubectl wait --for=condition=Serving --timeout=60s ClusterCatalog $TEST_CLUSTER_CATALOG_NAME -kubectl wait --for=condition=Installed --timeout=60s ClusterExtension $TEST_CLUSTER_EXTENSION_NAME +kubectl wait --for=condition=Serving --timeout=5m ClusterCatalog $TEST_CLUSTER_CATALOG_NAME +kubectl wait --for=condition=Installed --timeout=5m ClusterExtension $TEST_CLUSTER_EXTENSION_NAME diff --git a/helm/olmv1/templates/deployment-olmv1-system-catalogd-controller-manager.yml b/helm/olmv1/templates/deployment-olmv1-system-catalogd-controller-manager.yml index 092cb7a24e..ac69bce6a4 100644 --- a/helm/olmv1/templates/deployment-olmv1-system-catalogd-controller-manager.yml +++ b/helm/olmv1/templates/deployment-olmv1-system-catalogd-controller-manager.yml @@ -12,11 +12,11 @@ metadata: namespace: {{ .Values.namespaces.olmv1.name }} spec: minReadySeconds: 5 - replicas: 1 + replicas: {{ .Values.options.catalogd.deployment.replicas }} strategy: type: RollingUpdate rollingUpdate: - maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates + maxSurge: 1 # Allow temporary extra pod for zero-downtime updates maxUnavailable: 0 # Never allow pods to be unavailable during updates selector: matchLabels: diff --git a/helm/olmv1/templates/deployment-olmv1-system-operator-controller-controller-manager.yml b/helm/olmv1/templates/deployment-olmv1-system-operator-controller-controller-manager.yml index 249610244d..bea8f0404f 100644 --- a/helm/olmv1/templates/deployment-olmv1-system-operator-controller-controller-manager.yml +++ b/helm/olmv1/templates/deployment-olmv1-system-operator-controller-controller-manager.yml @@ -11,11 +11,11 @@ metadata: name: operator-controller-controller-manager namespace: {{ .Values.namespaces.olmv1.name }} spec: - replicas: 1 + replicas: {{ .Values.options.operatorController.deployment.replicas }} strategy: type: RollingUpdate rollingUpdate: - maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates + maxSurge: 1 # Allow temporary extra pod for zero-downtime updates maxUnavailable: 0 # Never allow pods to be unavailable during updates selector: matchLabels: diff --git a/helm/olmv1/values.yaml b/helm/olmv1/values.yaml index cb454f6250..c5845b9a1b 100644 --- a/helm/olmv1/values.yaml +++ b/helm/olmv1/values.yaml @@ -8,6 +8,7 @@ options: enabled: true deployment: image: quay.io/operator-framework/operator-controller:devel + replicas: 1 extraArguments: [] features: enabled: [] @@ -19,6 +20,7 @@ options: enabled: true deployment: image: quay.io/operator-framework/catalogd:devel + replicas: 1 extraArguments: [] features: enabled: [] diff --git a/helm/replicas-2.yaml b/helm/replicas-2.yaml new file mode 100644 index 0000000000..4f50e3dfb6 --- /dev/null +++ b/helm/replicas-2.yaml @@ -0,0 +1,9 @@ +# Override values to set replicas to 2 for both operator-controller and catalogd +# This is used in experimental-e2e.yaml to test multi-replica deployments +options: + operatorController: + deployment: + replicas: 2 + catalogd: + deployment: + replicas: 2 diff --git a/manifests/experimental-e2e.yaml b/manifests/experimental-e2e.yaml index e536cd72a6..fbfa2df82b 100644 --- a/manifests/experimental-e2e.yaml +++ b/manifests/experimental-e2e.yaml @@ -2107,11 +2107,11 @@ metadata: namespace: olmv1-system spec: minReadySeconds: 5 - replicas: 1 + replicas: 
2 strategy: type: RollingUpdate rollingUpdate: - maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates + maxSurge: 1 # Allow temporary extra pod for zero-downtime updates maxUnavailable: 0 # Never allow pods to be unavailable during updates selector: matchLabels: @@ -2258,11 +2258,11 @@ metadata: name: operator-controller-controller-manager namespace: olmv1-system spec: - replicas: 1 + replicas: 2 strategy: type: RollingUpdate rollingUpdate: - maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates + maxSurge: 1 # Allow temporary extra pod for zero-downtime updates maxUnavailable: 0 # Never allow pods to be unavailable during updates selector: matchLabels: diff --git a/manifests/experimental.yaml b/manifests/experimental.yaml index f88debab0f..9e9d611f05 100644 --- a/manifests/experimental.yaml +++ b/manifests/experimental.yaml @@ -2036,7 +2036,7 @@ spec: strategy: type: RollingUpdate rollingUpdate: - maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates + maxSurge: 1 # Allow temporary extra pod for zero-downtime updates maxUnavailable: 0 # Never allow pods to be unavailable during updates selector: matchLabels: @@ -2174,7 +2174,7 @@ spec: strategy: type: RollingUpdate rollingUpdate: - maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates + maxSurge: 1 # Allow temporary extra pod for zero-downtime updates maxUnavailable: 0 # Never allow pods to be unavailable during updates selector: matchLabels: diff --git a/manifests/standard-e2e.yaml b/manifests/standard-e2e.yaml index 1aed38ba96..36cb9971cf 100644 --- a/manifests/standard-e2e.yaml +++ b/manifests/standard-e2e.yaml @@ -1799,7 +1799,7 @@ spec: strategy: type: RollingUpdate rollingUpdate: - maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates + maxSurge: 1 # Allow temporary extra pod for zero-downtime updates maxUnavailable: 0 # Never allow pods to be unavailable during updates selector: matchLabels: @@ -1949,7 +1949,7 @@ spec: strategy: type: RollingUpdate rollingUpdate: - maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates + maxSurge: 1 # Allow temporary extra pod for zero-downtime updates maxUnavailable: 0 # Never allow pods to be unavailable during updates selector: matchLabels: diff --git a/manifests/standard.yaml b/manifests/standard.yaml index 34cc579181..933ff5d819 100644 --- a/manifests/standard.yaml +++ b/manifests/standard.yaml @@ -1724,7 +1724,7 @@ spec: strategy: type: RollingUpdate rollingUpdate: - maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates + maxSurge: 1 # Allow temporary extra pod for zero-downtime updates maxUnavailable: 0 # Never allow pods to be unavailable during updates selector: matchLabels: @@ -1861,7 +1861,7 @@ spec: strategy: type: RollingUpdate rollingUpdate: - maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates + maxSurge: 1 # Allow temporary extra pod for zero-downtime updates maxUnavailable: 0 # Never allow pods to be unavailable during updates selector: matchLabels: diff --git a/test/e2e/cluster_extension_install_test.go b/test/e2e/cluster_extension_install_test.go index b3380ff0f5..17b6fb9e34 100644 --- a/test/e2e/cluster_extension_install_test.go +++ b/test/e2e/cluster_extension_install_test.go @@ -27,8 +27,11 @@ import ( ) const ( - artifactName = "operator-controller-e2e" - pollDuration = time.Minute + artifactName = "operator-controller-e2e" + // pollDuration is set to 3 minutes to account for leader election time in multi-replica deployments. 
+ // In the worst case (previous leader crashed), leader election can take up to 163 seconds + // (LeaseDuration: 137s + RetryPeriod: 26s). Adding buffer for reconciliation time. + pollDuration = 3 * time.Minute pollInterval = time.Second testCatalogRefEnvVar = "CATALOG_IMG" testCatalogName = "test-catalog" @@ -169,10 +172,11 @@ location = "docker-registry.operator-controller-e2e.svc.cluster.local:5000"`, require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension)) }, 2*time.Minute, pollInterval) - // Give the check 2 minutes instead of the typical 1 for the pod's - // files to update from the configmap change. + // Give the check extra time for the pod's files to update from the configmap change. // The theoretical max time is the kubelet sync period of 1 minute + - // ConfigMap cache TTL of 1 minute = 2 minutes + // ConfigMap cache TTL of 1 minute = 2 minutes. + // With multi-replica deployments, add leader election time (up to 163s in worst case). + // Total: 2 min (ConfigMap) + 2.7 min (leader election) + buffer = 5 minutes t.Log("By eventually reporting progressing as True") require.EventuallyWithT(t, func(ct *assert.CollectT) { require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension)) @@ -180,7 +184,7 @@ location = "docker-registry.operator-controller-e2e.svc.cluster.local:5000"`, require.NotNil(ct, cond) require.Equal(ct, metav1.ConditionTrue, cond.Status) require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason) - }, 2*time.Minute, pollInterval) + }, 5*time.Minute, pollInterval) t.Log("By eventually installing the package successfully") require.EventuallyWithT(t, func(ct *assert.CollectT) { @@ -655,6 +659,8 @@ func TestClusterExtensionRecoversFromNoNamespaceWhenFailureFixed(t *testing.T) { // backoff of this eventually check we MUST ensure we do not touch the ClusterExtension // after creating int the Namespace and ServiceAccount. t.Log("By eventually installing the package successfully") + // Use 5 minutes for recovery tests to account for exponential backoff after repeated failures + // plus leader election time (up to 163s in worst case) require.EventuallyWithT(t, func(ct *assert.CollectT) { require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension)) cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled) @@ -663,7 +669,7 @@ func TestClusterExtensionRecoversFromNoNamespaceWhenFailureFixed(t *testing.T) { require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason) require.Contains(ct, cond.Message, "Installed bundle") require.NotEmpty(ct, clusterExtension.Status.Install) - }, pollDuration, pollInterval) + }, 5*time.Minute, pollInterval) t.Log("By eventually reporting Progressing == True with Reason Success") require.EventuallyWithT(t, func(ct *assert.CollectT) { @@ -777,6 +783,8 @@ func TestClusterExtensionRecoversFromExistingDeploymentWhenFailureFixed(t *testi // backoff of this eventually check we MUST ensure we do not touch the ClusterExtension // after deleting the Deployment. 
t.Log("By eventually installing the package successfully") + // Use 5 minutes for recovery tests to account for exponential backoff after repeated failures + // plus leader election time (up to 163s in worst case) require.EventuallyWithT(t, func(ct *assert.CollectT) { require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension)) cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled) @@ -785,7 +793,7 @@ func TestClusterExtensionRecoversFromExistingDeploymentWhenFailureFixed(t *testi require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason) require.Contains(ct, cond.Message, "Installed bundle") require.NotEmpty(ct, clusterExtension.Status.Install) - }, pollDuration, pollInterval) + }, 5*time.Minute, pollInterval) t.Log("By eventually reporting Progressing == True with Reason Success") require.EventuallyWithT(t, func(ct *assert.CollectT) { diff --git a/test/e2e/single_namespace_support_test.go b/test/e2e/single_namespace_support_test.go index 2c3b825a1a..7cfe782fbd 100644 --- a/test/e2e/single_namespace_support_test.go +++ b/test/e2e/single_namespace_support_test.go @@ -5,6 +5,7 @@ import ( "fmt" "os" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -404,9 +405,12 @@ func TestClusterExtensionVersionUpdate(t *testing.T) { require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason) }, pollDuration, pollInterval) t.Log("We should have two ClusterExtensionRevision resources") + // Use 5 minutes for checking ClusterExtensionRevision creation after upgrade. + // In multi-replica deployments, revision creation happens after the upgrade completes, + // which includes leader election time (up to 163s) plus reconciliation overhead. require.EventuallyWithT(t, func(ct *assert.CollectT) { cerList := &ocv1.ClusterExtensionRevisionList{} require.NoError(ct, c.List(context.Background(), cerList)) require.Len(ct, cerList.Items, 2) - }, pollDuration, pollInterval) + }, 5*time.Minute, pollInterval) } diff --git a/test/e2e/webhook_support_test.go b/test/e2e/webhook_support_test.go index 1c80c615be..f04c16fca8 100644 --- a/test/e2e/webhook_support_test.go +++ b/test/e2e/webhook_support_test.go @@ -5,6 +5,7 @@ import ( "fmt" "os" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -138,6 +139,8 @@ func TestWebhookSupport(t *testing.T) { }) t.Log("By waiting for webhook-operator extension to be installed successfully") + // Use 5 minutes for webhook installation as it requires webhook cert generation via cert-manager, + // which adds additional time on top of leader election and reconciliation in multi-replica deployments. 
require.EventuallyWithT(t, func(ct *assert.CollectT) { require.NoError(ct, c.Get(t.Context(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension)) cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled) @@ -147,7 +150,7 @@ func TestWebhookSupport(t *testing.T) { require.Contains(ct, cond.Message, "Installed bundle") require.NotNil(ct, clusterExtension.Status.Install) require.NotEmpty(ct, clusterExtension.Status.Install.Bundle) - }, pollDuration, pollInterval) + }, 5*time.Minute, pollInterval) t.Log("By waiting for webhook-operator deployment to be available") require.EventuallyWithT(t, func(ct *assert.CollectT) { diff --git a/test/helpers/helpers.go b/test/helpers/helpers.go index af142c6e34..2d99854fa2 100644 --- a/test/helpers/helpers.go +++ b/test/helpers/helpers.go @@ -34,7 +34,10 @@ var ( ) const ( - pollDuration = time.Minute + // pollDuration is set to 3 minutes to account for leader election time in multi-replica deployments. + // In the worst case (previous leader crashed), leader election can take up to 163 seconds + // (LeaseDuration: 137s + RetryPeriod: 26s). Adding buffer for reconciliation time. + pollDuration = 3 * time.Minute pollInterval = time.Second testCatalogName = "test-catalog" testCatalogRefEnvVar = "CATALOG_IMG" @@ -289,23 +292,26 @@ func ValidateCatalogUnpackWithName(t *testing.T, catalogName string) { func EnsureNoExtensionResources(t *testing.T, clusterExtensionName string) { ls := labels.Set{"olm.operatorframework.io/owner-name": clusterExtensionName} - // CRDs may take an extra long time to be deleted, and may run into the following error: - // Condition=Terminating Status=True Reason=InstanceDeletionFailed Message="could not list instances: storage is (re)initializing" + // CRDs may take extra time to be deleted in multi-replica deployments due to leader election + // and finalizer processing. Use 3 minutes which accounts for leader election (up to 163s) + // plus buffer for reconciliation overhead. t.Logf("By waiting for CustomResourceDefinitions of %q to be deleted", clusterExtensionName) require.EventuallyWithT(t, func(ct *assert.CollectT) { list := &apiextensionsv1.CustomResourceDefinitionList{} err := c.List(context.Background(), list, client.MatchingLabelsSelector{Selector: ls.AsSelector()}) require.NoError(ct, err) require.Empty(ct, list.Items) - }, 5*pollDuration, pollInterval) + }, pollDuration, pollInterval) + // ClusterRoleBindings and ClusterRoles deletion is typically faster than CRDs. + // Use pollDuration (3 minutes) to account for leader election in multi-replica deployments. 
t.Logf("By waiting for ClusterRoleBindings of %q to be deleted", clusterExtensionName) require.EventuallyWithT(t, func(ct *assert.CollectT) { list := &rbacv1.ClusterRoleBindingList{} err := c.List(context.Background(), list, client.MatchingLabelsSelector{Selector: ls.AsSelector()}) require.NoError(ct, err) require.Empty(ct, list.Items) - }, 2*pollDuration, pollInterval) + }, pollDuration, pollInterval) t.Logf("By waiting for ClusterRoles of %q to be deleted", clusterExtensionName) require.EventuallyWithT(t, func(ct *assert.CollectT) { @@ -313,7 +319,7 @@ func EnsureNoExtensionResources(t *testing.T, clusterExtensionName string) { err := c.List(context.Background(), list, client.MatchingLabelsSelector{Selector: ls.AsSelector()}) require.NoError(ct, err) require.Empty(ct, list.Items) - }, 2*pollDuration, pollInterval) + }, pollDuration, pollInterval) } func TestCleanup(t *testing.T, cat *ocv1.ClusterCatalog, clusterExtension *ocv1.ClusterExtension, sa *corev1.ServiceAccount, ns *corev1.Namespace) { diff --git a/test/upgrade-e2e/post_upgrade_test.go b/test/upgrade-e2e/post_upgrade_test.go index 785d91ea3b..a8cd8fc810 100644 --- a/test/upgrade-e2e/post_upgrade_test.go +++ b/test/upgrade-e2e/post_upgrade_test.go @@ -25,6 +25,11 @@ import ( const ( artifactName = "operator-controller-upgrade-e2e" container = "manager" + // pollDuration is set to 3 minutes to account for leader election time in multi-replica deployments. + // In the worst case (previous leader crashed), leader election can take up to 163 seconds + // (LeaseDuration: 137s + RetryPeriod: 26s). Adding buffer for reconciliation time. + pollDuration = 3 * time.Minute + pollInterval = time.Second ) func TestClusterCatalogUnpacking(t *testing.T) { @@ -45,23 +50,22 @@ func TestClusterCatalogUnpacking(t *testing.T) { require.Equal(ct, *managerDeployment.Spec.Replicas, managerDeployment.Status.ReadyReplicas) }, time.Minute, time.Second) - var managerPod corev1.Pod - t.Log("Waiting for only one controller-manager pod to remain") + t.Log("Waiting for controller-manager pods to match the desired replica count") require.EventuallyWithT(t, func(ct *assert.CollectT) { var managerPods corev1.PodList err := c.List(ctx, &managerPods, client.MatchingLabels(managerLabelSelector)) require.NoError(ct, err) - require.Len(ct, managerPods.Items, 1) - managerPod = managerPods.Items[0] + require.Len(ct, managerPods.Items, int(*managerDeployment.Spec.Replicas)) }, time.Minute, time.Second) t.Log("Waiting for acquired leader election") leaderCtx, leaderCancel := context.WithTimeout(ctx, 3*time.Minute) defer leaderCancel() - leaderSubstrings := []string{"successfully acquired lease"} - leaderElected, err := watchPodLogsForSubstring(leaderCtx, &managerPod, leaderSubstrings...) + + // When there are multiple replicas, find the leader pod + managerPod, err := findLeaderPod(leaderCtx, "catalogd") require.NoError(t, err) - require.True(t, leaderElected) + require.NotNil(t, managerPod) t.Log("Reading logs to make sure that ClusterCatalog was reconciled by catalogdv1") logCtx, cancel := context.WithTimeout(ctx, time.Minute) @@ -70,7 +74,7 @@ func TestClusterCatalogUnpacking(t *testing.T) { "reconcile ending", fmt.Sprintf(`ClusterCatalog=%q`, testClusterCatalogName), } - found, err := watchPodLogsForSubstring(logCtx, &managerPod, substrings...) + found, err := watchPodLogsForSubstring(logCtx, managerPod, substrings...) 
require.NoError(t, err) require.True(t, found) @@ -103,11 +107,18 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) { // wait for catalogd deployment to finish t.Log("Wait for catalogd deployment to be ready") - catalogdManagerPod := waitForDeployment(t, ctx, "catalogd") + waitForDeployment(t, ctx, "catalogd") + + // Find the catalogd leader pod + catalogdLeaderCtx, catalogdLeaderCancel := context.WithTimeout(ctx, 3*time.Minute) + defer catalogdLeaderCancel() + catalogdManagerPod, err := findLeaderPod(catalogdLeaderCtx, "catalogd") + require.NoError(t, err) + require.NotNil(t, catalogdManagerPod) // wait for operator-controller deployment to finish t.Log("Wait for operator-controller deployment to be ready") - managerPod := waitForDeployment(t, ctx, "operator-controller") + waitForDeployment(t, ctx, "operator-controller") t.Log("Wait for acquired leader election") // Average case is under 1 minute but in the worst case: (previous leader crashed) @@ -115,10 +126,11 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) { leaderCtx, leaderCancel := context.WithTimeout(ctx, 3*time.Minute) defer leaderCancel() - leaderSubstrings := []string{"successfully acquired lease"} - leaderElected, err := watchPodLogsForSubstring(leaderCtx, managerPod, leaderSubstrings...) + // When there are multiple replicas, find the leader pod + var managerPod *corev1.Pod + managerPod, err = findLeaderPod(leaderCtx, "operator-controller") require.NoError(t, err) - require.True(t, leaderElected) + require.NotNil(t, managerPod) t.Log("Reading logs to make sure that ClusterExtension was reconciled by operator-controller before we update it") // Make sure that after we upgrade OLM itself we can still reconcile old objects without any changes @@ -154,7 +166,7 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) { require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason) require.True(ct, clusterCatalog.Status.LastUnpacked.After(catalogdManagerPod.CreationTimestamp.Time)) - }, time.Minute, time.Second) + }, pollDuration, pollInterval) // TODO: if we change the underlying revision storage mechanism, the new version // will not detect any installed versions, we need to make sure that the upgrade @@ -172,7 +184,7 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) { require.Contains(ct, cond.Message, "Installed bundle") require.NotNil(ct, clusterExtension.Status.Install) require.NotEmpty(ct, clusterExtension.Status.Install.Bundle.Version) - }, time.Minute, time.Second) + }, pollDuration, pollInterval) previousVersion := clusterExtension.Status.Install.Bundle.Version @@ -182,6 +194,13 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) { require.NoError(t, c.Update(ctx, &clusterExtension)) t.Log("Checking that the ClusterExtension installs successfully") + // Use 10 minutes for post-OLM-upgrade extension upgrade operations. 
+ // After upgrading OLM itself, the system needs time to: + // - Stabilize after operator-controller pods restart (leader election: up to 163s) + // - Process the ClusterExtension spec change + // - Resolve and unpack the new bundle (1.0.1) + // - Apply manifests and wait for rollout + // In multi-replica deployments with recent OLM upgrade, this can take significant time require.EventuallyWithT(t, func(ct *assert.CollectT) { require.NoError(ct, c.Get(ctx, types.NamespacedName{Name: testClusterExtensionName}, &clusterExtension)) cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled) @@ -190,14 +209,13 @@ func TestClusterExtensionAfterOLMUpgrade(t *testing.T) { require.Contains(ct, cond.Message, "Installed bundle") require.Equal(ct, ocv1.BundleMetadata{Name: "test-operator.1.0.1", Version: "1.0.1"}, clusterExtension.Status.Install.Bundle) require.NotEqual(ct, previousVersion, clusterExtension.Status.Install.Bundle.Version) - }, time.Minute, time.Second) + }, 10*time.Minute, pollInterval) } // waitForDeployment checks that the updated deployment with the given app.kubernetes.io/name label // has reached the desired number of replicas and that the number pods matches that number -// i.e. no old pods remain. It will return a pointer to the first pod. This is only necessary -// to facilitate the mitigation put in place for https://github.com/operator-framework/operator-controller/issues/1626 -func waitForDeployment(t *testing.T, ctx context.Context, controlPlaneLabel string) *corev1.Pod { +// i.e. no old pods remain. +func waitForDeployment(t *testing.T, ctx context.Context, controlPlaneLabel string) { deploymentLabelSelector := labels.Set{"app.kubernetes.io/name": controlPlaneLabel}.AsSelector() t.Log("Checking that the deployment is updated") @@ -217,13 +235,101 @@ func waitForDeployment(t *testing.T, ctx context.Context, controlPlaneLabel stri desiredNumReplicas = *managerDeployment.Spec.Replicas }, time.Minute, time.Second) - var managerPods corev1.PodList t.Logf("Ensure the number of remaining pods equal the desired number of replicas (%d)", desiredNumReplicas) require.EventuallyWithT(t, func(ct *assert.CollectT) { + var managerPods corev1.PodList require.NoError(ct, c.List(ctx, &managerPods, client.MatchingLabelsSelector{Selector: deploymentLabelSelector})) - require.Len(ct, managerPods.Items, 1) + require.Len(ct, managerPods.Items, int(desiredNumReplicas)) }, time.Minute, time.Second) - return &managerPods.Items[0] +} + +// findLeaderPod finds the pod that has acquired the leader lease by checking logs of all pods +func findLeaderPod(ctx context.Context, controlPlaneLabel string) (*corev1.Pod, error) { + deploymentLabelSelector := labels.Set{"app.kubernetes.io/name": controlPlaneLabel}.AsSelector() + + // If there's only one pod, it must be the leader + // First, do a quick check to see if we can find the leader in existing logs + var leaderPod *corev1.Pod + leaderSubstrings := []string{"successfully acquired lease"} + + // Retry logic: keep checking until we find a leader or context times out + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return nil, fmt.Errorf("timeout waiting for leader election: %w", ctx.Err()) + case <-ticker.C: + var managerPods corev1.PodList + if err := c.List(ctx, &managerPods, client.MatchingLabelsSelector{Selector: deploymentLabelSelector}); err != nil { + continue + } + + if len(managerPods.Items) == 0 { + continue + } + + // If there's only one pod, it must 
be the leader + if len(managerPods.Items) == 1 { + return &managerPods.Items[0], nil + } + + // Check each pod's existing logs (without following) for leader election message + for i := range managerPods.Items { + pod := &managerPods.Items[i] + + // Check existing logs only (don't follow) + isLeader, err := checkPodLogsForSubstring(ctx, pod, leaderSubstrings...) + if err != nil { + // If we can't read logs from this pod, try the next one + continue + } + if isLeader { + leaderPod = pod + break + } + } + + if leaderPod != nil { + return leaderPod, nil + } + // No leader found yet, retry after ticker interval + } + } +} + +// checkPodLogsForSubstring checks existing pod logs (without following) for the given substrings +func checkPodLogsForSubstring(ctx context.Context, pod *corev1.Pod, substrings ...string) (bool, error) { + podLogOpts := corev1.PodLogOptions{ + Follow: false, // Don't follow, just read existing logs + Container: container, + } + + req := kclientset.CoreV1().Pods(pod.Namespace).GetLogs(pod.Name, &podLogOpts) + podLogs, err := req.Stream(ctx) + if err != nil { + return false, err + } + defer podLogs.Close() + + scanner := bufio.NewScanner(podLogs) + for scanner.Scan() { + line := scanner.Text() + + foundCount := 0 + for _, substring := range substrings { + if strings.Contains(line, substring) { + foundCount++ + } + } + if foundCount == len(substrings) { + return true, nil + } + } + + // If we didn't find the substrings, return false (not an error) + return false, nil } func watchPodLogsForSubstring(ctx context.Context, pod *corev1.Pod, substrings ...string) (bool, error) { diff --git a/testdata/build-test-registry.sh b/testdata/build-test-registry.sh index e2dcc09148..7dee9e3c11 100755 --- a/testdata/build-test-registry.sh +++ b/testdata/build-test-registry.sh @@ -103,7 +103,7 @@ spec: type: NodePort EOF -kubectl wait --for=condition=Available -n "${namespace}" "deploy/${name}" --timeout=60s +kubectl wait --for=condition=Available -n "${namespace}" "deploy/${name}" --timeout=3m kubectl apply -f - << EOF apiVersion: batch/v1 @@ -135,4 +135,4 @@ spec: secretName: ${namespace}-registry EOF -kubectl wait --for=condition=Complete -n "${namespace}" "job/${name}-push" --timeout=60s +kubectl wait --for=condition=Complete -n "${namespace}" "job/${name}-push" --timeout=3m
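A note on the 163-second figure cited in the new test comments: it is the sum of the leader-election LeaseDuration (137s) and RetryPeriod (26s), since a crashed leader's lease must first expire and a standby must then win a retry before reconciliation resumes. The sketch below shows how such durations would be wired into a controller-runtime manager. It is illustrative only, not this repository's actual manager setup; the RenewDeadline value and the lock name are assumptions that do not appear in this patch.

    package main

    import (
        "log"
        "time"

        ctrl "sigs.k8s.io/controller-runtime"
    )

    func main() {
        // Durations cited in the test comments of this patch.
        leaseDuration := 137 * time.Second // how long a crashed leader's lease stays valid
        retryPeriod := 26 * time.Second    // how often a standby retries acquisition
        // Assumed value, not stated in this patch.
        renewDeadline := 107 * time.Second

        mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
            LeaderElection:   true,
            LeaderElectionID: "example.operatorframework.io", // hypothetical lock name
            LeaseDuration:    &leaseDuration,
            RenewDeadline:    &renewDeadline,
            RetryPeriod:      &retryPeriod,
        })
        if err != nil {
            log.Fatal(err)
        }

        // Worst case after a leader crash: LeaseDuration + RetryPeriod = 163s
        // before another replica starts reconciling.
        if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
            log.Fatal(err)
        }
    }

With two replicas, a clean rollout hands the lease off almost immediately, but an unclean leader exit can leave the standby idle for LeaseDuration plus one RetryPeriod, which is the worst case behind the 3-minute pollDuration and the 5- to 10-minute bounds on the slower checks.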