Skip to content

Commit 00affaf

Browse files
mayankshah1607Copilotegegunes
authored
K8SPSMDB-1493: fix stuck backups when smart update is in progress (#2143)
* fix backup/smart-upgrde deadlock Signed-off-by: Mayank Shah <mayank.shah@percona.com> * add comments Signed-off-by: Mayank Shah <mayank.shah@percona.com> * fix unit test Signed-off-by: Mayank Shah <mayank.shah@percona.com> * unit test Signed-off-by: Mayank Shah <mayank.shah@percona.com> * unit tests for conditions Signed-off-by: Mayank Shah <mayank.shah@percona.com> * linting Signed-off-by: Mayank Shah <mayank.shah@percona.com> * remove debug statements Signed-off-by: Mayank Shah <mayank.shah@percona.com> * linting Signed-off-by: Mayank Shah <mayank.shah@percona.com> * Update pkg/controller/perconaservermongodb/status_test.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update pkg/apis/psmdb/v1/psmdb_types.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * fix test semantics Signed-off-by: Mayank Shah <mayank.shah@percona.com> * wrap error Signed-off-by: Mayank Shah <mayank.shah@percona.com> --------- Signed-off-by: Mayank Shah <mayank.shah@percona.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Ege Güneş <ege.gunes@percona.com>
1 parent e300d85 commit 00affaf

File tree

5 files changed

+251
-0
lines changed

5 files changed

+251
-0
lines changed
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
package v1
2+
3+
import (
4+
"testing"
5+
6+
"github.com/stretchr/testify/assert"
7+
)
8+
9+
func TestConditions(t *testing.T) {
10+
status := &PerconaServerMongoDBStatus{}
11+
12+
cond := status.FindCondition(AppStateReady)
13+
assert.Nil(t, cond)
14+
15+
status.AddCondition(ClusterCondition{
16+
Type: AppStateReady,
17+
Status: ConditionTrue,
18+
Reason: "ClusterReady",
19+
Message: "Cluster is ready",
20+
})
21+
cond = status.FindCondition(AppStateReady)
22+
assert.NotNil(t, cond)
23+
assert.Equal(t, ConditionTrue, cond.Status)
24+
assert.Equal(t, "ClusterReady", cond.Reason)
25+
assert.Equal(t, "Cluster is ready", cond.Message)
26+
lastTransitionTime := cond.LastTransitionTime
27+
assert.NotNil(t, lastTransitionTime)
28+
assert.True(t, status.IsStatusConditionTrue(AppStateReady))
29+
30+
status.AddCondition(ClusterCondition{
31+
Type: AppStateReady,
32+
Status: ConditionFalse,
33+
Reason: "ClusterNotReady",
34+
Message: "Cluster is not ready",
35+
})
36+
cond = status.FindCondition(AppStateReady)
37+
assert.NotNil(t, cond)
38+
assert.Equal(t, ConditionFalse, cond.Status)
39+
assert.Equal(t, "ClusterNotReady", cond.Reason)
40+
assert.Equal(t, "Cluster is not ready", cond.Message)
41+
assert.NotNil(t, cond.LastTransitionTime)
42+
assert.True(t, cond.LastTransitionTime.After(lastTransitionTime.Time))
43+
assert.False(t, status.IsStatusConditionTrue(AppStateReady))
44+
45+
status.RemoveCondition(AppStateReady)
46+
assert.False(t, status.IsStatusConditionTrue(AppStateReady))
47+
assert.Nil(t, status.FindCondition(AppStateReady))
48+
}

pkg/apis/psmdb/v1/psmdb_types.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,10 @@ const (
339339
ConditionUnknown ConditionStatus = "Unknown"
340340
)
341341

342+
// ConditionTypePendingSmartUpdate is a condition type set on PSMDBCluster when a smart update is required
343+
// but has not yet started. For e.g., if a backup/restore is running at the same time as a smart update is triggered.
344+
const ConditionTypePendingSmartUpdate AppState = "pendingSmartUpdate"
345+
342346
type ClusterCondition struct {
343347
Status ConditionStatus `json:"status"`
344348
Type AppState `json:"type"`
@@ -357,6 +361,14 @@ func (s *PerconaServerMongoDBStatus) FindCondition(conditionType AppState) *Clus
357361
return nil
358362
}
359363

364+
func (s *PerconaServerMongoDBStatus) IsStatusConditionTrue(conditionType AppState) bool {
365+
cond := s.FindCondition(conditionType)
366+
if cond == nil {
367+
return false
368+
}
369+
return cond.Status == ConditionTrue
370+
}
371+
360372
type PMMSpec struct {
361373
Enabled bool `json:"enabled,omitempty"`
362374
ServerHost string `json:"serverHost,omitempty"`
@@ -1386,6 +1398,10 @@ func (cr *PerconaServerMongoDB) CanBackup(ctx context.Context) error {
13861398
return nil
13871399
}
13881400

1401+
if cr.Status.State == AppStateInit && cr.Status.IsStatusConditionTrue(ConditionTypePendingSmartUpdate) {
1402+
return nil
1403+
}
1404+
13891405
if cr.CompareVersion("1.15.0") <= 0 && !cr.Spec.UnsafeConf {
13901406
return errors.Errorf("allowUnsafeConfigurations must be true to run backup on cluster with status %s", cr.Status.State)
13911407
}
@@ -1414,6 +1430,15 @@ func (cr *PerconaServerMongoDB) CanRestore(ctx context.Context) error {
14141430
return nil
14151431
}
14161432

1433+
func (s *PerconaServerMongoDBStatus) RemoveCondition(conditionType AppState) {
1434+
for i, c := range s.Conditions {
1435+
if c.Type == conditionType {
1436+
s.Conditions = append(s.Conditions[:i], s.Conditions[i+1:]...)
1437+
return
1438+
}
1439+
}
1440+
}
1441+
14171442
func (s *PerconaServerMongoDBStatus) AddCondition(c ClusterCondition) {
14181443
existingCondition := s.FindCondition(c.Type)
14191444
if existingCondition == nil {

pkg/controller/perconaservermongodb/connections_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,9 @@ func fakeStatefulset(cr *api.PerconaServerMongoDB, rs *api.ReplsetSpec, size int
355355
},
356356
Spec: appsv1.StatefulSetSpec{
357357
Replicas: &size,
358+
Selector: &metav1.LabelSelector{
359+
MatchLabels: ls,
360+
},
358361
Template: corev1.PodTemplateSpec{
359362
Spec: corev1.PodSpec{
360363
Containers: []corev1.Container{

pkg/controller/perconaservermongodb/status.go

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,11 +233,61 @@ func (r *ReconcilePerconaServerMongoDB) updateStatus(ctx context.Context, cr *ap
233233
clusterCondition.Type = cr.Status.State
234234
cr.Status.AddCondition(clusterCondition)
235235

236+
cr.Status.RemoveCondition(api.ConditionTypePendingSmartUpdate)
237+
if awaiting, err := r.isAwaitingSmartUpdate(ctx, cr); err != nil {
238+
return errors.Wrap(err, "cannot check if awaiting smart update")
239+
} else if cr.Status.Ready == cr.Status.Size && awaiting {
240+
cr.Status.AddCondition(api.ClusterCondition{
241+
Type: api.ConditionTypePendingSmartUpdate,
242+
Status: api.ConditionTrue,
243+
Message: "Smart update is pending but has not yet started",
244+
})
245+
}
246+
236247
cr.Status.ObservedGeneration = cr.ObjectMeta.Generation
237248

238249
return r.writeStatus(ctx, cr)
239250
}
240251

252+
// isAwaitingSmartUpdate returns true if the cluster smart update is pending but has not yet started.
253+
func (r *ReconcilePerconaServerMongoDB) isAwaitingSmartUpdate(ctx context.Context, cr *api.PerconaServerMongoDB) (bool, error) {
254+
if cr.Spec.UpdateStrategy != api.SmartUpdateStatefulSetStrategyType {
255+
return false, nil
256+
}
257+
258+
statefulSets := make([]string, 0, len(cr.Spec.Replsets)+2)
259+
for _, rs := range cr.Spec.Replsets {
260+
statefulSets = append(statefulSets, cr.StatefulsetNamespacedName(rs.Name).Name)
261+
}
262+
if cr.Spec.Sharding.Enabled {
263+
statefulSets = append(statefulSets, cr.StatefulsetNamespacedName(api.ConfigReplSetName).Name)
264+
statefulSets = append(statefulSets, cr.MongosNamespacedName().Name)
265+
}
266+
267+
// count the number of updated pods from statefulsets that have changed
268+
var updated int32
269+
stsChanged := false // true if any statefulset has changed
270+
for _, name := range statefulSets {
271+
sts := &appsv1.StatefulSet{}
272+
if err := r.client.Get(ctx, types.NamespacedName{Name: name, Namespace: cr.Namespace}, sts); err != nil {
273+
return false, client.IgnoreNotFound(err)
274+
}
275+
276+
pods := corev1.PodList{}
277+
if err := r.client.List(ctx, &pods, &client.ListOptions{
278+
Namespace: cr.Namespace,
279+
LabelSelector: labels.SelectorFromSet(sts.Spec.Selector.MatchLabels),
280+
}); err != nil {
281+
return false, errors.Wrap(err, "get pod list")
282+
}
283+
if isSfsChanged(sts, &pods) {
284+
updated += sts.Status.UpdatedReplicas
285+
stsChanged = true
286+
}
287+
}
288+
return stsChanged && updated == 0, nil
289+
}
290+
241291
func (r *ReconcilePerconaServerMongoDB) upgradeInProgress(ctx context.Context, cr *api.PerconaServerMongoDB, rsName string) (bool, error) {
242292
sfsObj := &appsv1.StatefulSet{}
243293
err := r.client.Get(ctx, types.NamespacedName{Name: cr.Name + "-" + rsName, Namespace: cr.Namespace}, sfsObj)

pkg/controller/perconaservermongodb/status_test.go

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
fakeBackup "github.com/percona/percona-server-mongodb-operator/pkg/psmdb/backup/fake"
1818
faketls "github.com/percona/percona-server-mongodb-operator/pkg/psmdb/tls/fake"
1919
"github.com/percona/percona-server-mongodb-operator/pkg/version"
20+
"github.com/stretchr/testify/assert"
2021
)
2122

2223
// creates a fake client to mock API calls with the mock objects
@@ -323,3 +324,127 @@ func fakeSvc(name, namespace string, svcType corev1.ServiceType, ip, hostname st
323324
},
324325
}
325326
}
327+
328+
func TestIsAwaitingSmartUpdate(t *testing.T) {
329+
ctx := t.Context()
330+
cr := &api.PerconaServerMongoDB{
331+
ObjectMeta: metav1.ObjectMeta{
332+
Name: "psmdb-mock",
333+
Namespace: "psmdb",
334+
Generation: 1,
335+
},
336+
Spec: api.PerconaServerMongoDBSpec{
337+
Backup: api.BackupSpec{
338+
Enabled: false,
339+
},
340+
CRVersion: version.Version(),
341+
Image: "percona/percona-server-mongodb:latest",
342+
Replsets: []*api.ReplsetSpec{
343+
{
344+
Name: "rs0",
345+
Size: 3,
346+
VolumeSpec: fakeVolumeSpec(t),
347+
},
348+
},
349+
UpdateStrategy: api.SmartUpdateStatefulSetStrategyType,
350+
UpgradeOptions: api.UpgradeOptions{
351+
SetFCV: true,
352+
},
353+
Sharding: api.Sharding{Enabled: false},
354+
},
355+
Status: api.PerconaServerMongoDBStatus{
356+
MongoVersion: "4.2",
357+
ObservedGeneration: 1,
358+
State: api.AppStateReady,
359+
MongoImage: "percona/percona-server-mongodb:4.0",
360+
},
361+
}
362+
sts := fakeStatefulset(cr, cr.Spec.Replsets[0], cr.Spec.Replsets[0].Size, "some-revision", "mongod")
363+
pods := fakePodsForRS(cr, cr.Spec.Replsets[0])
364+
365+
testCases := []struct {
366+
desc string
367+
expected bool
368+
mock func(cl client.Client) error
369+
cluster *api.PerconaServerMongoDB
370+
}{
371+
{
372+
desc: "smart update is disabled",
373+
expected: false,
374+
cluster: updateResource(cr.DeepCopy(), func(cr *api.PerconaServerMongoDB) {
375+
cr.Spec.UpdateStrategy = ""
376+
}),
377+
},
378+
{
379+
desc: "statefulset has not changed",
380+
expected: false,
381+
mock: func(cl client.Client) error {
382+
for _, pod := range pods {
383+
labels := pod.GetLabels()
384+
labels["controller-revision-hash"] = "some-revision"
385+
pod.SetLabels(labels)
386+
if err := cl.Update(ctx, pod); err != nil {
387+
return err
388+
}
389+
}
390+
return nil
391+
},
392+
cluster: cr.DeepCopy(),
393+
},
394+
{
395+
desc: "statefulset has changed, no pods updated",
396+
expected: true,
397+
mock: func(cl client.Client) error {
398+
for _, pod := range pods {
399+
labels := pod.GetLabels()
400+
labels["controller-revision-hash"] = "previous-revision"
401+
pod.SetLabels(labels)
402+
if err := cl.Update(ctx, pod); err != nil {
403+
return err
404+
}
405+
}
406+
fakeSts := sts.DeepCopyObject().(*appsv1.StatefulSet)
407+
fakeSts.Status.UpdatedReplicas = 0
408+
return cl.Status().Update(ctx, fakeSts)
409+
},
410+
cluster: cr.DeepCopy(),
411+
},
412+
{
413+
desc: "statefulset has changed, some pods updated",
414+
expected: false,
415+
mock: func(cl client.Client) error {
416+
outdatedPod := pods[2]
417+
418+
labels := outdatedPod.GetLabels()
419+
labels["controller-revision-hash"] = "previous-revision"
420+
outdatedPod.SetLabels(labels)
421+
if err := cl.Update(ctx, outdatedPod); err != nil {
422+
return err
423+
}
424+
fakeSts := sts.DeepCopyObject().(*appsv1.StatefulSet)
425+
fakeSts.Status.UpdatedReplicas = 2
426+
return cl.Status().Update(ctx, fakeSts)
427+
},
428+
cluster: cr.DeepCopy(),
429+
},
430+
}
431+
432+
for _, tc := range testCases {
433+
t.Run(tc.desc, func(t *testing.T) {
434+
// Setup mocks
435+
objs := []client.Object{}
436+
objs = append(objs, tc.cluster, sts)
437+
objs = append(objs, pods...)
438+
r := buildFakeClient(objs...)
439+
if tc.mock != nil {
440+
err := tc.mock(r.client)
441+
assert.NoError(t, err)
442+
}
443+
444+
actual, err := r.isAwaitingSmartUpdate(ctx, tc.cluster)
445+
assert.NoError(t, err)
446+
assert.Equal(t, tc.expected, actual)
447+
})
448+
449+
}
450+
}

0 commit comments

Comments
 (0)