@@ -28,14 +28,14 @@ import (
2828 "time"
2929
3030 "github.com/rs/zerolog"
31- "k8s.io/api/core/v1"
3231 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3332
3433 driver "github.com/arangodb/go-driver"
3534 "github.com/arangodb/go-driver/agency"
3635 api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
3736 "github.com/arangodb/kube-arangodb/pkg/util/arangod"
3837 "github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
38+ v1 "k8s.io/api/core/v1"
3939)
4040
4141// prepareAgencyPodTermination checks if the given agency pod is allowed to terminate
@@ -137,6 +137,16 @@ func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerol
137137 return nil
138138 }
139139
140+ resignJobAvailable := false
141+ currentVersion := memberStatus .ArangoVersion
142+ if currentVersion != "" {
143+ if currentVersion .CompareTo ("3.4.7" ) > 0 && currentVersion .CompareTo ("3.5" ) < 0 {
144+ resignJobAvailable = true
145+ } else if currentVersion .CompareTo ("3.5.0" ) > 0 {
146+ resignJobAvailable = true
147+ }
148+ }
149+
140150 // Check node the pod is scheduled on
141151 dbserverDataWillBeGone := false
142152 if p .Spec .NodeName != "" {
@@ -147,7 +157,9 @@ func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerol
147157 log .Warn ().Err (err ).Msg ("Failed to get node for member" )
148158 return maskAny (err )
149159 } else if node .Spec .Unschedulable {
150- dbserverDataWillBeGone = true
160+ if r .context .GetSpec ().IsLocallyAttachedVolumes () || ! resignJobAvailable {
161+ dbserverDataWillBeGone = true
162+ }
151163 }
152164 }
153165
@@ -168,13 +180,12 @@ func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerol
168180 }
169181
170182 // Is this a simple pod restart?
171- if ! dbserverDataWillBeGone {
183+ if ! dbserverDataWillBeGone && ! resignJobAvailable {
172184 log .Debug ().Msg ("Pod is just being restarted, safe to remove dbserver pod" )
173185 return nil
174186 }
175187
176188 // Inspect cleaned out state
177- log .Debug ().Msg ("DBServer data is being deleted, so we will cleanout the dbserver first" )
178189 c , err := r .context .GetDatabaseClient (ctx )
179190 if err != nil {
180191 log .Debug ().Err (err ).Msg ("Failed to create member client" )
@@ -201,7 +212,7 @@ func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerol
201212 }
202213 // Not cleaned out yet, check member status
203214 if memberStatus .Conditions .IsTrue (api .ConditionTypeTerminated ) {
204- log .Warn ().Msg ("Member is already terminated before it could be cleaned out. Not good, but removing dbserver pod because we cannot do anything further" )
215+ log .Warn ().Msg ("Member is already terminated before it could resign or be cleaned out. Not good, but removing dbserver pod because we cannot do anything further" )
205216 // At this point we have to set CleanedOut to true,
206217 // because we can no longer reason about the state in the agency and
207218 // bringing back the dbserver again may result in a cleaned out server without us knowing
@@ -220,13 +231,24 @@ func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerol
220231 var jobID string
221232 ctx = driver .WithJobIDResponse (ctx , & jobID )
222233 // Ensure the cleanout or resign job is triggered
223- log .Debug ().Msg ("Server is not yet clean out. Triggering a clean out now" )
224- if err := cluster .CleanOutServer (ctx , memberStatus .ID ); err != nil {
225- log .Debug ().Err (err ).Msg ("Failed to clean out server" )
226- return maskAny (err )
234+ if dbserverDataWillBeGone {
235+ log .Debug ().Msg ("Server is not yet cleaned out. Triggering a clean out now" )
236+ if err := cluster .CleanOutServer (ctx , memberStatus .ID ); err != nil {
237+ log .Debug ().Err (err ).Msg ("Failed to clean out server" )
238+ return maskAny (err )
239+ }
240+ memberStatus .Phase = api .MemberPhaseDrain
241+ } else {
242+ log .Debug ().Msg ("Temporary shutdown, resign leadership" )
243+ if err := cluster .ResignServer (ctx , memberStatus .ID ); err != nil {
244+ log .Debug ().Err (err ).Msg ("Failed to resign server" )
245+ return maskAny (err )
246+ }
247+ memberStatus .Phase = api .MemberPhaseResign
227248 }
249+
228250 memberStatus .CleanoutJobID = jobID
229- memberStatus . Phase = api . MemberPhaseDrain
251+
230252 if err := updateMember (memberStatus ); err != nil {
231253 return maskAny (err )
232254 }
@@ -239,18 +261,54 @@ func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerol
239261 }
240262 jobStatus , err := arangod .CleanoutServerJobStatus (ctx , memberStatus .CleanoutJobID , c , agency )
241263 if err != nil {
242- log .Debug ().Err (err ).Msg ("Failed to fetch cleanout job status" )
264+ log .Debug ().Err (err ).Msg ("Failed to fetch job status" )
265+ return maskAny (err )
266+ }
267+ if jobStatus .IsFailed () {
268+ log .Warn ().Str ("reason" , jobStatus .Reason ()).Msg ("Job failed" )
269+ // Revert cleanout state
270+ memberStatus .Phase = api .MemberPhaseCreated
271+ memberStatus .CleanoutJobID = ""
272+ if err := updateMember (memberStatus ); err != nil {
273+ return maskAny (err )
274+ }
275+ log .Error ().Msg ("Cleanout/Resign server job failed, continue anyway" )
276+ return nil
277+ }
278+ if jobStatus .IsFinished () {
279+ memberStatus .CleanoutJobID = ""
280+ memberStatus .Phase = api .MemberPhaseCreated
281+ }
282+ } else if memberStatus .Phase == api .MemberPhaseResign {
283+ // Check the job progress
284+ agency , err := r .context .GetAgency (ctx )
285+ if err != nil {
286+ log .Debug ().Err (err ).Msg ("Failed to create agency client" )
287+ return maskAny (err )
288+ }
289+ jobStatus , err := arangod .CleanoutServerJobStatus (ctx , memberStatus .CleanoutJobID , c , agency )
290+ if err != nil {
291+ log .Debug ().Err (err ).Msg ("Failed to fetch job status" )
243292 return maskAny (err )
244293 }
245294 if jobStatus .IsFailed () {
246- log .Warn ().Str ("reason" , jobStatus .Reason ()).Msg ("Cleanout Job failed" )
295+ log .Warn ().Str ("reason" , jobStatus .Reason ()).Msg ("Resign Job failed" )
247296 // Revert resign state
248297 memberStatus .Phase = api .MemberPhaseCreated
249298 memberStatus .CleanoutJobID = ""
250299 if err := updateMember (memberStatus ); err != nil {
251300 return maskAny (err )
252301 }
253- log .Error ().Msg ("Cleanout server job failed, continue anyway" )
302+ log .Error ().Msg ("Cleanout/Resign server job failed, continue anyway" )
303+ return nil
304+ }
305+ if jobStatus .IsFinished () {
306+ log .Debug ().Str ("reason" , jobStatus .Reason ()).Msg ("Resign Job finished" )
307+ memberStatus .CleanoutJobID = ""
308+ memberStatus .Phase = api .MemberPhaseCreated
309+ if err := updateMember (memberStatus ); err != nil {
310+ return maskAny (err )
311+ }
254312 return nil
255313 }
256314 }
0 commit comments