@@ -88,6 +88,7 @@ def __init__(
8888 if ver_found < len (versions ):
8989 print ("One deployment doesn't support starters with more nodes!" )
9090 self .props .cluster_nodes = 3
91+ self .backup_instance_count = self .props .cluster_nodes
9192
9293 def starter_prepare_env_impl (self , sm = None , more_opts = None ):
9394 # pylint: disable=invalid-name
@@ -146,7 +147,8 @@ def add_starter(name, port, opts, sm, hasAgency):
146147 self .create_tls_ca_cert ()
147148 port = 9528
148149 count = 0
149- for this_node in list (range (1 , self .props .cluster_nodes + 1 )):
150+ full_node_count = self .props .cluster_nodes + 2 # we need 2 additional nodes for hotbackup testing
151+ for this_node in list (range (1 , full_node_count + 1 )):
150152 node = []
151153 node_opts .append (node )
152154 if this_node != 1 :
@@ -158,44 +160,48 @@ def add_starter(name, port, opts, sm, hasAgency):
158160 add_starter (f"node{ this_node } " , port , node + common_opts , sm , count < 3 )
159161 port += 100
160162 count += 1
161- self .backup_instance_count = count
162163 for instance in self .starter_instances :
163164 instance .is_leader = True
164165
def starter_run_impl(self):
    """Launch the initial cluster starters and wait until they are usable.

    Only the first ``self.props.cluster_nodes`` entries of
    ``self.starter_instances`` are launched here; any further entries are
    left unstarted and only get the passvoid assigned.

    Raises:
        Exception: if the launched starters are not all up after more than
            120 polling rounds (one second sleep per round).
    """
    lh.subsection("instance setup")
    for manager in self.starter_instances[: self.props.cluster_nodes]:
        logging.info("Spawning instance")
        manager.run_starter()

    logging.info("waiting for the starters to become alive")
    # Work on a private copy: entries are popped below, and that must never
    # shrink a list owned by somebody else.
    pending = list(self.get_running_starters())
    count = 0
    while pending:
        logging.debug("waiting for mananger with logfile:" + str(pending[-1].log_file))
        if pending[-1].is_instance_up():
            pending.pop()
        progress(".")
        time.sleep(1)
        count += 1
        if count > 120:
            raise Exception("Cluster installation didn't come up in two minutes!")

    logging.info("waiting for the cluster instances to become alive")
    for node in self.get_running_starters():
        node.detect_instances()
        node.detect_instance_pids()
        # self.cfg.add_frontend('http', self.cfg.publicip, str(node.get_frontend_port()))

    logging.info("instances are ready - JWT: " + self.starter_instances[0].get_jwt_header())
    # Only the first running starter gets True as second argument.
    for index, node in enumerate(self.get_running_starters()):
        node.set_passvoid("cluster", index == 0)
    for node in self.get_not_running_starters():
        node.set_passvoid("cluster", False)

    # Keep the runner and its configuration object(s) in sync.
    self.passvoid = "cluster"
    self.cfg.passvoid = self.passvoid
    if self.new_cfg:
        self.new_cfg.passvoid = self.passvoid
196202
def finish_setup_impl(self):
    """Record the running starters as makedata targets, then set up frontends."""
    running_starters = self.get_running_starters()
    self.makedata_instances = running_starters
    self.set_frontend_instances()
200206
201207 def _check_for_shards_in_sync (self ):
@@ -491,12 +497,12 @@ def jam_attempt_impl(self):
491497 # After attempt of jamming, we have peer for nodeX in setup.json.
492498 # This peer will brake further updates because this peer is unavailable.
493499 # It is necessary to remove this peer from json for each starter instance
494- for instance in self .starter_instances :
500+ for instance in self .get_running_starters () :
495501 remove_node_x_from_json (instance .basedir )
496502
def shutdown_impl(self):
    """Terminate every running starter.

    Returns:
        A truthy value if at least one ``terminate_instance()`` call returned
        a truthy value, otherwise ``False``.
    """
    ret = False
    for node in self.get_running_starters():
        # The call has to be the left operand: with `ret or node...()` the
        # `or` short-circuits once ret is truthy and the remaining nodes
        # would never be terminated.
        ret = node.terminate_instance() or ret
    logging.info("test ended")
    return ret
@@ -536,3 +542,100 @@ def generate_keyfile(self, keyfile):
536542 "--host=localhost" ,
537543 ]
538544 )
545+
# pylint: disable=too-many-statements
@step
def test_hotbackup_impl(self):
    """test hotbackup feature: Cluster"""
    # Scenario: create backups while the set of running starters changes
    # (a spare starter is added, the dbserver of an original starter is
    # removed - twice), and restore the earlier backups in between.
    # NOTE(review): the docstring above is left untouched on purpose; the
    # @step decorator may use it as the reported step title - confirm.
    backup_prefix = "thy_name_is_" + self.name

    with step("step 1: create a backup"):
        backup_step_1 = self.create_backup_and_upload(backup_prefix)

    with step("step 2: add new db server"):
        # Snapshot taken *before* the spare starter is launched; its
        # entries are what steps 4 and 13 remove.
        old_servers = self.get_running_starters()
        self._start_spare_starter()

    with step("step 3: create a backup"):
        backup_step_3 = self.create_backup_and_upload(backup_prefix + "_plus1_server")

    with step("step 4: remove old db server"):
        self.remove_starter_dbserver(old_servers[0])

    with step("step 5: create another backup"):
        self.create_backup_and_upload(backup_prefix + "_plus1_server_minus1_server", False)

    with step("step 6: create non-backup data"):
        self.create_non_backup_data()
        self.tcp_ping_all_nodes()

    with step("step 7: download and restore backup from step 1"):
        self._download_and_restore_backup(backup_step_1)

    with step("step 8: check data"):
        self.check_data_impl()
        if not self.check_non_backup_data():
            raise Exception("data created after backup is still there??")

    with step("step 9: add new db server"):
        self._start_spare_starter()

    with step("step 10: create non-backup data"):
        self.create_non_backup_data()
        self.tcp_ping_all_nodes()

    with step("step 11: download and restore backup from step 3"):
        self._download_and_restore_backup(backup_step_3)

    with step("step 12: check data"):
        self.check_data_impl()

    with step("step 13: remove old db server"):
        self.remove_starter_dbserver(old_servers[1])

    with step("step 14: create non-backup data"):
        self.create_non_backup_data()
        self.tcp_ping_all_nodes()

def _start_spare_starter(self):
    """Launch the last not-yet-running starter and refresh the bookkeeping.

    Raises:
        Exception: if there is no starter left that is not running.
    """
    spare_starters = self.get_not_running_starters()
    if not spare_starters:
        # Fail with a readable message instead of an IndexError on [-1].
        raise Exception("no spare starter left to add as new db server")
    spare_starters[-1].run_starter_and_wait()
    self.backup_instance_count += 1
    self.makedata_instances = self.get_running_starters()

def _download_and_restore_backup(self, backup_name):
    """Download, validate and restore `backup_name`, then ping all nodes.

    Raises:
        Exception: if `backup_name` is not in the result of
            ``list_backup()`` after the download.
    """
    self.download_backup(backup_name)
    self.validate_local_backup(backup_name)
    backups = self.list_backup()
    if backup_name not in backups:
        raise Exception("downloaded backup has different name? " + str(backups))
    self.restore_backup(backup_name)
    self.tcp_ping_all_nodes()
615+
@step
def remove_starter_dbserver(self, starter):
    """remove dbserver managed by given starter from cluster"""
    # The UUID has to be read before the dbserver is stopped; it is what
    # gets handed to remove_server_from_agency() afterwards.
    dbserver = starter.get_dbserver()
    removed_uuid = dbserver.get_uuid()
    starter.stop_dbserver()
    self.remove_server_from_agency(removed_uuid)

    # One instance less; refresh the list of makedata targets.
    self.backup_instance_count -= 1
    self.makedata_instances = self.get_running_starters()
624+
@step
def test_hotbackup_after_upgrade_impl(self):
    """test hotbackup after upgrade: cluster"""
    with step("step 1: check data"):
        self.check_data_impl()

    with step("step 2: download backup"):
        # The most recently uploaded backup is the one restored below.
        backup_name = self.uploaded_backups[-1]
        self.download_backup(backup_name)
        available_backups = self.list_backup()
        if backup_name not in available_backups:
            raise Exception("downloaded backup has different name? " + str(available_backups))

    with step("step 3: restore backup"):
        self.restore_backup(backup_name)
        self.tcp_ping_all_nodes()

    # We don't run checkdata after the restore in this function, because it
    # is run afterwards in runner.py.
    with step("step 4: delete backups"):
        self.delete_all_backups()
0 commit comments