Skip to content

Commit 269ff04

Browse files
committed
on behalf of @vitaly: rebased hot backup topology change
1 parent dc6a6d2 commit 269ff04

File tree

9 files changed

+336
-114
lines changed

9 files changed

+336
-114
lines changed

release_tester/arangodb/starter/deployments/activefailover.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -337,8 +337,10 @@ def jam_attempt_impl(self):
337337
args = ["--skip", "802_"]
338338
self.checkdata_args = args
339339
ret = curr_leader.arangosh.check_test_data(
340-
"checking active failover new leader node", True, args, log_debug=True
341-
)
340+
"checking active failover new leader node",
341+
True,
342+
args,
343+
log_debug=True)
342344
if not ret[0]:
343345
raise Exception("check data failed " + ret[1])
344346

release_tester/arangodb/starter/deployments/cluster.py

Lines changed: 116 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -88,6 +88,7 @@ def __init__(
8888
if ver_found < len(versions):
8989
print("One deployment doesn't support starters with more nodes!")
9090
self.props.cluster_nodes = 3
91+
self.backup_instance_count = self.props.cluster_nodes
9192

9293
def starter_prepare_env_impl(self, sm=None, more_opts=None):
9394
# pylint: disable=invalid-name
@@ -146,7 +147,8 @@ def add_starter(name, port, opts, sm, hasAgency):
146147
self.create_tls_ca_cert()
147148
port = 9528
148149
count = 0
149-
for this_node in list(range(1, self.props.cluster_nodes + 1)):
150+
full_node_count = self.props.cluster_nodes + 2 # we need 2 additional nodes for hotbackup testing
151+
for this_node in list(range(1, full_node_count + 1)):
150152
node = []
151153
node_opts.append(node)
152154
if this_node != 1:
@@ -158,44 +160,48 @@ def add_starter(name, port, opts, sm, hasAgency):
158160
add_starter(f"node{this_node}", port, node + common_opts, sm, count < 3)
159161
port += 100
160162
count += 1
161-
self.backup_instance_count = count
162163
for instance in self.starter_instances:
163164
instance.is_leader = True
164165

165166
def starter_run_impl(self):
166167
lh.subsection("instance setup")
167-
for manager in self.starter_instances:
168+
for manager in self.starter_instances[: self.props.cluster_nodes]:
168169
logging.info("Spawning instance")
169170
manager.run_starter()
170171

171172
logging.info("waiting for the starters to become alive")
172-
not_started = self.starter_instances[:] # This is a explicit copy
173+
not_running = self.get_running_starters() # This is an explicit copy
173174
count = 0
174-
while not_started:
175-
logging.debug("waiting for mananger with logfile:" + str(not_started[-1].log_file))
176-
if not_started[-1].is_instance_up():
177-
not_started.pop()
175+
while not_running:
176+
logging.debug("waiting for mananger with logfile:" + str(not_running[-1].log_file))
177+
if not_running[-1].is_instance_up():
178+
not_running.pop()
178179
progress(".")
179180
time.sleep(1)
180181
count += 1
181182
if count > 120:
182183
raise Exception("Cluster installation didn't come up in two minutes!")
183184

184185
logging.info("waiting for the cluster instances to become alive")
185-
for node in self.starter_instances:
186+
for node in self.get_running_starters():
186187
node.detect_instances()
187188
node.detect_instance_pids()
188189
# self.cfg.add_frontend('http', self.cfg.publicip, str(node.get_frontend_port()))
189190

190191
logging.info("instances are ready - JWT: " + self.starter_instances[0].get_jwt_header())
191192
count = 0
192-
for node in self.starter_instances:
193+
for node in self.get_running_starters():
193194
node.set_passvoid("cluster", count == 0)
194195
count += 1
196+
for node in self.get_not_running_starters():
197+
node.set_passvoid("cluster", False)
195198
self.passvoid = "cluster"
199+
self.cfg.passvoid = self.passvoid
200+
if self.new_cfg:
201+
self.new_cfg.passvoid = self.passvoid
196202

197203
def finish_setup_impl(self):
198-
self.makedata_instances = self.starter_instances[:]
204+
self.makedata_instances = self.get_running_starters()
199205
self.set_frontend_instances()
200206

201207
def _check_for_shards_in_sync(self):
@@ -491,12 +497,12 @@ def jam_attempt_impl(self):
491497
# After the jamming attempt, a peer for nodeX remains in setup.json.
492498
# This peer will break further updates because this peer is unavailable.
493499
# It is necessary to remove this peer from json for each starter instance
494-
for instance in self.starter_instances:
500+
for instance in self.get_running_starters():
495501
remove_node_x_from_json(instance.basedir)
496502

497503
def shutdown_impl(self):
498504
ret = False
499-
for node in self.starter_instances:
505+
for node in self.get_running_starters():
500506
ret = ret or node.terminate_instance()
501507
logging.info("test ended")
502508
return ret
@@ -536,3 +542,100 @@ def generate_keyfile(self, keyfile):
536542
"--host=localhost",
537543
]
538544
)
545+
546+
# pylint: disable=too-many-statements
547+
@step
548+
def test_hotbackup_impl(self):
549+
"""test hotbackup feature: Cluster"""
550+
with step("step 1: create a backup"):
551+
backup_step_1 = self.create_backup_and_upload("thy_name_is_" + self.name)
552+
553+
with step("step 2: add new db server"):
554+
old_servers = self.get_running_starters()
555+
new_starter = self.get_not_running_starters()[-1]
556+
new_starter.run_starter_and_wait()
557+
self.backup_instance_count += 1
558+
self.makedata_instances = self.get_running_starters()
559+
560+
with step("step 3: create a backup"):
561+
backup_step_3 = self.create_backup_and_upload("thy_name_is_" + self.name + "_plus1_server")
562+
563+
with step("step 4: remove old db server"):
564+
self.remove_starter_dbserver(old_servers[0])
565+
566+
with step("step 5: create another backup"):
567+
self.create_backup_and_upload("thy_name_is_" + self.name + "_plus1_server_minus1_server", False)
568+
569+
with step("step 6: create non-backup data"):
570+
self.create_non_backup_data()
571+
self.tcp_ping_all_nodes()
572+
573+
with step("step 7: download and restore backup from step 1"):
574+
self.download_backup(backup_step_1)
575+
self.validate_local_backup(backup_step_1)
576+
backups = self.list_backup()
577+
if backup_step_1 not in backups:
578+
raise Exception("downloaded backup has different name? " + str(backups))
579+
self.restore_backup(backup_step_1)
580+
self.tcp_ping_all_nodes()
581+
582+
with step("step 8: check data"):
583+
self.check_data_impl()
584+
if not self.check_non_backup_data():
585+
raise Exception("data created after backup is still there??")
586+
587+
with step("step 9: add new db server"):
588+
new_starter2 = self.get_not_running_starters()[-1]
589+
new_starter2.run_starter_and_wait()
590+
self.backup_instance_count += 1
591+
self.makedata_instances = self.get_running_starters()
592+
593+
with step("step 10: create non-backup data"):
594+
self.create_non_backup_data()
595+
self.tcp_ping_all_nodes()
596+
597+
with step("step 11: download and restore backup from step 3"):
598+
self.download_backup(backup_step_3)
599+
self.validate_local_backup(backup_step_3)
600+
backups = self.list_backup()
601+
if backup_step_3 not in backups:
602+
raise Exception("downloaded backup has different name? " + str(backups))
603+
self.restore_backup(backup_step_3)
604+
self.tcp_ping_all_nodes()
605+
606+
with step("step 12: check data"):
607+
self.check_data_impl()
608+
609+
with step("step 13: remove old db server"):
610+
self.remove_starter_dbserver(old_servers[1])
611+
612+
with step("step 14: create non-backup data"):
613+
self.create_non_backup_data()
614+
self.tcp_ping_all_nodes()
615+
616+
@step
617+
def remove_starter_dbserver(self, starter):
618+
"""remove dbserver managed by given starter from cluster"""
619+
terminated_dbserver_uuid = starter.get_dbserver().get_uuid()
620+
starter.stop_dbserver()
621+
self.remove_server_from_agency(terminated_dbserver_uuid)
622+
self.backup_instance_count -= 1
623+
self.makedata_instances = self.get_running_starters()
624+
625+
@step
626+
def test_hotbackup_after_upgrade_impl(self):
627+
"""test hotbackup after upgrade: cluster"""
628+
with step("step 1: check data"):
629+
self.check_data_impl()
630+
with step("step 2: download backup"):
631+
latest_backup = self.uploaded_backups[-1]
632+
self.download_backup(latest_backup)
633+
backups = self.list_backup()
634+
if latest_backup not in backups:
635+
raise Exception("downloaded backup has different name? " + str(backups))
636+
with step("step 3: restore backup"):
637+
self.restore_backup(latest_backup)
638+
self.tcp_ping_all_nodes()
639+
# we don't run checkdata after restore in this function, because it is run afterwards in runner.py
640+
with step("step 4: delete backups"):
641+
self.delete_all_backups()

release_tester/arangodb/starter/deployments/cluster_perf.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -101,7 +101,9 @@ def __init__(
101101
runner_type,
102102
abort_on_error,
103103
installer_set,
104-
RunnerProperties(rp, "CLUSTER", 400, 600, self.scenario.hot_backup, 6),
104+
RunnerProperties(
105+
rp, "CLUSTER", 400, 600, self.scenario.hot_backup, 6
106+
),
105107
selenium,
106108
selenium_driver_args,
107109
selenium_include_suites,

release_tester/arangodb/starter/deployments/dc2dc.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -114,7 +114,9 @@ def __init__(
114114
runner_type,
115115
abort_on_error,
116116
installer_set,
117-
RunnerProperties(rp, name, 0, 4500, True, 12),
117+
RunnerProperties(
118+
rp, name, 0, 4500, True, 12
119+
),
118120
selenium,
119121
selenium_driver_args,
120122
selenium_include_suites,

release_tester/arangodb/starter/deployments/leaderfollower.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,9 @@ def __init__(
3636
runner_type,
3737
abort_on_error,
3838
installer_set,
39-
RunnerProperties(rp, "LeaderFollower", 400, 500, False, 2),
39+
RunnerProperties(
40+
rp, "LeaderFollower", 400, 500, False, 2
41+
),
4042
selenium,
4143
selenium_driver_args,
4244
selenium_include_suites,

release_tester/arangodb/starter/deployments/none.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,9 @@ def __init__(
2424
runner_type,
2525
abort_on_error,
2626
installer_set,
27-
RunnerProperties(rp, "none", 0, 1, False, 1),
27+
RunnerProperties(
28+
rp, "none", 0, 1, False, 1
29+
),
2830
selenium,
2931
selenium_driver_args,
3032
selenium_include_suites,

0 commit comments

Comments
 (0)