diff --git a/release_tester/arangodb/sh.py b/release_tester/arangodb/sh.py
index a577f8de..9073e8f3 100644
--- a/release_tester/arangodb/sh.py
+++ b/release_tester/arangodb/sh.py
@@ -245,12 +245,14 @@ def hotbackup_create_nonbackup_data(self, suff=""):
         if (!arango.isConnected()) {{
             throw new Error('connecting the database failed');
         }}
+        print("connection is established, creating this_collection_will_not_be_backed_up{suff}");
         db._create("this_collection_will_not_be_backed_up{suff}");
+        print("now saving a document to this collection");
         db.this_collection_will_not_be_backed_up{suff}.save(
             {{"this": "document will be gone"}});
         """
         logging.debug("script to be executed: " + str(js_script_string) + str(self.connect_instance))
-        res = self.run_command(["create volatile data", js_script_string], True)  # self.cfg.verbose)
+        res = self.run_command(["create volatile data", js_script_string], True, progressive_timeout=900)  # self.cfg.verbose)
         logging.debug("data create result: " + str(res))
         if not res:
diff --git a/release_tester/arangodb/starter/deployments/activefailover.py b/release_tester/arangodb/starter/deployments/activefailover.py
index 6a99bf85..3355af25 100644
--- a/release_tester/arangodb/starter/deployments/activefailover.py
+++ b/release_tester/arangodb/starter/deployments/activefailover.py
@@ -337,8 +337,10 @@ def jam_attempt_impl(self):
         args = ["--skip", "802_"]
         self.checkdata_args = args
         ret = curr_leader.arangosh.check_test_data(
-            "checking active failover new leader node", True, args, log_debug=True
-        )
+            "checking active failover new leader node",
+            True,
+            args,
+            log_debug=True)
         if not ret[0]:
             raise Exception("check data failed " + ret[1])
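The progressive_timeout=900 added above gives the long-running arangosh script a hard deadline; run_command's actual timeout handling lives elsewhere in sh.py and is not part of this diff. As a rough illustration only, a deadline-guarded command runner could look like this (hypothetical helper, not the repo's implementation):

    import subprocess

    def run_with_deadline(cmd, progressive_timeout=900):
        """Run cmd, failing loudly if it exceeds the deadline (seconds)."""
        try:
            # capture output so the caller can log what arangosh printed
            return subprocess.run(cmd, check=True, capture_output=True, timeout=progressive_timeout)
        except subprocess.TimeoutExpired as exc:
            raise RuntimeError(f"{cmd!r} did not finish within {progressive_timeout}s") from exc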
diff --git a/release_tester/arangodb/starter/deployments/cluster.py b/release_tester/arangodb/starter/deployments/cluster.py
index c64c0dce..87d70047 100644
--- a/release_tester/arangodb/starter/deployments/cluster.py
+++ b/release_tester/arangodb/starter/deployments/cluster.py
@@ -88,6 +88,7 @@ def __init__(
         if ver_found < len(versions):
             print("One deployment doesn't support starters with more nodes!")
             self.props.cluster_nodes = 3
+        self.backup_instance_count = self.props.cluster_nodes

     def starter_prepare_env_impl(self, sm=None, more_opts=None):
         # pylint: disable=invalid-name
@@ -146,7 +147,9 @@ def add_starter(name, port, opts, sm, hasAgency):
             self.create_tls_ca_cert()
         port = 9528
         count = 0
-        for this_node in list(range(1, self.props.cluster_nodes + 1)):
+        # we need 2 additional nodes for hotbackup testing
+        full_node_count = self.props.cluster_nodes + 2 if self.hot_backup else self.props.cluster_nodes
+        for this_node in list(range(1, full_node_count + 1)):
             node = []
             node_opts.append(node)
             if this_node != 1:
@@ -158,23 +161,22 @@ def add_starter(name, port, opts, sm, hasAgency):
             add_starter(f"node{this_node}", port, node + common_opts, sm, count < 3)
             port += 100
             count += 1
-        self.backup_instance_count = count
         for instance in self.starter_instances:
             instance.is_leader = True

     def starter_run_impl(self):
         lh.subsection("instance setup")
-        for manager in self.starter_instances:
+        for manager in self.starter_instances[: self.props.cluster_nodes]:
             logging.info("Spawning instance")
             manager.run_starter()

         logging.info("waiting for the starters to become alive")
-        not_started = self.starter_instances[:]  # This is a explicit copy
+        not_running = self.get_running_starters()  # This is an explicit copy
         count = 0
-        while not_started:
-            logging.debug("waiting for mananger with logfile:" + str(not_started[-1].log_file))
-            if not_started[-1].is_instance_up():
-                not_started.pop()
+        while not_running:
+            logging.debug("waiting for manager with logfile:" + str(not_running[-1].log_file))
+            if not_running[-1].is_instance_up():
+                not_running.pop()
             progress(".")
             time.sleep(1)
             count += 1
@@ -182,20 +184,25 @@ def starter_run_impl(self):
             raise Exception("Cluster installation didn't come up in two minutes!")

         logging.info("waiting for the cluster instances to become alive")
-        for node in self.starter_instances:
+        for node in self.get_running_starters():
             node.detect_instances()
             node.detect_instance_pids()
             # self.cfg.add_frontend('http', self.cfg.publicip, str(node.get_frontend_port()))

         logging.info("instances are ready - JWT: " + self.starter_instances[0].get_jwt_header())
         count = 0
-        for node in self.starter_instances:
+        for node in self.get_running_starters():
             node.set_passvoid("cluster", count == 0)
             count += 1
+        for node in self.get_not_running_starters():
+            node.set_passvoid("cluster", False)
         self.passvoid = "cluster"
+        self.cfg.passvoid = self.passvoid
+        if self.new_cfg:
+            self.new_cfg.passvoid = self.passvoid

     def finish_setup_impl(self):
-        self.makedata_instances = self.starter_instances[:]
+        self.makedata_instances = self.get_running_starters()
         self.set_frontend_instances()

     def _check_for_shards_in_sync(self):
@@ -234,10 +241,10 @@ def upgrade_arangod_version_impl(self):
         if self.cfg.stress_upgrade:
             bench_instances.append(self.starter_instances[0].launch_arangobench("cluster_upgrade_scenario_1"))
             bench_instances.append(self.starter_instances[1].launch_arangobench("cluster_upgrade_scenario_2"))
-        for node in self.starter_instances:
+        for node in self.get_running_starters():
             node.replace_binary_for_upgrade(self.new_installer.cfg)

-        for node in self.starter_instances:
+        for node in self.get_running_starters():
             node.detect_instance_pids_still_alive()

         self.starter_instances[1].command_upgrade()
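With hot backup enabled, starter_prepare_env_impl now provisions cluster_nodes + 2 starters, but starter_run_impl launches only the regular cluster nodes; the two spares stay down until the hotbackup test scales the cluster out. The filtering relies on the is_running flag that manager.py sets in run_starter (see below). A toy illustration of the pattern (standalone sketch, not the repo's classes):

    class Node:
        """Stand-in for a StarterManager with an is_running flag."""
        def __init__(self, name):
            self.name = name
            self.is_running = False

    # provision two spares beyond the three active cluster nodes
    nodes = [Node(f"node{i}") for i in range(1, 6)]
    for node in nodes[:3]:  # only the regular cluster nodes get started
        node.is_running = True

    running = [n for n in nodes if n.is_running]       # like get_running_starters()
    spares = [n for n in nodes if not n.is_running]    # like get_not_running_starters()
    assert [n.name for n in spares] == ["node4", "node5"]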
@@ -491,12 +498,12 @@ def jam_attempt_impl(self):
         # After attempt of jamming, we have peer for nodeX in setup.json.
         # This peer will brake further updates because this peer is unavailable.
         # It is necessary to remove this peer from json for each starter instance
-        for instance in self.starter_instances:
+        for instance in self.get_running_starters():
             remove_node_x_from_json(instance.basedir)

     def shutdown_impl(self):
         ret = False
-        for node in self.starter_instances:
+        for node in self.get_running_starters():
             ret = ret or node.terminate_instance()
         logging.info("test ended")
         return ret
@@ -536,3 +543,103 @@ def generate_keyfile(self, keyfile):
                 "--host=localhost",
             ]
         )
+
+    # pylint: disable=too-many-statements
+    @step
+    def test_hotbackup_impl(self):
+        """test hotbackup feature: Cluster"""
+        with step("step 1: create a backup"):
+            backup_step_1 = self.create_backup_and_upload("thy_name_is_" + self.name)
+
+        with step("step 2: add new db server"):
+            old_servers = self.get_running_starters()
+            new_starter = self.get_not_running_starters()[-1]
+            new_starter.run_starter_and_wait()
+            self.backup_instance_count += 1
+            self.makedata_instances = self.get_running_starters()
+
+        with step("step 3: create a backup"):
+            backup_step_3 = self.create_backup_and_upload("thy_name_is_" + self.name + "_plus1_server")
+
+        with step("step 4: remove old db server"):
+            self.remove_starter_dbserver(old_servers[0])
+
+        with step("step 5: create another backup"):
+            self.create_backup_and_upload("thy_name_is_" + self.name + "_plus1_server_minus1_server", False)
+
+        with step("step 6: create non-backup data"):
+            self._check_for_shards_in_sync()
+            self.create_non_backup_data()
+            self.tcp_ping_all_nodes()
+
+        with step("step 7: download and restore backup from step 1"):
+            self.download_backup(backup_step_1)
+            self.validate_local_backup(backup_step_1)
+            backups = self.list_backup()
+            if backup_step_1 not in backups:
+                raise Exception("downloaded backup has different name? " + str(backups))
+            self.restore_backup(backup_step_1)
+            self.tcp_ping_all_nodes()
+
+        with step("step 8: check data"):
+            self.check_data_impl()
+            if not self.check_non_backup_data():
+                raise Exception("data created after backup is still there??")
+
+        with step("step 9: add new db server"):
+            new_starter2 = self.get_not_running_starters()[-1]
+            new_starter2.run_starter_and_wait()
+            self.backup_instance_count += 1
+            self.makedata_instances = self.get_running_starters()
+
+        with step("step 10: create non-backup data"):
+            self.create_non_backup_data()
+            self.tcp_ping_all_nodes()
+
+        with step("step 11: download and restore backup from step 3"):
+            self.download_backup(backup_step_3)
+            self.validate_local_backup(backup_step_3)
+            backups = self.list_backup()
+            if backup_step_3 not in backups:
+                raise Exception("downloaded backup has different name? " + str(backups))
+            self.restore_backup(backup_step_3)
+            self.tcp_ping_all_nodes()
+
+        with step("step 12: check data"):
+            self.check_data_impl()
+
+        with step("step 13: remove old db server"):
+            self.remove_starter_dbserver(old_servers[1])
+
+        with step("step 14: create non-backup data"):
+            self._check_for_shards_in_sync()
+            self.create_non_backup_data()
+            self.tcp_ping_all_nodes()
+
+    @step
+    def remove_starter_dbserver(self, starter):
+        """remove dbserver managed by given starter from cluster"""
+        print("removing starter " + repr(starter))
+        terminated_dbserver_uuid = starter.get_dbserver().get_uuid()
+        starter.stop_dbserver()
+        self.remove_server_from_agency(terminated_dbserver_uuid)
+        self.backup_instance_count -= 1
+        self.makedata_instances = self.get_running_starters()
+
+    @step
+    def test_hotbackup_after_upgrade_impl(self):
+        """test hotbackup after upgrade: cluster"""
+        with step("step 1: check data"):
+            self.check_data_impl()
+        with step("step 2: download backup"):
+            latest_backup = self.uploaded_backups[-1]
+            self.download_backup(latest_backup)
+            backups = self.list_backup()
+            if latest_backup not in backups:
+                raise Exception("downloaded backup has different name? " + str(backups))
+        with step("step 3: restore backup"):
+            self.restore_backup(latest_backup)
+            self.tcp_ping_all_nodes()
+            # we don't run checkdata after restore in this function, because it is run afterwards in runner.py
+        with step("step 4: delete backups"):
+            self.delete_all_backups()
" + str(backups)) + self.restore_backup(backup_step_3) + self.tcp_ping_all_nodes() + + with step("step 12: check data"): + self.check_data_impl() + + with step("step 13: remove old db server"): + self.remove_starter_dbserver(old_servers[1]) + + with step("step 14: create non-backup data"): + self._check_for_shards_in_sync() + self.create_non_backup_data() + self.tcp_ping_all_nodes() + + @step + def remove_starter_dbserver(self, starter): + """remove dbserver managed by given starter from cluster""" + print("removing starter " + repr(starter)) + terminated_dbserver_uuid = starter.get_dbserver().get_uuid() + starter.stop_dbserver() + self.remove_server_from_agency(terminated_dbserver_uuid) + self.backup_instance_count -= 1 + self.makedata_instances = self.get_running_starters() + + @step + def test_hotbackup_after_upgrade_impl(self): + """test hotbackup after upgrade: cluster""" + with step("step 1: check data"): + self.check_data_impl() + with step("step 2: download backup"): + latest_backup = self.uploaded_backups[-1] + self.download_backup(latest_backup) + backups = self.list_backup() + if latest_backup not in backups: + raise Exception("downloaded backup has different name? " + str(backups)) + with step("step 3: restore backup"): + self.restore_backup(latest_backup) + self.tcp_ping_all_nodes() + # we don't run checkdata after restore in this function, because it is ran afterwards by in runner.py + with step("step 4: delete backups"): + self.delete_all_backups() diff --git a/release_tester/arangodb/starter/deployments/cluster_perf.py b/release_tester/arangodb/starter/deployments/cluster_perf.py index 451c588e..099a3365 100644 --- a/release_tester/arangodb/starter/deployments/cluster_perf.py +++ b/release_tester/arangodb/starter/deployments/cluster_perf.py @@ -101,7 +101,9 @@ def __init__( runner_type, abort_on_error, installer_set, - RunnerProperties(rp, "CLUSTER", 400, 600, self.scenario.hot_backup, 6), + RunnerProperties( + rp, "CLUSTER", 400, 600, self.scenario.hot_backup, 6 + ), selenium, selenium_driver_args, selenium_include_suites, diff --git a/release_tester/arangodb/starter/deployments/dc2dc.py b/release_tester/arangodb/starter/deployments/dc2dc.py index a2adf394..b209134a 100644 --- a/release_tester/arangodb/starter/deployments/dc2dc.py +++ b/release_tester/arangodb/starter/deployments/dc2dc.py @@ -114,7 +114,9 @@ def __init__( runner_type, abort_on_error, installer_set, - RunnerProperties(rp, name, 0, 4500, True, 12), + RunnerProperties( + rp, name, 0, 4500, True, 12 + ), selenium, selenium_driver_args, selenium_include_suites, diff --git a/release_tester/arangodb/starter/deployments/leaderfollower.py b/release_tester/arangodb/starter/deployments/leaderfollower.py index 3cf9ad75..a35fc1a1 100644 --- a/release_tester/arangodb/starter/deployments/leaderfollower.py +++ b/release_tester/arangodb/starter/deployments/leaderfollower.py @@ -36,7 +36,9 @@ def __init__( runner_type, abort_on_error, installer_set, - RunnerProperties(rp, "LeaderFollower", 400, 500, False, 2), + RunnerProperties( + rp, "LeaderFollower", 400, 500, False, 2 + ), selenium, selenium_driver_args, selenium_include_suites, diff --git a/release_tester/arangodb/starter/deployments/none.py b/release_tester/arangodb/starter/deployments/none.py index 83721f06..35536fe5 100644 --- a/release_tester/arangodb/starter/deployments/none.py +++ b/release_tester/arangodb/starter/deployments/none.py @@ -24,7 +24,9 @@ def __init__( runner_type, abort_on_error, installer_set, - RunnerProperties(rp, "none", 0, 1, False, 1), + 
diff --git a/release_tester/arangodb/starter/deployments/runner.py b/release_tester/arangodb/starter/deployments/runner.py
index 5faa7b9e..e8af43f0 100644
--- a/release_tester/arangodb/starter/deployments/runner.py
+++ b/release_tester/arangodb/starter/deployments/runner.py
@@ -111,6 +111,7 @@ def __init__(
         self.old_installer = old_inst
         self.new_installer = new_inst
         self.backup_name = None
+        self.uploaded_backups = []
         self.hot_backup = (
             cfg.hot_backup_supported and properties.supports_hotbackup and self.old_installer.supports_hot_backup()
         )
@@ -275,34 +276,7 @@ def run(self):
                 "{0}{1} Deployment started. Please test the UI!".format((self.versionstr), str(self.name)),
             )
         if self.hot_backup:
-            self.progress(False, "TESTING HOTBACKUP")
-            self.backup_name = self.create_backup("thy_name_is_" + self.name)
-            self.validate_local_backup(self.backup_name)
-            self.tcp_ping_all_nodes()
-            self.create_non_backup_data()
-            taken_backups = self.list_backup()
-            backup_no = len(taken_backups) - 1
-            self.upload_backup(taken_backups[backup_no])
-            self.tcp_ping_all_nodes()
-            self.delete_backup(taken_backups[backup_no])
-            self.tcp_ping_all_nodes()
-            backups = self.list_backup()
-            if len(backups) != len(taken_backups) - 1:
-                raise Exception("expected backup to be gone, " "but its still there: " + str(backups))
-            self.download_backup(self.backup_name)
-            self.validate_local_backup(self.backup_name)
-            self.tcp_ping_all_nodes()
-            backups = self.list_backup()
-            if backups[len(backups) - 1] != self.backup_name:
-                raise Exception("downloaded backup has different name? " + str(backups))
-            self.before_backup()
-            self.restore_backup(backups[len(backups) - 1])
-            self.tcp_ping_all_nodes()
-            self.after_backup()
-            time.sleep(20)  # TODO fix
-            self.check_data_impl()
-            if not self.check_non_backup_data():
-                raise Exception("data created after backup is still there??")
+            self.test_hotbackup()
         if self.dump_restore:
             self.dump_everything("dump_this_" + self.name)
             print(self.backup_name)
@@ -311,8 +285,6 @@ def run(self):
         self.check_data_impl()

         if self.new_installer:
-            if self.hot_backup:
-                self.create_non_backup_data()
             self.versionstr = "NEW[" + self.new_cfg.version + "] "

             self.upgrade_counter += 1
@@ -341,34 +313,7 @@ def run(self):
             if self.is_minor_upgrade() and self.new_installer.supports_backup():
                 self.new_installer.check_backup_is_created()
             if self.hot_backup:
-                self.check_data_impl()
-                self.progress(False, "TESTING HOTBACKUP AFTER UPGRADE")
-                backups = self.list_backup()
-                self.upload_backup(backups[0])
-                self.tcp_ping_all_nodes()
-                self.delete_backup(backups[0])
-                self.tcp_ping_all_nodes()
-                backups = self.list_backup()
-                if len(backups) != 0:
-                    raise Exception("expected backup to be gone, " "but its still there: " + str(backups))
-                self.download_backup(self.backup_name)
-                self.validate_local_backup(self.backup_name)
-                self.tcp_ping_all_nodes()
-                backups = self.list_backup()
-                if backups[0] != self.backup_name:
-                    raise Exception("downloaded backup has different name? " + str(backups))
-                time.sleep(20)  # TODO fix
-                self.before_backup()
-                self.restore_backup(backups[0])
-                self.tcp_ping_all_nodes()
-                self.after_backup()
-                if not self.check_non_backup_data():
-                    raise Exception("data created after backup is still there??")
-                self.delete_backup(backups[0])
-                self.tcp_ping_all_nodes()
-                backups = self.list_backup()
-                if len(backups) != 0:
-                    raise Exception("expected backup to be gone, " "but its still there: " + str(backups))
+                self.test_hotbackup_after_upgrade()
         if self.dump_restore:
             print(self.backup_name)
             self.restore_everything(self.backup_name)
@@ -389,7 +334,7 @@ def run(self):
             self.check_data_impl()
         if not is_keep_db_dir:
             self.starter_shutdown()
-            for starter in self.starter_instances:
+            for starter in self.get_running_starters():
                 starter.detect_fatal_errors()
         if is_uninstall_now:
             self.uninstall(self.old_installer if not self.new_installer else self.new_installer)
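The @step decorator and the with step("..."): blocks used throughout this patch match allure-python's step API; assuming that is the library imported here (the import is not shown in this diff), both spellings hang off the same object:

    import allure

    @allure.step("create a backup")          # decorator form, as on test_hotbackup_impl
    def create_backup(name):
        return name

    def test_flow():
        with allure.step("step 1: create a backup"):   # context-manager form
            create_backup("thy_name_is_cluster")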
" + str(backups)) - time.sleep(20) # TODO fix - self.before_backup() - self.restore_backup(backups[0]) - self.tcp_ping_all_nodes() - self.after_backup() - if not self.check_non_backup_data(): - raise Exception("data created after backup is still there??") - self.delete_backup(backups[0]) - self.tcp_ping_all_nodes() - backups = self.list_backup() - if len(backups) != 0: - raise Exception("expected backup to be gone, " "but its still there: " + str(backups)) + self.test_hotbackup_after_upgrade() if self.dump_restore: print(self.backup_name) self.restore_everything(self.backup_name) @@ -389,7 +334,7 @@ def run(self): self.check_data_impl() if not is_keep_db_dir: self.starter_shutdown() - for starter in self.starter_instances: + for starter in self.get_running_starters(): starter.detect_fatal_errors() if is_uninstall_now: self.uninstall(self.old_installer if not self.new_installer else self.new_installer) @@ -411,6 +356,79 @@ def run(self): self.progress(False, "Runner of type {0} - Finished!".format(str(self.name))) + def test_hotbackup(self): + """test hotbackup""" + self.progress(False, "TESTING HOTBACKUP") + self.test_hotbackup_impl() + + def test_hotbackup_after_upgrade(self): + """test hotbackup after upgrade""" + self.progress(False, "TESTING HOTBACKUP AFTER UPGRADE") + self.test_hotbackup_after_upgrade_impl() + + @step + def test_hotbackup_impl(self): + """test hotbackup feature: general implementation""" + self.backup_name = self.create_backup("thy_name_is_" + self.name) + self.validate_local_backup(self.backup_name) + self.tcp_ping_all_nodes() + self.create_non_backup_data() + taken_backups = self.list_backup() + backup_no = len(taken_backups) - 1 + self.upload_backup(taken_backups[backup_no]) + self.tcp_ping_all_nodes() + self.delete_backup(taken_backups[backup_no]) + self.tcp_ping_all_nodes() + backups = self.list_backup() + if len(backups) != len(taken_backups) - 1: + raise Exception("expected backup to be gone, " "but its still there: " + str(backups)) + self.download_backup(self.backup_name) + self.validate_local_backup(self.backup_name) + self.tcp_ping_all_nodes() + backups = self.list_backup() + if backups[len(backups) - 1] != self.backup_name: + raise Exception("downloaded backup has different name? " + str(backups)) + self.before_backup() + self.restore_backup(backups[len(backups) - 1]) + self.tcp_ping_all_nodes() + self.after_backup() + time.sleep(20) # TODO fix + self.check_data_impl() + if not self.check_non_backup_data(): + raise Exception("data created after backup is still there??") + self.create_non_backup_data() + + @step + def test_hotbackup_after_upgrade_impl(self): + """test hotbackup after upgrade: general""" + self.check_data_impl() + backups = self.list_backup() + self.upload_backup(backups[0]) + self.tcp_ping_all_nodes() + self.delete_backup(backups[0]) + self.tcp_ping_all_nodes() + backups = self.list_backup() + if len(backups) != 0: + raise Exception("expected backup to be gone, " "but its still there: " + str(backups)) + self.download_backup(self.backup_name) + self.validate_local_backup(self.backup_name) + self.tcp_ping_all_nodes() + backups = self.list_backup() + if backups[0] != self.backup_name: + raise Exception("downloaded backup has different name? 
" + str(backups)) + time.sleep(20) # TODO fix + self.before_backup() + self.restore_backup(backups[0]) + self.tcp_ping_all_nodes() + self.after_backup() + if not self.check_non_backup_data(): + raise Exception("data created after backup is still there??") + self.delete_backup(backups[0]) + self.tcp_ping_all_nodes() + backups = self.list_backup() + if len(backups) != 0: + raise Exception("expected backup to be gone, " "but its still there: " + str(backups)) + def run_selenium(self): """fake to run the full lifecycle flow of this deployment""" @@ -424,12 +442,12 @@ def run_selenium(self): ) self.starter_prepare_env() self.finish_setup() # create the instances... - for starter in self.starter_instances: + for starter in self.get_running_starters(): # attach the PID of the starter instance: starter.attach_running_starter() # find out about its processes: starter.detect_instances() - print(self.starter_instances) + print(self.get_running_starters()) self.selenium.test_after_install() if self.new_installer: self.versionstr = "NEW[" + self.new_cfg.version + "] " @@ -499,7 +517,6 @@ def install(self, inst): sys_arangosh.js_version_check() # TODO: here we should invoke Makedata for the system installation. self.progress(True, "stop system service to make ports available for starter") - inst.stop_service() def get_selenium_status(self): @@ -649,7 +666,7 @@ def set_frontend_instances(self): def get_frontend_instances(self): """fetch all frontend instances""" frontends = [] - for starter in self.starter_instances: + for starter in self.get_running_starters(): if not starter.is_leader: continue for frontend in starter.get_frontends(): @@ -659,7 +676,7 @@ def get_frontend_instances(self): def get_frontend_starters(self): """fetch all frontend instances""" frontends = [] - for starter in self.starter_instances: + for starter in self.get_running_starters(): if not starter.is_leader: continue if len(starter.get_frontends()) > 0: @@ -669,7 +686,7 @@ def get_frontend_starters(self): @step def tcp_ping_all_nodes(self): """check whether all nodes react via tcp connection""" - for starter in self.starter_instances: + for starter in self.get_running_starters(): starter.tcp_ping_nodes() @step @@ -686,7 +703,7 @@ def print_frontend_instances(self): def print_all_instances_table(self): """print all http frontends to the user""" instances = [] - for starter in self.starter_instances: + for starter in self.get_running_starters(): instances += starter.get_instance_essentials() print_instances_table(instances) @@ -800,27 +817,28 @@ def dump_everything(self, name): assert starter.arango_dump, "dump everything: this starter doesn't have an dump instance!" 
@@ -846,9 +864,7 @@ def wait_for_self_heal(self, starter):
                 }
                 throw new Error("foxx routeing not ready on time!");
             };
             waitForSelfHeal();
-            """,
-            )
-        )
+            """))

     def restore_everything_from_dump(self, starter, path):
         """ do a full restore from a dump """
@@ -967,6 +983,12 @@ def delete_backup(self, name):
                 return starter.hb_instance.delete(name)
         raise Exception("no frontend found.")

+    @step
+    def delete_all_backups(self):
+        """delete all locally-stored backups"""
+        for backup in self.list_backup():
+            self.delete_backup(backup)
+
     def wait_for_restore_impl(self, backup_starter):
         """wait for all restores to be finished"""
         if self.hot_backup:
@@ -992,7 +1014,9 @@ def upload_backup(self, name, timeout=1200):
                 continue
             assert starter.hb_instance, "upload backup: this starter doesn't have an hb instance!"
             hb_id = starter.hb_instance.upload(name, starter.hb_config, "12345")
-            return starter.hb_instance.upload_status(name, hb_id, self.backup_instance_count, timeout=timeout)
+            starter.hb_instance.upload_status(name, hb_id, self.backup_instance_count, timeout=timeout)
+            self.uploaded_backups.append(name)
+            return
         raise Exception("no frontend found.")

     @step
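upload_backup now records every successfully uploaded name in self.uploaded_backups (initialised in __init__ above) instead of returning the upload status, which is what lets the cluster runner's test_hotbackup_after_upgrade_impl pick uploaded_backups[-1] after the local copies have been deleted. The contract in miniature (toy sketch, not the repo's classes):

    class BackupBook:
        def __init__(self):
            self.uploaded_backups = []

        def upload_backup(self, name):
            # ...perform the remote upload, then remember the name...
            self.uploaded_backups.append(name)

    book = BackupBook()
    book.upload_backup("thy_name_is_CLUSTER")
    book.upload_backup("thy_name_is_CLUSTER_plus1_server")
    assert book.uploaded_backups[-1] == "thy_name_is_CLUSTER_plus1_server"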
@@ -1009,10 +1033,29 @@ def download_backup(self, name, timeout=1200):

     def validate_local_backup(self, name):
         """validate the local backup"""
-        for starter in self.starter_instances:
+        for starter in self.get_running_starters():
             assert starter.hb_instance, "download backup: this starter doesn't have an hb instance!"
             starter.hb_instance.validate_local_backup(starter.basedir, name)

+    @step
+    def create_backup_and_upload(self, backup_name, delete_local=True):
+        """create a hotbackup, then upload and delete it"""
+        backup_name = self.create_backup(backup_name)
+        self.backup_name = backup_name
+        self.validate_local_backup(self.backup_name)
+        self.tcp_ping_all_nodes()
+        taken_backups = self.list_backup()
+        backup_no = len(taken_backups) - 1
+        self.upload_backup(taken_backups[backup_no])
+        self.tcp_ping_all_nodes()
+        if delete_local:
+            self.delete_backup(taken_backups[backup_no])
+            self.tcp_ping_all_nodes()
+            backups = self.list_backup()
+            if len(backups) != len(taken_backups) - 1:
+                raise Exception("expected backup to be gone, but it's still there: " + str(backups))
+        return backup_name
+
     @step
     def reload_routing(self):
         """reload the routing"""
@@ -1030,7 +1073,7 @@ def reload_routing(self):
     def search_for_warnings(self, print_lines=True):
         """search for any warnings in any logfiles and dump them to the screen"""
         ret = False
-        for starter in self.starter_instances:
+        for starter in self.get_running_starters():
             print("Ww" * 40)
             starter.search_for_warnings()
             for instance in starter.all_instances:
@@ -1072,7 +1115,7 @@ def zip_test_dir(self):
         if self.cfg.base_test_dir.exists():
             print("zipping test dir")
             if self.hot_backup:
-                for starter in self.starter_instances:
+                for starter in self.get_running_starters():
                     starter.cleanup_hotbackup_in_instance()
             # we just assume that we might have the "remote" directory in this subdir:
             backup_dir = self.basedir / "backup"
@@ -1116,7 +1159,7 @@ def cleanup(self, reset_tmp=True):

     def _set_logging(self, instance_type):
         """turn on logging for all of instance_type"""
-        for starter_mgr in self.starter_instances:
+        for starter_mgr in self.get_running_starters():
             starter_mgr.send_request(
                 instance_type,
                 requests.put,
@@ -1142,7 +1185,9 @@ def coordinator_set_debug_logging(self):
     @step
     def get_collection_list(self):
         """get a list of collections and their shards"""
-        reply = self.starter_instances[0].send_request(InstanceType.COORDINATOR, requests.get, "/_api/collection", None)
+        reply = self.get_running_starters()[0].send_request(
+            InstanceType.COORDINATOR, requests.get, "/_api/collection", None
+        )
         if reply[0].status_code != 200:
             raise Exception(
                 "get Collections: Unsupported return code" + str(reply[0].status_code) + " - " + str(reply[0].body)
@@ -1159,7 +1204,7 @@ def get_collection_list(self):

     def get_collection_cluster_details(self, collection_name):
         """get the shard details for a single collection"""
-        reply = self.starter_instances[0].send_request(
+        reply = self.get_running_starters()[0].send_request(
             InstanceType.COORDINATOR,
             requests.put,
             "/_db/_system/_admin/cluster/collectionShardDistribution",
@@ -1255,7 +1300,7 @@ def set_selenium_instances(self):
     def export_instance_info(self):
         """resemble the testing.js INSTANCEINFO env"""
         starter_structs = []
-        for starter in self.starter_instances:
+        for starter in self.get_running_starters():
             starter_structs.append(starter.get_structure())
         struct = starter_structs[0]
         for starter in starter_structs[1:]:
@@ -1270,7 +1315,7 @@ def remove_server_from_agency(self, server_uuid, deadline=150):
         body = '{"server": "%s"}' % server_uuid
         deadline = datetime.now() + timedelta(seconds=deadline)
         while datetime.now() < deadline:
-            reply = self.starter_instances[0].send_request(
+            reply = self.get_running_starters()[0].send_request(
                 InstanceType.COORDINATOR,
                 requests.post,
                 "/_admin/cluster/removeServer",
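remove_server_from_agency posts to ArangoDB's /_admin/cluster/removeServer endpoint on a coordinator, retrying until the removal is accepted or the deadline passes; the cluster hotbackup test calls it right after stopping a DB server. Stand-alone, the same call looks roughly like this (sketch: authentication and the exact success codes are simplified):

    import time
    from datetime import datetime, timedelta

    import requests

    def remove_server(coordinator_url, server_uuid, auth, deadline_seconds=150):
        """Ask the cluster to drop a stopped DB server, retrying until accepted."""
        deadline = datetime.now() + timedelta(seconds=deadline_seconds)
        while datetime.now() < deadline:
            reply = requests.post(
                coordinator_url + "/_admin/cluster/removeServer",
                json={"server": server_uuid},  # same body the method above builds
                auth=auth,
            )
            if reply.status_code == 200:
                return
            time.sleep(5)
        raise TimeoutError("server " + server_uuid + " was not removed in time")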
@@ -1289,3 +1334,11 @@ def remove_server_from_agency(self, server_uuid, deadline=150):
     def makedata_databases(self):
         """return a list of databases that makedata tests must be ran in"""
         return [["_system", self.props.force_one_shard, 0]] + self.custom_databases.copy()
+
+    def get_running_starters(self):
+        """get list of running starters"""
+        return [starter for starter in self.starter_instances if starter.is_running]
+
+    def get_not_running_starters(self):
+        """get list of not running starters"""
+        return [starter for starter in self.starter_instances if not starter.is_running]
diff --git a/release_tester/arangodb/starter/deployments/single.py b/release_tester/arangodb/starter/deployments/single.py
index 5087b89d..fb8dc515 100644
--- a/release_tester/arangodb/starter/deployments/single.py
+++ b/release_tester/arangodb/starter/deployments/single.py
@@ -36,7 +36,9 @@ def __init__(
             runner_type,
             abort_on_error,
             installer_set,
-            RunnerProperties(rp, "Single", 400, 500, True, 1),
+            RunnerProperties(
+                rp, "Single", 400, 500, True, 1
+            ),
             selenium,
             selenium_driver_args,
             selenium_include_suites,
diff --git a/release_tester/arangodb/starter/manager.py b/release_tester/arangodb/starter/manager.py
index 237fcde9..2242676e 100644
--- a/release_tester/arangodb/starter/manager.py
+++ b/release_tester/arangodb/starter/manager.py
@@ -45,6 +45,7 @@
 IS_WINDOWS = sys.platform == "win32"

+DEFAULT_ENCRYPTION_AT_REST_KEY = "defaultencatrestkey_32chars_xxxx"

 # pylint: disable=too-many-lines disable=logging-fstring-interpolation
 class StarterManager:
@@ -122,7 +123,7 @@ def __init__(
         if self.cfg.encryption_at_rest:
             self.keyfile = self.basedir / "key.txt"
-            # generate pseudo random key of length 32:
-            self.keyfile.write_text((str(datetime.datetime.now()) * 5)[0:32])
+            self.keyfile.write_text(DEFAULT_ENCRYPTION_AT_REST_KEY)
             self.moreopts += ["--rocksdb.encryption-keyfile", str(self.keyfile)]
         self.hb_instance = None
         self.hb_config = None
@@ -312,25 +313,26 @@ def get_sync_masters(self):
     def get_frontend(self):
         """get the first frontendhost of this starter"""
         servers = self.get_frontends()
-        assert servers, "starter: don't have instances!"
+        print(repr(self))
+        assert servers, "starter: don't have instances!" + repr(self)
         return servers[0]

     def get_dbserver(self):
         """get the first dbserver of this starter"""
         servers = self.get_dbservers()
-        assert servers, "starter: don't have instances!"
+        assert servers, "starter: don't have instances!" + repr(self)
         return servers[0]

     def get_agent(self):
         """get the first agent of this starter"""
         servers = self.get_agents()
-        assert servers, "starter: have no instances!"
+        assert servers, "starter: have no instances!" + repr(self)
         return servers[0]

     def get_sync_master(self):
         """get the first arangosync master of this starter"""
         servers = self.get_sync_masters()
-        assert servers, "starter: don't have instances!"
+        assert servers, "starter: don't have instances!" + repr(self)
         return servers[0]

     def have_this_instance(self, instance):
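The fixed DEFAULT_ENCRYPTION_AT_REST_KEY makes encrypted test runs reproducible; the replaced code derived a pseudo-random 32-character key from the current timestamp. If randomness were ever wanted again, the standard library produces a proper 32-character key in one line (alternative sketch, not what this patch does):

    import secrets

    # 16 random bytes, hex-encoded: a 32-character key like the one written to key.txt
    keyfile_content = secrets.token_hex(16)
    assert len(keyfile_content) == 32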
@@ -380,6 +382,22 @@ def run_starter(self, expect_to_fail=False):
         if not expect_to_fail:
             self.wait_for_logfile()
             self.wait_for_port_bind()
+        self.is_running = True
+
+    @step
+    def run_starter_and_wait(self):
+        """launch the starter and wait for all arangod instances to come up"""
+        self.run_starter()
+        count = 0
+        while not self.is_instance_up():
+            logging.debug("waiting for manager with logfile:" + str(self.log_file))
+            progress(".")
+            time.sleep(1)
+            count += 1
+            if count > 120:
+                raise Exception("Starter manager installation didn't come up in two minutes!")
+        self.detect_instances()
+        self.detect_instance_pids()

     @step
     def attach_running_starter(self):
@@ -449,7 +467,8 @@ def set_passvoid(self, passvoid, write_to_server=True):
             self.arangosh.js_set_passvoid("root", passvoid)
             self.passvoidfile.write_text(passvoid, encoding="utf-8")
         else:
-            self.arangosh.cfg.passvoid = passvoid
+            if self.arangosh:
+                self.arangosh.cfg.passvoid = passvoid
             self.passvoidfile.write_text(passvoid, encoding="utf-8")
         self.passvoid = passvoid
         for i in self.all_instances:
@@ -683,7 +702,7 @@ def replace_binary_for_upgrade(self, new_install_cfg, relaunch=True):
         self.replace_binary_setup_for_upgrade(new_install_cfg)
         with step("kill the starter processes of the old version"):
             if self.instance is None:
-                logging.error("StarterManager: don't have an instance!!")
+                logging.error("StarterManager: don't have an instance!!" + repr(self))
             else:
                 logging.info("StarterManager: Killing my instance [%s]", str(self.instance.pid))
                 self.kill_instance()
@@ -978,14 +997,14 @@ def get_log_file(self):
     def read_db_logfile(self):
         """get the logfile of the dbserver instance"""
         server = self.get_dbserver()
-        assert server.logfile.exists(), "don't have logfile?"
+        assert server.logfile.exists(), "don't have logfile?" + repr(self)
         return server.logfile.read_text(errors="backslashreplace")

     @step
     def read_agent_logfile(self):
         """get the agent logfile of this instance"""
         server = self.get_agent()
-        assert server.logfile.exists(), "don't have logfile?"
+        assert server.logfile.exists(), "don't have logfile?" + repr(self)
         return server.logfile.read_text(errors="backslashreplace")

     @step
@@ -1296,6 +1315,15 @@ def count_occurances_in_starter_log(self, substring: str):
         number_of_occurances = self.get_log_file().count(substring)
         return number_of_occurances

+    def stop_dbserver(self):
+        """stop db server managed by this starter"""
+        dbserver = self.get_dbserver()
+        self.kill_instance()
+        dbserver.terminate_instance()
+        self.all_instances.remove(dbserver)
+        self.moreopts.append("--cluster.start-dbserver=false")
+        self.run_starter()
+

 class StarterNonManager(StarterManager):
     """this class is a dummy starter manager to work with similar interface"""
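run_starter_and_wait repeats the poll-until-up loop that cluster.py's starter_run_impl also carries; if this duplication grows, a small shared helper would keep the two-minute deadline in one place (sketch with assumed names, not part of this patch):

    import time

    def wait_until(predicate, deadline_seconds=120, interval=1.0):
        """Poll predicate() until it returns True or the deadline expires."""
        waited = 0.0
        while not predicate():
            time.sleep(interval)
            waited += interval
            if waited > deadline_seconds:
                raise TimeoutError(f"condition not met within {deadline_seconds}s")

    # usage: wait_until(starter.is_instance_up) instead of the hand-rolled count loop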