From feeac83ee0436d3f885a9433f4c015c5938d5271 Mon Sep 17 00:00:00 2001 From: Hao Date: Tue, 1 Aug 2023 20:23:15 +0800 Subject: [PATCH 01/23] =?UTF-8?q?gs=5Fddr=E5=B7=A5=E5=85=B7=E4=BB=A3?= =?UTF-8?q?=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- script/gs_ddr | 96 + .../impl/dorado_disaster_recovery/__init__.py | 0 .../impl/dorado_disaster_recovery/ddr_base.py | 2504 +++++++++++++++++ .../dorado_disaster_recovery/ddr_constants.py | 91 + .../ddr_modules/__init__.py | 0 .../dorado_diaster_recovery_start.py | 246 ++ .../dorado_disaster_recovery_failover.py | 70 + .../dorado_disaster_recovery_query.py | 168 ++ .../dorado_disaster_recovery_stop.py | 105 + .../dorado_disaster_recovery_switchover.py | 476 ++++ .../params_handler.py | 346 +++ 11 files changed, 4102 insertions(+) create mode 100644 script/gs_ddr create mode 100644 script/impl/dorado_disaster_recovery/__init__.py create mode 100644 script/impl/dorado_disaster_recovery/ddr_base.py create mode 100644 script/impl/dorado_disaster_recovery/ddr_constants.py create mode 100644 script/impl/dorado_disaster_recovery/ddr_modules/__init__.py create mode 100644 script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py create mode 100644 script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_failover.py create mode 100644 script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_query.py create mode 100644 script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_stop.py create mode 100644 script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_switchover.py create mode 100644 script/impl/dorado_disaster_recovery/params_handler.py diff --git a/script/gs_ddr b/script/gs_ddr new file mode 100644 index 00000000..f7c3793f --- /dev/null +++ b/script/gs_ddr @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +############################################################################# +# Copyright (c) 2020 Huawei Technologies Co.,Ltd. +# +# openGauss is licensed under Mulan PSL v2. +# You can use this software according to the terms +# and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# +# http://license.coscl.org.cn/MulanPSL2 +# +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, +# WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. +# ---------------------------------------------------------------------------- +# Description : gs_ddr is a utility for dorado +# disaster recovery fully options. 
+############################################################################# + +import os +import uuid + +from gspylib.common.Common import DefaultValue +from gspylib.common.ErrorCode import ErrorCode +from gspylib.common.GaussLog import GaussLog +from impl.dorado_disaster_recovery.ddr_constants import DoradoDisasterRecoveryConstants +from base_utils.os.user_util import UserUtil +from domain_utils.cluster_file.cluster_log import ClusterLog +from impl.dorado_disaster_recovery.params_handler import ParamsHandler +from impl.dorado_disaster_recovery.ddr_modules.\ + dorado_diaster_recovery_start import DisasterRecoveryStartHandler +from impl.dorado_disaster_recovery.ddr_modules.\ + dorado_disaster_recovery_stop import DisasterRecoveryStopHandler +from impl.dorado_disaster_recovery.ddr_modules.\ + dorado_disaster_recovery_failover import DisasterRecoveryFailoverHandler +from impl.dorado_disaster_recovery.ddr_modules.\ + dorado_disaster_recovery_switchover import DisasterRecoverySwitchoverHandler +from impl.dorado_disaster_recovery.ddr_modules.\ + dorado_disaster_recovery_query import StreamingQueryHandler + +HANDLER_MAPPING = { + "start": DisasterRecoveryStartHandler, + "stop": DisasterRecoveryStopHandler, + "switchover": DisasterRecoverySwitchoverHandler, + "failover": DisasterRecoveryFailoverHandler, + #"query": StreamingQueryHandler +} + + +class DoradoStorageDisasterRecoveryBase(object): + def __init__(self): + self.params = None + self.user = None + self.log_file = None + self.logger = None + self.trace_id = uuid.uuid1().hex + self.dorado_info = None + DoradoStorageDisasterRecoveryBase.mock_process_user_sensitive_info() + self.__init_globals() + + @staticmethod + def mock_process_user_sensitive_info(): + """mock_process_user_sensitive_info""" + cmdline = DefaultValue.get_proc_title("-W") + DefaultValue.set_proc_title(cmdline) + + def __init_globals(self): + self.user = UserUtil.getUserInfo()['name'] + tmp_logger_file = ClusterLog.getOMLogPath(DoradoDisasterRecoveryConstants.STREAMING_LOG_FILE, self.user) + tmp_logger = GaussLog(tmp_logger_file, 'parse_and_validate_params', trace_id=self.trace_id) + self.params = ParamsHandler(tmp_logger, self.trace_id).get_valid_params() + self.log_file = self.params.logFile if self.params.logFile else \ + ClusterLog.getOMLogPath(DoradoDisasterRecoveryConstants.STREAMING_LOG_FILE, self.user) + self.logger = GaussLog(self.log_file, self.params.task, trace_id=self.trace_id) + + +if __name__ == '__main__': + if os.getuid() == 0: + GaussLog.exitWithError(ErrorCode.GAUSS_501["GAUSS_50105"]) + + base = DoradoStorageDisasterRecoveryBase() + handler = HANDLER_MAPPING[base.params.task](base.params, base.user, + base.logger, base.trace_id, base.log_file) + handler.handle_lock_file(handler.trace_id, 'create') + try: + if base.params.task in DoradoDisasterRecoveryConstants.TASK_EXIST_CHECK: + handler.check_parallel_process_is_running() + handler.run() + except Exception as error: + handler.logger.error(error) + raise Exception(str(error)) + finally: + handler.handle_lock_file(handler.trace_id, 'remove') diff --git a/script/impl/dorado_disaster_recovery/__init__.py b/script/impl/dorado_disaster_recovery/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/script/impl/dorado_disaster_recovery/ddr_base.py b/script/impl/dorado_disaster_recovery/ddr_base.py new file mode 100644 index 00000000..0424c911 --- /dev/null +++ b/script/impl/dorado_disaster_recovery/ddr_base.py @@ -0,0 +1,2504 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- 
+############################################################################# +# Copyright (c) 2020 Huawei Technologies Co.,Ltd. +# +# openGauss is licensed under Mulan PSL v2. +# You can use this software according to the terms +# and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# +# http://license.coscl.org.cn/MulanPSL2 +# +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, +# WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. +# ---------------------------------------------------------------------------- +# Description : ddr_base.py is a base module for dorado disaster recovery. +############################################################################# +import json +import os +import re +import time +from datetime import datetime +from datetime import timedelta + +from domain_utils.cluster_file.version_info import VersionInfo +from impl.dorado_disaster_recovery.ddr_constants import DoradoDisasterRecoveryConstants +from impl.dorado_disaster_recovery.params_handler import check_local_cluster_conf +from impl.dorado_disaster_recovery.params_handler import check_remote_cluster_conf +from gspylib.common.DbClusterInfo import dbClusterInfo +from gspylib.common.Common import DefaultValue, ClusterInstanceConfig +from gspylib.common.ErrorCode import ErrorCode +from gspylib.common.Common import ClusterCommand +from gspylib.common.OMCommand import OMCommand +from gspylib.common.DbClusterStatus import DbClusterStatus +from gspylib.threads.SshTool import SshTool +from gspylib.threads.parallelTool import parallelTool +from gspylib.os.gsfile import g_file +from base_utils.os.cmd_util import CmdUtil +from base_utils.os.env_util import EnvUtil +from base_utils.os.net_util import NetUtil +from base_utils.os.file_util import FileUtil +from base_utils.os.user_util import UserUtil +from base_utils.security.sensitive_mask import SensitiveMask +from base_utils.common.constantsbase import ConstantsBase + + +class DoradoDisasterRecoveryBase(object): + def __init__(self, params, user, logger, trace_id, log_file=None): + self.user = user + self.params = params + self.logger = logger + self.trace_id = trace_id + self.log_file = log_file + self.cluster_info = None + self.gp_home = None + self.pg_host = None + self.gauss_home = None + self.bin_path = None + self.local_host = None + self.local_ip = None + self.is_single_inst = None + self.streaming_file_dir = None + self.streaming_xml = None + self.cluster_node_names = None + self.normal_cm_ips = [] + self.normal_node_list = [] + self.ssh_tool = None + self.mpp_file = None + self.status_info = None + self.step_file_path = "" + self.cluster_status = '' + self.normal_dn_ids = [] + self.normal_cn_ids = [] + self.normal_etcd_ids = [] + self.normal_gtm_ids = [] + self.normal_cm_ids = [] + self.normal_instances = [] + self.primary_dn_ids = [] + self.main_standby_ids = [] + self.cascade_standby_ids = [] + self.connected_nodes = [] + self.__init_globals() + self.backup_open_key = DoradoDisasterRecoveryConstants.BACKUP_OPEN % user + + def __init_globals(self): + self.cluster_info = dbClusterInfo() + self.cluster_info.initFromStaticConfig(self.user) + self.gp_home = EnvUtil.getEnvironmentParameterValue("GPHOME", self.user) + self.pg_host = EnvUtil.getEnvironmentParameterValue("PGHOST", self.user) + self.gauss_home = EnvUtil.getEnvironmentParameterValue("GAUSSHOME", self.user) +
self.bin_path = os.path.join(os.path.realpath(self.gauss_home), 'bin') + self.local_host = NetUtil.GetHostIpOrName() + self.local_ip = DefaultValue.getIpByHostName() + self.is_single_inst = True if self.cluster_info.isSingleInstCluster() else None + self.cluster_node_names = self.cluster_info.getClusterNodeNames() + self.streaming_file_dir = os.path.join(self.pg_host, DoradoDisasterRecoveryConstants.DDR_FILES_DIR) + self.streaming_xml = os.path.join(self.streaming_file_dir, + DoradoDisasterRecoveryConstants.STREAMING_CONFIG_XML) + self.ssh_tool = SshTool(self.cluster_node_names, self.log_file) + self.mpp_file = EnvUtil.getMpprcFile() + self.dss_home_dir = "" + self._init_step_file_path() + + def init_cluster_conf(self): + """ + Init cluster conf from file + """ + if (not hasattr(self.params, "localClusterConf")) \ + or (not hasattr(self.params, "remoteClusterConf")): + self.logger.log("Parse cluster conf from file.") + local_conf, remote_conf = self.read_cluster_conf_record() + self.logger.debug("Start validte cluster conf info.") + check_local_cluster_conf(local_conf) + check_remote_cluster_conf(remote_conf) + setattr(self.params, "localClusterConf", local_conf) + setattr(self.params, "remoteClusterConf", remote_conf) + self.logger.log("Successfully parse cluster conf from file.") + + def _init_step_file_path(self): + """ + Init step file path + """ + if self.params.task == DoradoDisasterRecoveryConstants.ACTION_START: + if self.params.mode == "primary": + step_file_name = DoradoDisasterRecoveryConstants.DDR_STEP_FILES["start_primary"] + elif self.params.mode == "disaster_standby": + step_file_name = DoradoDisasterRecoveryConstants.DDR_STEP_FILES["start_standby"] + else: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "init step file path") + elif self.params.task == DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER: + if self.params.mode == "primary": + step_file_name = DoradoDisasterRecoveryConstants.DDR_STEP_FILES["switchover_primary"] + elif self.params.mode == "disaster_standby": + step_file_name = DoradoDisasterRecoveryConstants.DDR_STEP_FILES["switchover_standby"] + else: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "init step file path") + else: + step_file_name = DoradoDisasterRecoveryConstants.DDR_STEP_FILES[self.params.task] + self.step_file_path = os.path.join(self.streaming_file_dir, step_file_name) + self.logger.debug("Init step file:%s." % self.step_file_path) + + def read_cluster_conf_record(self, check_file_exist=True): + """ + Read cluster conf from file + """ + cluster_conf_record = os.path.join(self.streaming_file_dir, + DoradoDisasterRecoveryConstants.DDR_CLUSTER_CONF_RECORD) + if not os.path.isfile(cluster_conf_record): + if check_file_exist: + raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] + % "check cluster conf, cluster_conf_record is lost") + else: + self.logger.log("Not found file cluster_conf_record.") + return '', '' + content = DefaultValue.obtain_file_content(cluster_conf_record, is_list=False) + json_content = json.loads(content) + local_conf = json_content["localClusterConf"] + remote_conf = json_content["remoteClusterConf"] + return local_conf, remote_conf + + def handle_lock_file(self, trace_id, action): + """ + Create lock file for other streaming process. + """ + if self.params.task not in DoradoDisasterRecoveryConstants.TASK_EXIST_CHECK: + return + file_name = DoradoDisasterRecoveryConstants.PROCESS_LOCK_FILE + trace_id + file_path = os.path.join(self.pg_host, file_name) + self.logger.debug("Start %s lock file:%s." 
% (action, file_path)) + if action == 'create': + FileUtil.createFile(file_path, DefaultValue.KEY_FILE_MODE) + elif action == 'remove': + if os.path.isfile(file_path): + FileUtil.removeFile(file_path, DefaultValue.KEY_FILE_MODE) + else: + self.logger.warn("Not found:%s." % file_path) + self.logger.debug("Successfully %s lock file:%s." % (action, file_path)) + + def check_parallel_process_is_running(self): + """ + Check streaming process is running + """ + hostnames = ' -H '.join(self.cluster_node_names) + file_path = os.path.join(self.pg_host, DoradoDisasterRecoveryConstants.PROCESS_LOCK_FILE) + cmd = 'source %s && pssh -t 10 -H %s "ls %s*"' % (self.mpp_file, hostnames, file_path) + # waiting for check + time.sleep(DoradoDisasterRecoveryConstants.CHECK_PROCESS_WAIT_TIME) + _, output = CmdUtil.retryGetstatusoutput(cmd, retry_time=0) + host_file_str_list = re.findall(r'.* ?: *%s[^\*^\s]+' % file_path, output) + process_list = [] + for item in host_file_str_list: + hostname = item.split(':')[0].strip() + file_name = item.split(':')[1].strip() + uuid = os.path.basename(file_name).split('_')[-1] + if uuid != self.trace_id: + process_list.append([hostname, file_name]) + if process_list: + msg = ErrorCode.GAUSS_516['GAUSS_51632'] \ + % 'check dorado disaster recovery process, please execute after other ' \ + 'process exited, if you ensure no other process is running, ' \ + 'remove the lock file [%s] on node [%s], and try again' \ + % (process_list[0][-1], process_list[0][0]) + self.logger.error(msg) + raise Exception(msg) + + def create_disaster_recovery_dir(self, dir_path): + """ + Create disaster recovery files dir + """ + cmd = g_file.SHELL_CMD_DICT["createDir"] % ( + dir_path, dir_path, DefaultValue.MAX_DIRECTORY_MODE) + self.ssh_tool.executeCommand(cmd) + self.logger.debug("Successfully create dir [%s] on all nodes." % dir_path) + + def check_hadr_pwd(self, only_mode=None): + """ + Check hadr pwd is correct or not + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Checking hadr user is not for mode:%s." % self.params.mode) + return + self.logger.debug("Start checking disaster user password.") + sql = "select 1;" + primary_dns = [dn_inst for db_node in self.cluster_info.dbNodes for dn_inst in + db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] + if not primary_dns: + raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] + % "obtain primary dn when check disaster user") + status, output = ClusterCommand.remoteSQLCommand( + sql, self.user, primary_dns[0].hostname, primary_dns[0].port, False, + user_name=self.params.hadrUserName, user_pwd=self.params.hadrUserPassword) + if status != 0: + if "Invalid username/password" in output: + self.logger.debug("Logging denied, please check your password.") + self.logger.logExit(ErrorCode.GAUSS_516['GAUSS_51632'] + % "check disaster user password") + self.logger.debug("Successfully check disaster user password.") + + def check_hadr_user(self, only_mode=None): + """ + Check hadr user is exist + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Checking hadr user is not for mode:%s." 
% self.params.mode) + return + self.logger.log("Start checking disaster recovery user.") + sql = "select usename, userepl from pg_user;" + primary_dns = [dn_inst for db_node in self.cluster_info.dbNodes for dn_inst in + db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] + if not primary_dns: + raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] + % "obtain primary dn when check disaster user") + status, output = ClusterCommand.remoteSQLCommand( + sql, self.user, primary_dns[0].hostname, primary_dns[0].port, True) + if status != 0: + raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] + % "execute sql for checking disaster user.") + user_dict = {user_info.split('|')[0].strip(): user_info.split('|')[-1].strip() + for user_info in output.strip().split('\n')} + for user_name, repl in user_dict.items(): + if user_name == self.params.hadrUserName and repl == 't': + self.logger.log("Successfully check disaster recovery user.") + return + msg = ErrorCode.GAUSS_516['GAUSS_51632'] % 'checking disaster user, please confirm ' \ + 'disaster user is exist and with ' \ + 'replication role' + self.logger.logExit(msg + "Users:%s" % user_dict) + + def __copy_hadr_user_key(self, secure_dir_path, update=False): + """ + Copy hadr.key.cipher and hadr.key.rand + """ + self.logger.log("Start copy hadr user key files.") + hadr_cipher_path = os.path.join(self.bin_path, "hadr.key.cipher") + hadr_rand_path = os.path.join(self.bin_path, "hadr.key.rand") + secure_cipher_path = os.path.join(secure_dir_path, "hadr.key.cipher") + secure_rand_path = os.path.join(secure_dir_path, "hadr.key.rand") + if not update: + if (not os.path.isfile(hadr_cipher_path)) or (not os.path.isfile(hadr_rand_path)): + self.logger.debug("Not found hadr user key, no need to copy.") + return + FileUtil.cpFile(hadr_cipher_path, secure_cipher_path, cmd_type="shell") + FileUtil.cpFile(hadr_rand_path, secure_rand_path, cmd_type="shell") + self.logger.debug("Successfully copy hadr key files into temp secure dir.") + else: + if (not os.path.isfile(secure_cipher_path)) or (not os.path.isfile(secure_rand_path)): + self.logger.debug("Not found hadr user key, no need to update.") + return + host_names = self.get_all_connection_node_name("update_hadr_key") + self.ssh_tool.scpFiles(secure_cipher_path, self.bin_path, hostList=host_names) + self.ssh_tool.scpFiles(secure_rand_path, self.bin_path, hostList=host_names) + FileUtil.removeFile(secure_cipher_path) + FileUtil.removeFile(secure_rand_path) + self.logger.debug("Finished copy hadr key files to nodes:%s." 
% host_names) + + def remove_secure_dir(self, dir_path, host_name): + """ + Remove gs_secure_files dir in PGDATA + """ + secure_dir_path = os.path.join(dir_path, DoradoDisasterRecoveryConstants.GS_SECURE_FILES) + cmd = "echo \"if [ -d '%s' ];then rm -rf '%s';fi\" | pssh -s -H %s" % \ + (secure_dir_path, secure_dir_path, host_name) + status, output = CmdUtil.retryGetstatusoutput(cmd) + self.logger.debug("Remove gs_secure_files cmd:%s" % cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + " Error: \n%s " % output) + + def __stream_copy_file_to_all_dn(self, temp_secure_dir_path): + """ + copy key file dir to all dn dir + """ + dn_infos = DefaultValue.get_dn_info(self.cluster_info) + self.logger.debug("Got dns:%s" % dn_infos) + copy_succeed = 0 + host_names = self.get_all_connection_node_name("copy gs_secure_files to dns") + for dn_info in dn_infos: + if dn_info["host_name"] not in host_names: + continue + self.logger.debug("Copy disaster recovery secure files to inst[%s][%s][%s]." % + (dn_info['id'], dn_info['data_dir'], dn_info['host_name'])) + try: + self.remove_secure_dir(dn_info['data_dir'], dn_info['host_name']) + self.ssh_tool.scpFiles( + temp_secure_dir_path, dn_info['data_dir'], [dn_info['host_name']]) + copy_succeed += 1 + except Exception as error: + self.logger.debug("Failed copy secure files to inst[%s][%s][%s],error:%s." % + (dn_info['id'], dn_info['data_dir'], dn_info['host_name'], + str(error))) + if copy_succeed == 0: + raise Exception( + ErrorCode.GAUSS_516["GAUSS_51632"] % "copy secure dir to all dn data dir") + self.logger.log("Successfully copy secure files.") + + def __prepare_cluster_user_record(self, temp_secure_dir_path): + """ + Save cluster user record + """ + cluster_user_record = os.path.join(temp_secure_dir_path, + DoradoDisasterRecoveryConstants.CLUSTER_USER_RECORD) + DefaultValue.write_content_on_file(cluster_user_record, self.user) + self.logger.debug("Record current cluster user:%s." % self.user) + + def prepare_gs_secure_files(self, only_mode=None): + """ + Prepare gs_secure_files on primary cluster + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Prepare gs_secure_files is not for mode:%s." % self.params.mode) + return + self.logger.log("Start prepare secure files.") + secure_dir_name = DoradoDisasterRecoveryConstants.GS_SECURE_FILES + temp_secure_dir_path = os.path.realpath( + os.path.join(self.streaming_file_dir, secure_dir_name)) + if os.path.isdir(temp_secure_dir_path): + self.logger.debug("Secure file dir exist, cleaning...") + FileUtil.removeDirectory(temp_secure_dir_path) + FileUtil.createDirectory(temp_secure_dir_path, True, DefaultValue.KEY_DIRECTORY_MODE) + if os.path.isdir(temp_secure_dir_path): + self.logger.debug("Successfully create secure file dir.") + version_file_path = os.path.realpath(os.path.join(self.gp_home, "version.cfg")) + FileUtil.cpFile(version_file_path, temp_secure_dir_path) + self.__prepare_cluster_user_record(temp_secure_dir_path) + self.__copy_hadr_user_key(temp_secure_dir_path, update=False) + self.__stream_copy_file_to_all_dn(temp_secure_dir_path) + FileUtil.removeDirectory(temp_secure_dir_path) + + def stream_clean_gs_secure(self, params): + """ + clean gs secure dir + """ + inst, file_path = params + self.logger.debug("Starting clean instance %s gs secure dir." 
% inst.instanceId) + cmd = "source %s && pssh -s -H %s 'if [ -d %s ]; then rm -rf %s; fi'" \ + % (self.mpp_file, inst.hostname, file_path, file_path) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + self.logger.debug("Clean gs secure dir for instance [%s] result:%s." % + (inst.instanceId, output)) + self.logger.debug("Successfully clean instance %s gs secure dir." % inst.instanceId) + + def clean_gs_secure_dir(self, only_mode=None): + """ + Clean gs secure dir if exist + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Clean gs_secure_files is not for mode:%s." % self.params.mode) + return + self.logger.debug("Start clean gs secure dir.") + params = [] + for node in self.cluster_info.dbNodes: + for inst in node.datanodes: + if inst.hostname not in self.connected_nodes: + continue + file_path = os.path.realpath(os.path.join( + inst.datadir, DoradoDisasterRecoveryConstants.GS_SECURE_FILES)) + params.append((inst, file_path)) + if params: + parallelTool.parallelExecute(self.stream_clean_gs_secure, params) + self.logger.debug("Finished clean gs secure dir.") + + def remove_streaming_dir(self, dir_path): + """ + Remove streaming files dir + """ + cmd = "if [ -d %s ]; then rm %s -rf;fi" % (dir_path, dir_path) + self.ssh_tool.executeCommand(cmd) + self.logger.debug("Successfully remove dir [%s] on all nodes." % dir_path) + + def query_streaming_step(self): + """ + Query the recorded streaming disaster recovery step + """ + step = -1 + if os.path.isfile(self.step_file_path): + step_list = FileUtil.readFile(self.step_file_path) + if step_list: + step = int(step_list[0].split("_")[0]) + if step == -1: + self.logger.log("Got the step for action:[%s]." % self.params.task) + else: + self.logger.log("Got the continue step:[%s] for action:[%s]."
% + (step, self.params.task)) + return step + + def write_streaming_step(self, step): + """ + write streaming step + :return: NA + """ + self.logger.debug("Streaming action:[%s] record current step:[%s]" + % (self.params.task, step)) + with os.fdopen(os.open(self.step_file_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, + DefaultValue.KEY_FILE_MODE_IN_OS), "w") as fp_write: + fp_write.write(step) + + def init_cluster_status(self): + """ + Generate cluster status file + """ + tmp_file = os.path.join(self.streaming_file_dir, + DoradoDisasterRecoveryConstants.DDR_CLUSTER_STATUS_TMP_FILE) + cmd = ClusterCommand.getQueryStatusCmd("", tmp_file) + self.logger.debug("Command for checking cluster state: %s" % cmd) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + msg = ErrorCode.GAUSS_516["GAUSS_51632"] \ + % "check cluster state, status:%s, output:%s" % (status, output) + self.logger.debug(msg) + raise Exception(msg) + self.logger.debug("Successfully init cluster status.") + + def query_cluster_info(self, cm_check=False): + """ + Query cluster info + """ + cmd = ClusterCommand.getQueryStatusCmd() + if cm_check: + cmd = "source %s; cm_ctl query -Cv" % self.mpp_file + self.logger.debug("Command for checking cluster state: %s" % cmd) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0 or not output.strip(): + msg = ErrorCode.GAUSS_516["GAUSS_51632"] \ + % "check cluster state, status:%s, output:%s" % (status, output) + self.logger.debug(msg) + return "" + return output.strip() + + def __clean_cluster_status(self): + """ + Clean status + """ + self.normal_cm_ids = [] + self.normal_gtm_ids = [] + self.normal_cn_ids = [] + self.primary_dn_ids = [] + self.main_standby_ids = [] + self.cascade_standby_ids = [] + self.normal_dn_ids = [] + self.normal_etcd_ids = [] + self.normal_instances = [] + + def __parse_instance_status(self): + """ + Parse instance status + """ + abnormal_insts = [] + for db_node in self.status_info.dbNodes: + for cms_inst in db_node.cmservers: + if cms_inst.status in ["Primary", "Standby"]: + self.normal_cm_ids.append(cms_inst.instanceId) + self.normal_instances.append(cms_inst) + else: + abnormal_insts.append({cms_inst.instanceId: cms_inst.status}) + for gtm_inst in db_node.gtms: + if gtm_inst.status in ["Primary", "Standby"] and gtm_inst.isInstanceHealthy(): + self.normal_gtm_ids.append(gtm_inst.instanceId) + self.normal_instances.append(gtm_inst) + else: + abnormal_insts.append({gtm_inst.instanceId: gtm_inst.status}) + for coo_inst in db_node.coordinators: + if coo_inst.status == "Normal": + self.normal_cn_ids.append(coo_inst.instanceId) + self.normal_instances.append(coo_inst) + else: + abnormal_insts.append({coo_inst.instanceId: coo_inst.status}) + for data_inst in db_node.datanodes: + if data_inst.status in ["Primary"]: + self.primary_dn_ids.append(data_inst.instanceId) + if data_inst.status in ["Main Standby"]: + self.main_standby_ids.append(data_inst.instanceId) + if data_inst.status in ["Cascade Standby"]: + self.cascade_standby_ids.append(data_inst.instanceId) + if data_inst.status in ["Primary", "Standby", "Cascade Standby", "Main Standby" + ] and data_inst.isInstanceHealthy(): + self.normal_dn_ids.append(data_inst.instanceId) + self.normal_instances.append(data_inst) + else: + abnormal_insts.append({data_inst.instanceId: data_inst.status}) + for etcd_inst in db_node.etcds: + if etcd_inst.status in ["StateLeader", "StateFollower"] \ + and etcd_inst.isInstanceHealthy(): + self.normal_etcd_ids.append(etcd_inst.instanceId) + 
self.normal_instances.append(etcd_inst) + else: + abnormal_insts.append({etcd_inst.instanceId: etcd_inst.status}) + return abnormal_insts + + def parse_cluster_status(self, current_status=None): + """ + Parse cluster status + """ + tmp_file = os.path.join(self.streaming_file_dir, + DoradoDisasterRecoveryConstants.DDR_CLUSTER_STATUS_TMP_FILE) + if (not os.path.isfile(tmp_file)) and (not current_status): + raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] + % "cluster status file:%s" % tmp_file) + self.status_info = DbClusterStatus() + self.__clean_cluster_status() + if current_status: + self.status_info.init_from_content(current_status) + else: + self.status_info.initFromFile(tmp_file) + self.cluster_status = self.status_info.clusterStatus + self.logger.debug("Current cluster status is:%s." % self.cluster_status) + # Parse instance status + abnormal_insts = self.__parse_instance_status() + # Get node names of normal nodes with nodeId + for instance in self.normal_instances: + self.normal_node_list.append(self.cluster_info.getDbNodeByID(int(instance.nodeId)).name) + self.normal_node_list = list(set(self.normal_node_list)) + for node_id in list(set(self.normal_cm_ids)): + self.normal_cm_ips.append(self.cluster_info.getDbNodeByID(int(node_id)).name) + self.logger.debug("Parsed primary dns:%s" % self.primary_dn_ids) + self.logger.debug("Parsed Main standby dns:%s" % self.main_standby_ids) + if abnormal_insts: + self.logger.debug("Abnormal instances:%s" % abnormal_insts) + else: + self.logger.debug("Checked all instances is normal:%s" + % set([inst.instanceId for inst in self.normal_instances])) + + def check_cluster_status(self, status_allowed, only_check=False, + check_current=False, is_log=True): + """ + Stream disaster cluster switch to check cluster status + """ + cluster_status = self.cluster_status + if check_current: + self.logger.debug("Starting check CLuster status") + check_cmd = "source %s && cm_ctl query | grep cluster_state | awk '{print $NF}'"\ + % self.mpp_file + status, output = CmdUtil.retryGetstatusoutput(check_cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51600"] + + "status(%d), output(%s)" % (status, output)) + cluster_status = output.strip() + self.logger.debug("Checked cluster status is:%s" % cluster_status) + if cluster_status not in status_allowed: + if only_check is True: + self.logger.debug("Current cluster status is %s" % cluster_status) + return False + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "check cluster status") + if is_log: + self.logger.log("Successfully check cluster status is: %s." % cluster_status) + else: + self.logger.debug("Checked cluster status is: %s." 
% cluster_status) + return True + + def check_is_under_upgrade(self): + """ + Check is cluster is not doing upgrade + """ + if DefaultValue.isUnderUpgrade(self.user): + self.logger.logExit(ErrorCode.GAUSS_516["GAUSS_51632"] + % "check upgrade binary file, please ensure upgrade " + "is finished and upgrade files has been cleaned") + self.logger.debug("Successfully check cluster is not under upgrade opts.") + + def check_cluster_is_common(self): + """ + Check no main standby and cascade standby + """ + if self.main_standby_ids or self.cascade_standby_ids: + self.logger.logExit(ErrorCode.GAUSS_516["GAUSS_51632"] + % "check instance status, there are already main standby " + "or cascade standby, main standby:%s, cascade stadnby:%s" + % (self.main_standby_ids, self.cascade_standby_ids)) + self.logger.log("Successfully check instance status.") + + def check_dn_instance_params(self): + """set_dn_instance_params""" + check_dick = {"enable_dcf": "off", "synchronous_commit": "on"} + dn_insts = [dn_inst for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes] + if len(dn_insts) <= 2: + self.logger.debug("Need set most available for current cluster.") + check_dick.update({"most_available_sync": "on"}) + primary_dn_insts = [inst for inst in dn_insts if inst.instanceId in self.primary_dn_ids] + if not primary_dn_insts: + self.logger.debug("The primary dn not exist, do not need check dn inst params.") + return + execute_dn = primary_dn_insts[0] + param_list = [] + guc_backup_file = os.path.join(self.streaming_file_dir, DoradoDisasterRecoveryConstants.GUC_BACKUP_FILE) + if not os.path.isfile(guc_backup_file): + FileUtil.createFileInSafeMode(guc_backup_file, DefaultValue.KEY_FILE_MODE_IN_OS) + for peer_check, idx in list(check_dick.items()): + param_list.append((execute_dn, {peer_check: idx})) + ret = parallelTool.parallelExecute(self._check_dn_inst_param, param_list) + self.ssh_tool.scpFiles(guc_backup_file, self.streaming_file_dir, self.cluster_node_names) + if any(ret): + self.logger.logExit('\n'.join(filter(bool, ret))) + self.logger.debug("Successfully check dn inst default value.") + + def _check_dn_inst_param(self, param): + """check_dn_inst_param""" + self.logger.debug("Check dn inst params: %s." % param[1]) + if len(param) != 2: + error_msg = ErrorCode.GAUSS_521["GAUSS_52102"] % param + return error_msg + guc_backup_file = os.path.join(self.streaming_file_dir, DoradoDisasterRecoveryConstants.GUC_BACKUP_FILE) + for sql_key, value in list(param[1].items()): + sql = "show %s;" % sql_key + (status, output) = ClusterCommand.remoteSQLCommand(sql, + self.user, param[0].hostname, + str(param[0].port)) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % sql, "\nError:%s" % output) + if output.strip() != value: + if sql_key in DoradoDisasterRecoveryConstants.GUC_CHANGE_MAP.keys(): + content = "%s,%s,%s" % (sql_key, output.strip(), self.trace_id) + FileUtil.write_add_file(guc_backup_file, content, + DefaultValue.KEY_FILE_MODE_IN_OS) + self.__set_guc_param(sql_key, DoradoDisasterRecoveryConstants.GUC_CHANGE_MAP[sql_key], + mode="reload", inst_type="dn", raise_error=True) + return + error_msg = ErrorCode.GAUSS_516["GAUSS_51632"] \ + % "check [%s], Actual value: [%s], expect value: [%s]" \ + % (sql, output, value) + return error_msg + self.logger.debug("Successfully check and rectify dn inst value:%s." 
% param[1]) + + def restore_guc_params(self): + """ + Restore guc params in .streaming_guc_backup + """ + self.logger.debug("Start restore guc params.") + guc_backup_file = os.path.join(self.streaming_file_dir, DoradoDisasterRecoveryConstants.GUC_BACKUP_FILE) + if not os.path.isfile(guc_backup_file): + self.logger.debug("Not found guc backup file, no need to restore guc params.") + return + params_record = DefaultValue.obtain_file_content(guc_backup_file) + params_record.reverse() + restored_keys = [] + for param in params_record: + guc_key, guc_value, trace_id = param.split(",") + self.logger.debug("Got guc param:%s, value:%s, trace id:%s in guc backup file." + % (guc_key, guc_value, trace_id)) + if guc_key not in DoradoDisasterRecoveryConstants.GUC_CHANGE_MAP.keys(): + continue + # When the number of dns <=2, ensure that the maximum available mode is always on. + dn_insts = [dn_inst for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes] + if guc_key in restored_keys or len(dn_insts) <= 2 \ + and guc_key in ["most_available_sync"]: + continue + guc_value = "off" if guc_value not in ["on", "off"] else guc_value + self.__set_guc_param(guc_key, guc_value, mode="reload", + inst_type="dn", raise_error=False) + restored_keys.append(guc_key) + + def set_most_available(self, mode='set', inst_type='dn', raise_error=True): + dn_insts = [dn_inst for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes if int(dn_inst.mirrorId) == 1] + if len(dn_insts) > 2: + self.logger.debug("No need set most available for current cluster.") + return + self.__set_guc_param("most_available_sync", "on", mode=mode, + inst_type=inst_type, raise_error=raise_error) + + self.__set_guc_param("synchronous_commit", "on", mode=mode, + inst_type=inst_type, raise_error=raise_error) + + def __set_guc_param(self, key, value, mode='set', inst_type='dn', raise_error=True): + """ + Set guc param + """ + if inst_type == 'dn': + instance = '-Z datanode' + elif inst_type == 'cn': + instance = '-Z coordinator' + else: + instance = "-Z datanode -Z coordinator" + cmd = "source %s && gs_guc %s %s -N all -I all " \ + "-c \"%s=%s\"" \ + % (self.mpp_file, mode, instance, key, value) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + if raise_error: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + "Error:%s" % output) + else: + self.logger.debug(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + "Error:%s" % output) + else: + self.logger.debug("Successfully change %s %s with mode %s." % (key, value, mode)) + + def distribute_cluster_conf(self): + """ + Record cluster conf in files + """ + data = {"remoteClusterConf": self.params.remoteClusterConf, + "localClusterConf": self.params.localClusterConf} + file_path = os.path.join(self.streaming_file_dir, + DoradoDisasterRecoveryConstants.DDR_CLUSTER_CONF_RECORD) + FileUtil.write_update_file(file_path, data, DefaultValue.KEY_FILE_MODE_IN_OS) + self.ssh_tool.scpFiles(file_path, self.streaming_file_dir, self.cluster_node_names) + + def __record_wal_keep_segments(self, param_list): + """ + record wal_keep_segments value to .wal_keep_segments_record + """ + dn_inst, sql_check, wal_keep_segments = param_list + self.logger.debug("Starting record wal_keep_segments default " + "value for instance:%s." % dn_inst.instanceId) + (status, output) = ClusterCommand.remoteSQLCommand( + sql_check, self.user, dn_inst.hostname, dn_inst.port, True) + self.logger.debug("Got %s wal_keep_segments, status=%d, output: %s."
% + (dn_inst.instanceId, status, SensitiveMask.mask_pwd(output))) + if status == 0 and output.strip(): + value = output.strip() + FileUtil.createFile(wal_keep_segments, True, DefaultValue.KEY_FILE_MODE) + FileUtil.writeFile(wal_keep_segments, [str(dn_inst.instanceId) + ":" + str(value)]) + self.logger.debug("Successfully record %s wal_keep_segments default value:%s" % + (dn_inst.hostname, value)) + else: + raise Exception(ErrorCode.GAUSS_502["GAUSS_50219"] + % "wal_keep_segments default value of %s" % dn_inst.instanceId) + + def get_default_wal_keep_segments(self, only_mode=None): + """ + get wal_keep_segments default value in primary dn + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Wal keep segment opts not for mode:%s." % self.params.mode) + return + self.logger.debug("Starting get wal_keep_segments default value.") + wal_keep_segments = os.path.join( + self.streaming_file_dir, DoradoDisasterRecoveryConstants.WAL_KEEP_SEGMENTS) + sql_check = "show wal_keep_segments;" + param_list = [(dn_inst, sql_check, wal_keep_segments) for db_node in + self.cluster_info.dbNodes for dn_inst in db_node.datanodes + if dn_inst.instanceId in self.primary_dn_ids] + if not param_list: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] + % "obtain param list for get wal_keep_segments") + parallelTool.parallelExecute(self.__record_wal_keep_segments, param_list) + self.logger.debug("Successfully get wal_keep_segments default value.") + + def __set_wal_keep_segments_each_inst(self, params_list): + """ + Set wal_keep_segments value in primary dn + """ + (inst, opt_type, value, mpprc_file) = params_list + self.logger.debug("Start [%s] shardNum [%s] node [%s] wal_keep_segments value [%s]." + % (opt_type, inst.mirrorId, inst.hostname, value)) + cmd = "source %s; pssh -H %s \"source %s ; gs_guc %s " \ + "-Z datanode -D %s -c \\\"wal_keep_segments = '%s'\\\"\"" % \ + (mpprc_file, inst.hostname, mpprc_file, opt_type, inst.datadir, value) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + + "Options:%s, Error: \n%s " + % ("set wal_keep_segments for inst:%s" % inst.instanceId, str(output))) + self.logger.debug("Successfully [%s] shardNum [%s] node [%s] wal_keep_segments " + "value [%s]." % (opt_type, inst.mirrorId, inst.hostname, value)) + + def set_wal_keep_segments(self, opt_type, value, restore_flag=False, only_mode=None): + """ + guc set wal_keep_segments value in primary dn + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Set wal_keep_segments opts not for mode:%s." % self.params.mode) + return + self.logger.log("Starting %s wal_keep_segments value: %s." % (opt_type, value)) + if restore_flag and isinstance(value, dict): + params_list = [(inst, opt_type, value.get(inst.instanceId, 128), self.mpp_file) for + node in self.cluster_info.dbNodes for inst in node.datanodes + if inst.instanceId in self.primary_dn_ids] + else: + params_list = [(inst, opt_type, value, self.mpp_file) for node in + self.cluster_info.dbNodes for inst in node.datanodes + if inst.instanceId in self.primary_dn_ids] + if not params_list: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] + % "obtain param list for set wal_keep_segments") + parallelTool.parallelExecute(self.__set_wal_keep_segments_each_inst, params_list) + self.logger.log("Successfully %s wal_keep_segments value: %s." 
% (opt_type, value)) + + def __stop_one_node(self, node_id): + """ + Stop one node by node id + """ + self.logger.debug("Start stop node:%s" % node_id) + cmd = ClusterCommand.getStopCmd(int(node_id), "i", 1800) + self.logger.debug("Streaming disaster calling cm_ctl to stop cluster, cmd=[%s]" % cmd) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + self.logger.debug("Failed stop node:%s, error:%s" % (node_id, output)) + else: + self.logger.debug("Successfully stop node:%s" % node_id) + + def stop_cluster_by_node(self, only_mode=None): + """ + stop the cluster by node + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Stop cluster by node not for mode:%s." % self.params.mode) + return + self.logger.log("Stopping the cluster by node.") + static_config = "%s/cluster_static_config" % self.bin_path + cm_ctl_file = "%s/cm_ctl" % self.bin_path + if not os.path.isfile(static_config) or not os.path.isfile(cm_ctl_file): + raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] + % (static_config + " or " + cm_ctl_file)) + node_id_list = list(set([instance.nodeId for instance in self.normal_instances])) + parallelTool.parallelExecute(self.__stop_one_node, node_id_list) + self.logger.log("Successfully stopped the cluster by node for streaming cluster.") + + def get_all_connection_node_name(self, action_flag="", no_update=True): + """ + get all connection node name + """ + if self.connected_nodes and no_update: + self.logger.debug("Got connected nodes:%s for action:%s" + % (self.connected_nodes, action_flag)) + return self.connected_nodes + rets = parallelTool.parallelExecute(DefaultValue.fast_ping, self.cluster_node_names) + self.logger.debug("Check connect for action:%s, result:%s" % (action_flag, str(rets))) + connected_hosts = [ret[0] for ret in rets if ret[-1]] + self.connected_nodes = connected_hosts + return self.connected_nodes + + def update_streaming_pg_hba(self): + """ + update pg_hba.conf, read config_param.json file and set other cluster ip + :return:NA + """ + self.logger.log("Start update pg_hba config.") + FileUtil.cpFile(self.params.xml_path, self.streaming_xml) + cmd = "source %s; %s -U %s -X '%s' --try-reload" % ( + self.mpp_file, OMCommand.getLocalScript( + "Local_Config_Hba"), self.user, self.streaming_xml) + self.logger.debug("Command for changing instance pg_hba.conf file: %s" % cmd) + self.get_all_connection_node_name("update_streaming_pg_hba") + try: + self.ssh_tool.scpFiles(self.streaming_xml, self.streaming_file_dir) + self.ssh_tool.executeCommand(cmd, hostList=self.connected_nodes) + except Exception as error: + msg = ErrorCode.GAUSS_516['GAUSS_51632'] \ + % "update streaming pg_hba with error:%s" % error + self.logger.debug(msg) + raise Exception(msg) + self.logger.log("Successfully update pg_hba config.") + + def __get_repl_info_cmd(self, node_name, ret, dn_inst, opt_mode, idx): + """ + get_repl_info_cmd + """ + if node_name != self.local_host: + set_cmd = "source %s; pssh -H %s \"source %s ; gs_guc %s " \ + "-Z datanode -D %s -c " \ + "\\\"replconninfo%s = 'localhost=%s localport=%s " \ + "localheartbeatport=%s localservice=%s remotehost=%s " \ + "remoteport=%s remoteheartbeatport=%s " \ + "remoteservice=%s iscascade=%s iscrossregion=%s'\\\"\"" + set_cmd = set_cmd % (self.mpp_file, node_name, + self.mpp_file, opt_mode, + dn_inst.datadir, idx, ret.group(1), + ret.group(2), ret.group(3), ret.group(4), + ret.group(5), ret.group(6), ret.group(7), + ret.group(8), "true", "false") + else: + set_cmd = "source %s ; gs_guc %s -Z datanode 
-D %s -c " \ + "\"replconninfo%s = 'localhost=%s localport=%s " \ + "localheartbeatport=%s localservice=%s remotehost=%s " \ + "remoteport=%s remoteheartbeatport=%s " \ + "remoteservice=%s iscascade=%s iscrossregion=%s'\"" + set_cmd = set_cmd % (self.mpp_file, opt_mode, + dn_inst.datadir, idx, ret.group(1), + ret.group(2), ret.group(3), ret.group(4), + ret.group(5), ret.group(6), ret.group(7), + ret.group(8), "true", "false") + return set_cmd + + def __set_original_repl_info(self, dn_inst, node_name, opt_mode="set"): + """ + Rectify original replconninfos + """ + orignal_ports = None + if not all([dn_inst, node_name]): + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "obtain dn infos") + for idx in range(1, DoradoDisasterRecoveryConstants.MAX_REPLICATION_NUMS + 1): + if node_name == self.local_host: + cmd = "source %s; gs_guc check -Z datanode -D %s " \ + "-c 'cross_cluster_replconninfo%s'" % (self.mpp_file, dn_inst.datadir, idx) + else: + cmd = "source %s; pssh -H %s 'source %s; gs_guc check " \ + "-Z datanode -D %s -c \"cross_cluster_replconninfo%s\"'" \ + % (self.mpp_file, node_name, self.mpp_file, dn_inst.datadir, idx) + self.logger.debug("Check original repl infos with cmd:%s" % cmd) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + + " Error: \n%s " % output) + if output.count("=NULL") > 2 or "iscrossregion=true" in output.lower(): + self.logger.debug("InstanceID:%s, Index:%s" % (dn_inst.instanceId, idx)) + return idx, orignal_ports + ret = re.search( + r"cross_cluster_replconninfo%s='localhost=(\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3})" + r" localport=(\d{4,5}) " + r"remotehost=(\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}) " + r"remoteport=(\d{4,5}) " % idx, output) + if not ret: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "search repl infos") + set_cmd = self.__get_repl_info_cmd(node_name, ret, dn_inst, opt_mode, idx) + self.logger.debug("Set original repl infos with cmd:%s" % set_cmd) + status, output = CmdUtil.retryGetstatusoutput(set_cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % set_cmd + + " Error: \n%s " % output) + orignal_ports = (ret.group(2), ret.group(3), ret.group(4)) + self.logger.debug("Successfully rectify original repl infos for instance:%s." + % dn_inst.instanceId) + + def __get_local_data_ip(self, inst_host): + """ + Get local data ip + """ + local_cluster_info = self.params.localClusterConf + shards = local_cluster_info["shards"] + inst_ips = DefaultValue.get_remote_ips(inst_host, self.mpp_file) + for shard in shards: + for node in shard: + ip = node["ip"] + data_ip = node["dataIp"] + if ip in inst_ips: + self.logger.debug("Got ip[%s], dataIp[%s]." % (ip, data_ip)) + return data_ip + raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] + % "obtain shards from local cluster info") + + def __config_one_dn_instance(self, params): + """ + Config replconninfo for one dn instance + """ + inst, opt_mode, remote_cluster_info = params + local_data_ip = self.__get_local_data_ip(inst.hostname) + base_dn_port = self.params.remoteClusterConf['port'] + self.logger.debug("Start config instance:[%s], got dataIp:[%s], port:[%s]." 
+ % (inst.instanceId, local_data_ip, base_dn_port)) + if not all([local_data_ip, base_dn_port]): + raise Exception(ErrorCode.GAUSS_502["GAUSS_50219"] + % "dn port or dataIp for config instance") + inst_index, original_ports = self.__set_original_repl_info( + inst, inst.hostname, opt_mode=opt_mode) + repl_params = [] + shards = remote_cluster_info.get("shards") + for shard in shards: + for node_info in shard: + data_ip = node_info.get("dataIp") + shard_num = node_info.get("shardNum", '1') + if str(inst.mirrorId) == str(shard_num): + repl_params.append(( + shard_num, inst.hostname, local_data_ip, + inst.datadir, data_ip, inst_index, + original_ports, base_dn_port, opt_mode)) + inst_index += 1 + return repl_params + + def __do_config_dn_repl_info(self, params): + """ + function:config postgres conf + :return:NA + """ + shard_num, host, local_data_ip, data_dir, data_ip, index, \ + original_ports, base_port, opt_mode = params + local_port, local_heartbeat, local_service = original_ports + remote_base = int(base_port) + self.logger.debug("shard num %s base port is %s" % (shard_num, remote_base)) + remote_port = remote_base + 1 + remote_heartbeat = remote_base + 5 + remote_service = remote_base + 4 + is_cascade = "false" + if self.local_host == host: + guc_cmd = "source %s ; gs_guc %s -Z datanode -D %s " \ + "-c \"replconninfo%s = 'localhost=%s localport=%s " \ + "localheartbeatport=%s localservice=%s remotehost=%s " \ + "remoteport=%s remoteheartbeatport=%s remoteservice=%s " \ + "iscascade=%s iscrossregion=true'\"" \ + % (self.mpp_file, opt_mode, data_dir, index, local_data_ip, local_port, + local_heartbeat, local_service, data_ip, remote_port, + remote_heartbeat, remote_service, is_cascade) + self.logger.debug("Set datanode postgres file for streaming " + "disaster cluster with cmd:%s" % guc_cmd) + else: + guc_cmd = "source %s; pssh -s -H %s \"source %s ; gs_guc %s -Z datanode -D %s " \ + "-c \\\"replconninfo%s = 'localhost=%s localport=%s " \ + "localheartbeatport=%s localservice=%s remotehost=%s " \ + "remoteport=%s remoteheartbeatport=%s remoteservice=%s " \ + "iscascade=%s iscrossregion=true'\\\"\"" \ + % (self.mpp_file, host, + self.mpp_file, opt_mode, data_dir, index, + local_data_ip, local_port, local_heartbeat, + local_service, data_ip, remote_port, + remote_heartbeat, remote_service, is_cascade) + self.logger.debug("Set datanode postgres file for streaming " + "disaster cluster with cmd:%s" % guc_cmd) + status, output = CmdUtil.retryGetstatusoutput(guc_cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % guc_cmd + + " Error: \n%s " % output) + + def config_cross_cluster_repl_info(self): + """ + update postgresql.conf for replconninfo + """ + self.logger.debug("set all datanode guc param in postgres conf for ddr cluster.") + repl_params = [] + opt_mode = "reload" if self.params.mode == "primary" else "set" + config_repl_params = [] + datanode_instance = [inst for node in self.cluster_info.dbNodes for inst in node.datanodes] + + for inst in datanode_instance: + config_repl_params.append((inst, opt_mode, self.params.remoteClusterConf)) + rets = parallelTool.parallelExecute(self.__config_one_dn_instance, config_repl_params) + for param in rets: + repl_params += param + self.logger.debug("Got repl params:%s" % str(repl_params)) + parallelTool.parallelExecute(self.__do_config_dn_repl_info, repl_params) + self.logger.debug( + "Successfully set all datanode guc param in postgres conf for streaming cluster.") + + def set_cmserver_guc(self, guc_parameter, guc_value, 
guc_type, only_mode=None): + """ + set cmserver guc param + :return: NA + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Set cms guc [%s] to [%s] not for mode:%s." + % (guc_parameter, guc_value, self.params.mode)) + return + cmd = "gs_guc %s -Z cmserver -N all -I all -c \"%s=%s\" " % \ + (guc_type, guc_parameter, guc_value) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + msg = ErrorCode.GAUSS_516['GAUSS_51632'] \ + % "set cm server guc [%s] to [%s], output:%s" \ + % (guc_parameter, guc_value, output) + self.logger.debug(msg) + + def set_cmagent_guc(self, guc_parameter, guc_value, guc_type, only_mode=None): + """ + set cmagent guc param + :return: NA + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Set cma guc [%s] to [%s] not for mode:%s." + % (guc_parameter, guc_value, self.params.mode)) + return + cmd = "gs_guc %s -Z cmagent -N all -I all -c \"%s=%s\" " % \ + (guc_type, guc_parameter, guc_value) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + msg = ErrorCode.GAUSS_516['GAUSS_51632'] \ + % "set cm agent guc [%s] to [%s], output:%s" \ + % (guc_parameter, guc_value, output) + self.logger.debug(msg) + + def __check_datanode_data_ip_connection(self, inst): + """ + Check remote data ip can connect or not + """ + any_connected = False + node_infos = [node_info for shard in self.params.remoteClusterConf.get("shards", []) + for node_info in shard] + local_data_ip = self.__get_local_data_ip(inst.hostname) + for node_info in node_infos: + data_ip = node_info.get("dataIp") + shard_num = node_info.get("shardNum", '1') + if str(shard_num) != str(inst.mirrorId): + continue + _, ret = DefaultValue.fast_ping_on_node(inst.hostname, local_data_ip, + data_ip, self.logger) + if ret: + any_connected = True + break + if not any_connected: + self.logger.error("Failed check data ip connection for inst:%s." % inst.instanceId) + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "check data ip connection") + self.logger.debug("Successfully check main standby data ip connection.") + + def __pghba_backup_handler(self, node_name, dir_name, inst_id, mode="backup"): + """ + Backup or restore pg_hba file. + """ + file_path = os.path.join(dir_name, "pg_hba.conf") + old_file_path = os.path.join(dir_name, "pg_hba.conf.old") + dest_file = os.path.join(self.streaming_file_dir, "%s_pg_hba.conf" % inst_id) + if self.local_host == node_name: + if mode == "backup" and not os.path.isfile(dest_file): + if os.path.isfile(file_path): + self.logger.debug("Backup file from[%s] to[%s]." % ( + file_path, dest_file)) + FileUtil.cpFile(file_path, dest_file) + else: + self.logger.debug("Backup file from[%s] to[%s]." % ( + old_file_path, dest_file)) + FileUtil.cpFile(old_file_path, dest_file) + if mode == "restore": + self.logger.debug("Restore file from[%s] to[%s]." % ( + dest_file, file_path)) + FileUtil.cpFile(dest_file, file_path) + FileUtil.removeFile(dest_file) + else: + if mode == "backup": + cmd = "source %s; pssh -s -H %s \"if [ ! -f '%s' ];then if [ -f '%s' ];" \ + "then cp '%s' '%s';else cp '%s' '%s';fi;fi\"" \ + % (self.mpp_file, node_name, dest_file, file_path, file_path, + dest_file, old_file_path, dest_file) + self.logger.debug("Backup file on node[%s] with cmd [%s]." % ( + node_name, cmd)) + else: + cmd = "source %s; pssh -s -H %s \"cp %s %s && rm -f %s\"" % ( + self.mpp_file, node_name, dest_file, file_path, dest_file) + self.logger.debug("Restore file on node[%s] from[%s] to[%s]." 
% ( + node_name, file_path, dest_file)) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + + " Error: \n%s " % output) + + def __pg_ident_backup_handler(self, node_name, dir_name, inst_id, mode="backup"): + """ + Backup or restore pg_ident file. + """ + file_path = os.path.join(dir_name, "pg_ident.conf") + dest_file = os.path.join(self.streaming_file_dir, "%s_pg_ident.conf" % inst_id) + if self.local_host == node_name: + if mode == "backup" and not os.path.isfile(dest_file): + if os.path.isfile(file_path): + self.logger.debug("Backup file from[%s] to[%s]." % ( + file_path, dest_file)) + FileUtil.cpFile(file_path, dest_file) + + if mode == "restore" and os.path.isfile(dest_file): + self.logger.debug("Restore file from[%s] to[%s]." % ( + dest_file, file_path)) + FileUtil.cpFile(dest_file, file_path) + FileUtil.removeFile(dest_file) + else: + if mode == "backup": + cmd = "source %s; pssh -s -H %s \"if [ ! -f '%s' ];then if [ -f '%s' ];" \ + "then cp '%s' '%s';fi;fi\"" \ + % (self.mpp_file, node_name, dest_file, file_path, file_path, dest_file) + self.logger.debug("Backup file on node[%s] with cmd [%s]." % ( + node_name, cmd)) + else: + cmd = "source %s; pssh -s -H %s \"if [ -f '%s' ];then cp '%s' '%s' && " \ + "rm -f '%s';fi\"" % (self.mpp_file, node_name, dest_file, dest_file, + file_path, dest_file) + self.logger.debug("Restore file on node[%s] from[%s] to[%s]." % ( + node_name, file_path, dest_file)) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + + " Error: \n%s " % output) + + def __start_main_standby_dn(self, start_params): + """ + Start single main standby dn + """ + local_ip, inst, bin_path, distribute_arg, build_timeout = start_params + self.logger.debug("Starting start dn:%s" % inst.instanceId) + if local_ip == inst.hostname: + cmd_start = "source %s; %s/gs_ctl start -D %s -M hadr_main_standby%s" % ( + self.mpp_file, bin_path, inst.datadir, distribute_arg) + else: + cmd_start = "source %s; pssh -s -t %s -H %s \"source %s; %s/gs_ctl start -D %s " \ + "-M hadr_main_standby%s\"" \ + % (self.mpp_file, DoradoDisasterRecoveryConstants.MAX_BUILD_TIMEOUT + 10, inst.hostname, + self.mpp_file, bin_path, inst.datadir, distribute_arg) + self.logger.debug("Start dn with cmd:%s." 
% cmd_start) + status, output = CmdUtil.retry_util_timeout(cmd_start, build_timeout) + if status != 0: + raise Exception( + ErrorCode.GAUSS_514[ + "GAUSS_51400"] % cmd_start + " Error: \n%s " % output) + self.logger.debug("Successfully start dn:%s" % inst.instanceId) + + def __build_main_standby_dn(self, params): + """ + Build single main standby dn + """ + inst, build_timeout, local_ip, bin_path, distribute_arg, rds_backup, backup_pwd = params + self.logger.debug("Start build main standby dn:%s" % inst.instanceId) + self.__check_datanode_data_ip_connection(inst) + self.__pghba_backup_handler(inst.hostname, inst.datadir, inst.instanceId, mode="backup") + self.__pg_ident_backup_handler(inst.hostname, inst.datadir, inst.instanceId, mode="backup") + # -t 1209600 means default value 14 days + if local_ip == inst.hostname: + cmd = "source %s; %s/gs_ctl build -D %s -b cross_cluster_full -g 0 -q -t %s" \ + % (self.mpp_file, bin_path, inst.datadir, + DoradoDisasterRecoveryConstants.MAX_BUILD_TIMEOUT) + else: + cmd = "echo \"source %s; %s/gs_ctl build -D %s -b cross_cluster_full -g 0 -q " \ + " -t %s\" | pssh -s -t %s -H %s" \ + % (self.mpp_file, bin_path, inst.datadir, + DoradoDisasterRecoveryConstants.MAX_BUILD_TIMEOUT, + DoradoDisasterRecoveryConstants.MAX_BUILD_TIMEOUT + 10, inst.hostname) + cmd_log = cmd.replace(backup_pwd, '***') + self.logger.debug("Building with cmd:%s." % cmd_log) + status, output = CmdUtil.retry_util_timeout(cmd, build_timeout) + if status != 0: + error_detail = "Error: Failed to do build because of pssh timeout." \ + if "was killed or timeout" in output else \ + "Error: Failed to do build because of retry timeout in %s s." \ + % build_timeout + self.logger.debug("Failed to do gs_ctl build. " + error_detail) + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] + % "full build from remote cluster" + error_detail) + self.logger.debug("Successfully build main standby dn:%s" % inst.instanceId) + self.__pghba_backup_handler(inst.hostname, inst.datadir, inst.instanceId, mode="restore") + self.__pg_ident_backup_handler(inst.hostname, inst.datadir, inst.instanceId, mode="restore") + start_params = (local_ip, inst, bin_path, distribute_arg, build_timeout) + self.__start_main_standby_dn(start_params) + + def __build_cascade_standby_dn(self, params): + """ + Build single main standby dn + """ + inst, build_timeout, local_ip, bin_path, distribute_arg = params + self.logger.debug("Start build cascade standby dn:%s" % inst.instanceId) + # -t 1209600 means default value 14 days + if local_ip == inst.hostname: + cmd = "source %s; %s/gs_ctl build -D %s -M cascade_standby " \ + "-b standby_full -r 7200%s -t %s" \ + % (self.mpp_file, bin_path, inst.datadir, distribute_arg, + DoradoDisasterRecoveryConstants.MAX_BUILD_TIMEOUT) + else: + cmd = "echo \"source %s; %s/gs_ctl build -D %s -M cascade_standby -b standby_full " \ + "-r 7200%s -t %s\" | pssh -s -t %s -H %s" \ + % (self.mpp_file, bin_path, inst.datadir, distribute_arg, + DoradoDisasterRecoveryConstants.MAX_BUILD_TIMEOUT, + DoradoDisasterRecoveryConstants.MAX_BUILD_TIMEOUT + 10, inst.hostname) + self.logger.debug("Building with cmd:%s." % cmd) + status, output = CmdUtil.retry_util_timeout(cmd, build_timeout) + if status != 0: + error_detail = "Error: Failed to do build because of pssh timeout." \ + if "was killed or timeout" in output else \ + "Error: Failed to do build because of retry timeout in %s s." \ + % build_timeout + self.logger.debug("Failed to do gs_ctl build. 
" + error_detail) + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] + % "full build from remote cluster" + error_detail) + self.logger.debug("Successfully build cascade standby dn:%s" % inst.instanceId) + + def start_dss_instance(self, only_mode=None): + """ + Start dss server process + """ + cmd = "source %s; export DSS_MAINTAIN=TRUE; dssserver -D %s & " % self.dss_home_dir + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + self.logger.error(ErrorCode.GAUSS_516["GAUSS_51600"] + + "status(%d), output(%s)" % (status, output)) + return output + + def kill_dss_instance(self, only_mode=None): + """ + Start dss server process + """ + cmd = "source %s; ps ux | grep dssserver | grep -v grep | awk '{print $2}' | xargs kill -9" % self.mpp_file + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + self.logger.error(ErrorCode.GAUSS_516["GAUSS_51600"] + + "status(%d), output(%s)" % (status, output)) + return output + + def build_dn_instance(self, only_mode=None): + """ + Build dn instance + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Build dn step is not for mode:%s." % self.params.mode) + return + self.logger.debug("Start building process.") + distribute_arg = "" if self.cluster_info.isSingleInstCluster() else " -Z datanode" + main_params = [] + cascade_params = [] + datanode_instance = [inst for node in self.cluster_info.dbNodes + for inst in node.datanodes] + for inst in datanode_instance: + if inst.instanceId in self.main_standby_ids + self.primary_dn_ids: + main_params.append((inst, self.params.waitingTimeout, self.local_host, + self.bin_path, distribute_arg, self.params.hadrUserName, + self.params.hadrUserPassword)) + else: + cascade_params.append((inst, self.params.waitingTimeout, self.local_host, + self.bin_path, distribute_arg)) + if main_params: + parallelTool.parallelExecute(self.__build_main_standby_dn, main_params) + self.logger.debug("Finished build main standby dns.") + #if cascade_params: + # parallelTool.parallelExecute(self.__build_cascade_standby_dn, cascade_params) + # self.logger.debug("Finished build cascade standby dns.") + del self.params.hadrUserPassword + + def query_cluster(self): + """ + query cluster + :return: output + """ + cmd = "source %s; cm_ctl query -v -C -s -i -d" % self.mpp_file + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + self.logger.error(ErrorCode.GAUSS_516["GAUSS_51600"] + + "status(%d), output(%s)" % (status, output)) + return output + + def start_cluster(self, cm_timeout=None, only_mode=None): + """ + start the cluster + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Start cluster is not for mode:%s." % self.params.mode) + return + self.logger.log("Starting the cluster.") + cm_timeout = cm_timeout or 300 + user, group = UserUtil.getPathOwner(self.gp_home) + if user == "" or group == "": + raise Exception("Failed to obtain the owner of application.") + end_time = datetime.now() + timedelta(seconds=cm_timeout) + cmd = ClusterCommand.getStartCmd(0, cm_timeout) + self.logger.debug("Calling cm_ctl to start cluster, cmd=[%s]" % cmd) + status, output = CmdUtil.retryGetstatusoutput(cmd, retry_time=0) + if status != 0: + error_str = ErrorCode.GAUSS_516["GAUSS_51607"] % "the cluster" + \ + " Error:\n%s." % output + self.logger.debug(error_str) + self.logger.log("Warning: the cluster is not normal, please check cluster status!") + else: + self.logger.log("Successfully started primary instance. 
" + "Please wait for standby instances.") + + cluster_normal_status = [DefaultValue.CLUSTER_STATUS_NORMAL, + DefaultValue.CLUSTER_STATUS_DEGRADED] + while True: + time.sleep(5) + self.logger.log('Waiting cluster normal.') + check_ret = self.check_cluster_status(cluster_normal_status, only_check=True, + check_current=True, is_log=False) + if check_ret: + self.logger.log("Successfully started standby instances.") + break + if datetime.now() >= end_time: + query_result = self.query_cluster() + self.logger.log("Timeout. Failed to start the cluster in (%s)s." % cm_timeout) + self.logger.log("Current cluster status (%s)." % query_result) + self.logger.log("It will continue to start in the background.") + break + + def __check_one_main_standby_connection(self, param_list): + """ + concurrent check main standby is connected primary dn + """ + (dn_inst, sql_check) = param_list + self.logger.debug("Node %s primary dn instanceId [%s] Check main standby is connected " + "with cmd:%s." % (dn_inst.hostname, dn_inst.instanceId, sql_check)) + status, output = ClusterCommand.remoteSQLCommand( + sql_check, self.user, dn_inst.hostname, dn_inst.port) + if status == 0 and output.strip(): + self.logger.debug("Successfully check main standby connected " + "primary dn on inst:[%s]." % dn_inst.instanceId) + return True + self.logger.debug("Retry check main standby connected on inst:[%s]." % dn_inst.instanceId) + + def check_main_standby_connection_primary_dn(self, p_inst_list): + """ + check connection main_standby connected primary dn + """ + if not p_inst_list: + self.logger.debug("The primary dn does not exist on current cluster.") + return + self.primary_dn_ids = p_inst_list + sql_check = "select 1 from pg_catalog.gs_hadr_local_rto_and_rpo_stat();" + sql_check_2 = "select 1 from pg_catalog.pg_stat_get_wal_senders() where " \ + "sync_state='Async' and peer_role='Standby' and peer_state='Normal';" + param_list = [(dn_inst, sql_check) for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes + if dn_inst.instanceId in self.primary_dn_ids] + param_list_2 = [(dn_inst, sql_check_2) for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes if dn_inst.instanceId + in self.primary_dn_ids] + if not param_list: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] + % "obtain param list for check main standby connection on primary dn") + self.logger.debug("Start check main standby connection with sql:%s." % sql_check) + results = parallelTool.parallelExecute(self.__check_one_main_standby_connection, + param_list) + self.logger.debug("Start check main standby connection with sql:%s." % sql_check_2) + results_2 = parallelTool.parallelExecute(self.__check_one_main_standby_connection, + param_list_2) + + return all(results+results_2) + + def wait_main_standby_connection(self, only_mode=None): + if only_mode and self.params.mode != only_mode: + self.logger.debug("Start cluster is not for mode:%s." 
% self.params.mode) + return + self.logger.log("Waiting for the main standby connection.") + end_time = datetime.now() + timedelta(seconds=self.params.waitingTimeout) + while True: + p_inst_list = [int(i) for i in DefaultValue.get_primary_dn_instance_id("Primary", + ignore=True)] + if self.check_main_standby_connection_primary_dn(p_inst_list): + break + if datetime.now() >= end_time: + raise Exception( + ErrorCode.GAUSS_516["GAUSS_51632"] % "check main standby connection" + + " Because Waiting timeout: %ss" % str(self.params.waitingTimeout)) + time.sleep(5) + self.logger.log("Main standby already connected.") + + def hadr_key_generator(self, key_name): + """ + Generate key_name.key.cipher & key_name.key.rand + """ + self.logger.log("Start generate hadr key files.") + if not os.path.exists(self.bin_path): + msg = ErrorCode.GAUSS_516["GAUSS_51632"] % "obtain bin path." + self.logger.debug(msg) + raise Exception(msg) + if not os.path.exists(self.gp_home): + msg = ErrorCode.GAUSS_516["GAUSS_51632"] % "obtain env GPHOME" + self.logger.debug(msg) + raise Exception(msg) + key_cipher = os.path.join(self.bin_path, "%s.key.cipher" % key_name) + key_rand = os.path.join(self.bin_path, "%s.key.rand" % key_name) + cmd = "export LD_LIBRARY_PATH=%s/script/gspylib/clib && source %s " \ + "&& gs_guc generate -S default -o %s -D '%s' && %s && %s" \ + % (self.gp_home, self.mpp_file, key_name, self.bin_path, + CmdUtil.getChmodCmd(str(ConstantsBase.KEY_FILE_MODE), key_cipher), + CmdUtil.getChmodCmd(str(ConstantsBase.KEY_FILE_MODE), key_rand)) + if (not os.path.isfile(key_cipher)) or (not os.path.isfile(key_rand)): + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0 or (not os.path.isfile(key_cipher)) \ + or (not os.path.isfile(key_rand)): + msg = ErrorCode.GAUSS_516["GAUSS_51632"] \ + % "generate hadr key files" + "Error:%s" % output + self.logger.error(msg) + raise Exception(msg) + else: + self.logger.log("Streaming key files already exist.") + + self.ssh_tool.scpFiles(key_cipher, self.bin_path) + self.ssh_tool.scpFiles(key_rand, self.bin_path) + self.logger.log("Finished generate and distribute hadr key files.") + + def encrypt_hadr_user_info(self, key_name, hadr_user, hadr_pwd): + """ + Encrypt hadr user info. 
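+ Runs gs_encrypt against the "<hadr_user>|<hadr_pwd>" string using the
+ key name generated by hadr_key_generator; the resulting cipher text is
+ later stored in the database by keep_hadr_user_info via
+ ALTER GLOBAL CONFIGURATION (hadr_user_info).
+ :param key_name: key name used for the hadr key files (<key_name>.key.cipher/.rand)
+ :param hadr_user: disaster recovery user name
+ :param hadr_pwd: disaster recovery user password
+ :return: encrypted hadr user info string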
+ """ + self.logger.log("Start encrypt hadr user info.") + cmd = "source %s && gs_encrypt -f %s \"%s|%s\"" \ + % (self.mpp_file, key_name, hadr_user, hadr_pwd) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0 or not output: + msg = ErrorCode.GAUSS_516["GAUSS_51632"] % "encrypt hadr user info" + self.logger.error(msg) + raise Exception(msg) + self.logger.log("Successfully encrypt hadr user info.") + return output + + def keep_hadr_user_info(self, info_str, retry=5): + """ + Keep hadr user info into GLOBAL CONFIGURATION + """ + self.logger.log("Start save hadr user info into database.") + sql = "ALTER GLOBAL CONFIGURATION with(hadr_user_info ='%s');" % info_str + primary_dns = [dn_inst for db_node in self.cluster_info.dbNodes for dn_inst in + db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] + primary_dns = primary_dns * retry + output = "None" + for dn_inst in primary_dns: + status, output = ClusterCommand.remoteSQLCommand( + sql, self.user, dn_inst.hostname, dn_inst.port, True) + if status == 0: + self.logger.log("Successfully save hadr user info into database.") + return + msg = ErrorCode.GAUSS_516['GAUSS_51632'] % "save hadr user info into database" + self.logger.error(msg + "Error:%s" % SensitiveMask.mask_pwd(output)) + raise Exception(msg) + + def restore_wal_keep_segments(self, only_mode=None): + """ + restore wal_keep_segments default value + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Restore wal_keep_segments not for mode:%s." % self.params.mode) + return + self.logger.debug("Starting restore wal_keep_segments default value.") + default_value_dict = {} + wal_keep_segments = os.path.join(self.streaming_file_dir, + DoradoDisasterRecoveryConstants.WAL_KEEP_SEGMENTS) + if not os.path.isfile(wal_keep_segments): + self.logger.debug("Not found wal keep segments record file, no need restore.") + return + wal_keep_segments_list = FileUtil.readFile(wal_keep_segments) + if not wal_keep_segments_list: + raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] % "obtain record wal_keep_segments") + for each_dn in wal_keep_segments_list: + DefaultValue.checkGuc(each_dn.split(":")[1].strip()) + default_value_dict[each_dn.split(":")[0].strip()] = each_dn.split(":")[1].strip() + self.set_wal_keep_segments("reload", default_value_dict, True) + self.logger.debug("Successfully restore wal_keep_segments default value.") + + def __clean_streaming_files_on_local_node(self, file_name_list): + file_name_list = [file_name_list] \ + if not isinstance(file_name_list, list) else file_name_list + for file_name in file_name_list: + file_path = os.path.join(self.streaming_file_dir, file_name) + if os.path.isfile(file_path): + FileUtil.removeFile(file_path) + self.logger.debug("Successfully removed file:[%s]" % file_path) + + def clean_step_file(self): + """ + Clean step file for each action + """ + step_file = os.path.basename(self.step_file_path) + self.__clean_streaming_files_on_local_node(step_file) + self.logger.log("Successfully removed step file.") + + def check_action_and_mode(self): + """ + Check action and mode if step file exist. + if any streaming options not finished(step file exist), + not allowed doing any other streaming options except query. 
+ """ + self.logger.debug("Checking action and mode.") + exist_step_file_names = [] + for file_name in DoradoDisasterRecoveryConstants.DDR_STEP_FILES.values(): + step_file_path = os.path.join(self.streaming_file_dir, file_name) + if os.path.isfile(step_file_path) and file_name != ".ddr_query.step": + exist_step_file_names.append(file_name) + if exist_step_file_names and set(exist_step_file_names) ^ {os.path.basename( + self.step_file_path)}: + exist_action = [key for key, value in DoradoDisasterRecoveryConstants.DDR_STEP_FILES.items() + if value in exist_step_file_names] + self.logger.logExit(ErrorCode.GAUSS_516["GAUSS_51632"] + % "check action and mode, the step files %s already exist, " + "please ensure the action %s is finished before " + "doing current options" % (exist_step_file_names, exist_action)) + self.logger.debug("Successfully checked action and mode.") + + def clean_streaming_dir(self): + """ + Clean streaming dir when stop or failover + """ + self.logger.debug("Start clean streaming dir:%s." % self.streaming_file_dir) + cmd = g_file.SHELL_CMD_DICT["deleteDir"] % (self.streaming_file_dir, + self.streaming_file_dir) + try: + self.ssh_tool.executeCommand(cmd, hostList=self.cluster_info.getClusterNodeNames()) + except Exception as error: + self.logger.debug( + "Failed to remove streaming dir with error:%s" % error) + self.logger.log("Finished remove streaming dir.") + + def clean_global_config(self): + """ + Clean global config + """ + self.logger.log("Clean hadr user info.") + sql = "DROP GLOBAL CONFIGURATION hadr_user_info;" + primary_dns = [dn_inst for db_node in self.cluster_info.dbNodes for dn_inst in + db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] + output = "None" + for dn_inst in primary_dns: + status, output = ClusterCommand.remoteSQLCommand( + sql, self.user, dn_inst.hostname, dn_inst.port, True) + if status == 0: + self.logger.log("Successfully clean hadr user info from database.") + return + msg = ErrorCode.GAUSS_516['GAUSS_51632'] % "clean hadr user info from database" + self.logger.debug(msg + "Error:%s" % SensitiveMask.mask_pwd(output)) + + def get_build_info(self): + """ + Assemble build infos + """ + # 1. Get local primary dn inst dir, host + self.logger.debug("Start assemble build info") + dn_inst_info = [] + dn_instances = [dn_inst for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes if int(dn_inst.mirrorId) == 1] + for dn_inst in dn_instances: + dn_info = dict() + dn_info["port"] = dn_inst.port + 1 + dn_info["data_dir"] = dn_inst.datadir + dn_info["host_name"] = dn_inst.hostname + dn_info["listen_ip"] = self.__get_local_data_ip(dn_inst.hostname) + self.logger.debug("Got build listen ips:%s, ip:%s selected." + % (str(dn_inst.listenIps), dn_info["listen_ip"])) + dn_inst_info.append(dn_info) + + # 2. Get remote dn ip and port + remote_ip_port = [] + shards = self.params.remoteClusterConf["shards"] + remote_port = int(self.params.remoteClusterConf["port"]) + 1 + shard_info = [info for shard in shards for info in shard + if info.get("shardNum", "1") == "1"] + for node_info in shard_info: + remote_ip = node_info.get("dataIp") + remote_ip_port.append((remote_ip, remote_port)) + if (not dn_inst_info) or (not remote_ip_port): + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "obtain dn info") + self.logger.debug("Successfully get remote dn info:%s." 
% remote_ip_port) + return dn_inst_info, remote_ip_port + + def build_file_from_remote(self): + """ + Build files from remote cluster + """ + local_dn_info, remote_ip_port = self.get_build_info() + cmd_local = 'source %s; %s/gs_ctl build -D %s -M standby -b copy_secure_files -Z datanode' \ + ' -U %s -P "%s" -C "localhost=%s localport=%s remotehost=%s remoteport=%s"' + cmd_remote = "echo \"source %s; %s/gs_ctl build -D %s -M standby -b copy_secure_files -Z " \ + "datanode -U %s -P '%s' -C 'localhost=%s localport=%s " \ + "remotehost=%s remoteport=%s'\"" \ + " | pssh -s -H %s" + + end_time = datetime.now() + timedelta(seconds=self.params.waitingTimeout) + self.logger.debug("Retry Building with timeout:%ss." % self.params.waitingTimeout) + succeed = False + while datetime.now() < end_time: + for local_primary in local_dn_info: + for remote_ip, remote_port in remote_ip_port: + if local_primary["host_name"] == NetUtil.GetHostIpOrName(): + cmd = cmd_local % (self.mpp_file, "%s/bin" % self.gauss_home, + local_primary["data_dir"], + self.params.hadrUserName, self.params.hadrUserPassword, + local_primary["listen_ip"], local_primary["port"], + remote_ip, remote_port) + else: + cmd = cmd_remote % (self.mpp_file, "%s/bin" % self.gauss_home, + local_primary["data_dir"], + self.params.hadrUserName, self.params.hadrUserPassword, + local_primary["listen_ip"], local_primary["port"], + remote_ip, remote_port, local_primary["host_name"]) + result = DefaultValue.fast_ping_on_node(local_primary["host_name"], + local_primary["listen_ip"], + remote_ip, self.logger) + if not result[-1]: + self.logger.debug("Ignore build from %s, ping result:%s" + % (remote_ip, result[-1])) + continue + if self.cluster_info.isSingleInstCluster(): + cmd = cmd.replace(" -Z datanode", "") + self.logger.debug("Building with cmd:%s." + % cmd.replace(self.params.hadrUserPassword, "***")) + status, output = CmdUtil.getstatusoutput_by_fast_popen(cmd) + if status == 0: + succeed = True + self.logger.debug("Successfully Building with cmd:%s." + % cmd.replace(self.params.hadrUserPassword, "***")) + return succeed + else: + self.logger.debug("Building result:%s." 
% SensitiveMask.mask_pwd(output)) + time.sleep(1) + return succeed + + def __copy_secure_dir_from_dn_dir(self): + """ + Find and copy key file dir from all dn dir + """ + local_temp_secure_path = os.path.join( + self.streaming_file_dir, DoradoDisasterRecoveryConstants.GS_SECURE_FILES) + if os.path.isdir(local_temp_secure_path): + FileUtil.removeDirectory(local_temp_secure_path) + rand_path = os.path.join(local_temp_secure_path, DoradoDisasterRecoveryConstants.HADR_KEY_RAND) + cipher_path = os.path.join(local_temp_secure_path, DoradoDisasterRecoveryConstants.HADR_KEY_CIPHER) + cmd_tep = "echo \"if [ -d '%s' ];then source %s && pscp --trace-id %s -H %s '%s' '%s' " \ + "&& rm -rf '%s';fi\" | pssh -s -H %s" + succeed = False + for db_node in self.cluster_info.dbNodes: + for dn_inst in db_node.datanodes: + if int(dn_inst.mirrorId) == 1: + key_file_path = os.path.realpath(os.path.join( + dn_inst.datadir, DoradoDisasterRecoveryConstants.GS_SECURE_FILES)) + cmd_copy_dir = cmd_tep % (key_file_path, self.mpp_file, self.trace_id, + self.local_host, key_file_path, + self.streaming_file_dir, + key_file_path, dn_inst.hostname) + status, output = CmdUtil.getstatusoutput_by_fast_popen(cmd_copy_dir) + self.logger.debug("Copy cmd:%s" % cmd_copy_dir) + if status != 0: + self.logger.debug("Try copy secure dir from:[%s][%s], error:%s" % ( + dn_inst.hostname, key_file_path, output)) + if os.path.isdir(local_temp_secure_path) and os.path.isfile(rand_path) \ + and os.path.isfile(cipher_path): + succeed = True + if not succeed: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "copy secure file dir") + self.logger.debug("Successfully copy secure dir, file list:%s." % + os.listdir(local_temp_secure_path)) + + def build_and_distribute_key_files(self, only_mode=None): + """ + Distribute key files + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Wal keep segment opts not for mode:%s." % self.params.mode) + return + self.logger.log("Start build key files from remote cluster.") + # build file + if not self.build_file_from_remote(): + raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] % "build files from cluster") + # copy file from data dir to streaming dir + self.__copy_secure_dir_from_dn_dir() + # check version consistency + self.__check_version_file() + # check cluster user consistency + self.__check_cluster_user() + # distribute key files to all node + secure_dir_path = os.path.join(self.streaming_file_dir, DoradoDisasterRecoveryConstants.GS_SECURE_FILES) + self.__copy_hadr_user_key(secure_dir_path, update=True) + FileUtil.removeDirectory(secure_dir_path) + self.logger.log("Successfully build and distribute key files to all nodes.") + + def __check_version_file(self): + """ + function: Check whether the version numbers of the host + cluster and the disaster recovery cluster are the same + """ + gs_secure_version = os.path.realpath(os.path.join(self.streaming_file_dir, + "gs_secure_files/version.cfg")) + master_commit_id = VersionInfo.get_version_info(gs_secure_version)[-1] + local_version_file = VersionInfo.get_version_file() + local_commit_id = VersionInfo.get_version_info(local_version_file)[-1] + self.logger.debug("The committed of the host cluster is %s, " + "and the committed of the disaster recovery cluster is %s" % + (master_commit_id, local_commit_id)) + if local_commit_id != master_commit_id: + raise ValueError(ErrorCode.GAUSS_516["GAUSS_51632"] % + "check version. 
Different version of cluster and disaster recovery") + + def __check_cluster_user(self): + """ + function: Check whether the version numbers of the host + cluster and the disaster recovery cluster are the same + """ + user_file = os.path.realpath(os.path.join(self.streaming_file_dir, + DoradoDisasterRecoveryConstants.GS_SECURE_FILES, + DoradoDisasterRecoveryConstants.CLUSTER_USER_RECORD)) + remote_user = DefaultValue.obtain_file_content(user_file, is_list=False) + if remote_user.strip() != self.user: + self.logger.logExit(ErrorCode.GAUSS_516["GAUSS_51632"] + % "check cluster user consistency, remote:%s, local:%s" + % (remote_user, self.user)) + self.logger.debug("Successfully checked cluster user consistency.") + + def check_cluster_type(self, allowed_type): + """ + Check cluster type is allowed type or not + """ + if allowed_type == 'primary' and self.main_standby_ids: + self.logger.logExit(ErrorCode.GAUSS_516['GAUSS_51632'] + % "check cluster type, standby cluster is not supported for %s" + % self.params.task) + elif allowed_type == 'standby' and self.primary_dn_ids: + self.logger.logExit(ErrorCode.GAUSS_516['GAUSS_51632'] + % "check cluster type, primary cluster is not supported for %s" + % self.params.task) + else: + self.logger.log("Check cluster type succeed.") + + def __remove_streaming_repl_info(self, params): + """ + Remove streaming repl info from single dn instances. + """ + dn_inst, guc_mode, dn_num = params + self.logger.debug("Start remove replconninfo for instance:%s" % dn_inst.instanceId) + for idx in range(1, dn_num + 1): + if dn_inst.hostname == self.local_host: + cmd = "source %s; gs_guc check -Z datanode -D %s " \ + "-c 'replconninfo%s'" % (self.mpp_file, dn_inst.datadir, idx) + else: + cmd = "source %s; pssh -H %s 'source %s; gs_guc check " \ + "-Z datanode -D %s -c \"replconninfo%s\"'" \ + % (self.mpp_file, dn_inst.hostname, self.mpp_file, dn_inst.datadir, idx) + self.logger.debug("Check original repl infos with cmd:%s" % cmd) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + + " Error: \n%s " % output) + if output.count("=NULL") > 2: + continue + elif "iscrossregion=false" in output.lower(): + ret = re.search( + r"replconninfo%s='localhost=(\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3})" + r" localport=(\d{4,5}) localheartbeatport=(\d{4,5}) " + r"localservice=(\d{4,5}) " + r"remotehost=(\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}) " + r"remoteport=(\d{4,5}) remoteheartbeatport=(\d{4,5}) " + r"remoteservice=(\d{4,5})" % idx, output) + if not ret: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "search repl infos") + if dn_inst.hostname != NetUtil.GetHostIpOrName(): + set_cmd = "source %s; pssh -H %s \"source %s ; gs_guc %s " \ + "-Z datanode -D %s -c " \ + "\\\"replconninfo%s = 'localhost=%s localport=%s " \ + "localheartbeatport=%s localservice=%s remotehost=%s " \ + "remoteport=%s remoteheartbeatport=%s " \ + "remoteservice=%s'\\\"\"" + set_cmd = set_cmd % (self.mpp_file, dn_inst.hostname, + self.mpp_file, guc_mode, + dn_inst.datadir, idx, ret.group(1), + ret.group(2), ret.group(3), ret.group(4), + ret.group(5), ret.group(6), ret.group(7), + ret.group(8)) + else: + set_cmd = "source %s ; gs_guc %s -Z datanode -D %s -c " \ + "\"replconninfo%s = 'localhost=%s localport=%s " \ + "localheartbeatport=%s localservice=%s remotehost=%s " \ + "remoteport=%s remoteheartbeatport=%s " \ + "remoteservice=%s'\"" + set_cmd = set_cmd % (self.mpp_file, guc_mode, + dn_inst.datadir, idx, ret.group(1), + ret.group(2), 
ret.group(3), ret.group(4), + ret.group(5), ret.group(6), ret.group(7), + ret.group(8)) + self.logger.debug("Set original repl infos with cmd:%s" % set_cmd) + status, output = CmdUtil.retryGetstatusoutput(set_cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % set_cmd + + " Error: \n%s " % output) + self.logger.debug("Successfully remove original repl infos with cmd:%s." + % set_cmd) + elif "iscrossregion=true" in output.lower(): + if dn_inst.hostname != self.local_host: + set_cmd = "source %s; pssh -H %s \"source %s ; gs_guc %s " \ + "-Z datanode -D %s -c \\\"replconninfo%s\\\"\"" + set_cmd = set_cmd % (self.mpp_file, dn_inst.hostname, + self.mpp_file, guc_mode, + dn_inst.datadir, idx) + else: + set_cmd = "source %s ; gs_guc %s -Z datanode -D %s -c " \ + "\"replconninfo%s\"" + set_cmd = set_cmd % (self.mpp_file, guc_mode, + dn_inst.datadir, idx) + self.logger.debug("Remove stream repl infos with cmd:%s" % set_cmd) + status, output = CmdUtil.retryGetstatusoutput(set_cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % set_cmd + + " Error: \n%s " % output) + self.logger.debug("Successfully remove stream repl infos with cmd:%s." + % set_cmd) + self.logger.debug("Successfully removed replconninfo for instance:%s" % dn_inst.instanceId) + + def remove_all_stream_repl_infos(self, guc_mode="set"): + """ + Remove retreaming disaster repl infos from all instances + """ + params = [] + dn_instances = [inst for node in self.cluster_info.dbNodes + for inst in node.datanodes] + cluster_conf = os.path.join(self.streaming_file_dir, + DoradoDisasterRecoveryConstants.DDR_CLUSTER_CONF_RECORD) + dn_num = DefaultValue.get_all_dn_num_for_dr(cluster_conf, dn_instances[0], + self.cluster_info, self.logger) + for inst in dn_instances: + if inst.instanceId not in self.normal_dn_ids: + self.logger.error("Ignore rectify repl info of dn:%s" % inst.instanceId) + continue + params.append((inst, guc_mode, dn_num)) + if params: + self.logger.log("Starting remove all node dn instances repl infos.") + parallelTool.parallelExecute(self.__remove_streaming_repl_info, params) + self.logger.log("Successfully remove all node dn instances repl infos.") + + def remove_streaming_cluster_file(self): + """ + function: remove the parameter file for config pg_hba + :return: NA + """ + self.logger.log("Start remove cluster file.") + cluster_info_file = os.path.join(self.streaming_file_dir, + DoradoDisasterRecoveryConstants.DDR_CLUSTER_CONF_RECORD) + cmd = g_file.SHELL_CMD_DICT["deleteFile"] % (cluster_info_file, cluster_info_file) + try: + self.ssh_tool.executeCommand(cmd, hostList=self.cluster_info.getClusterNodeNames()) + except Exception as error: + self.logger.debug( + "Failed to remove cluster file with error:%s" % error) + self.logger.log("Finished remove cluster file.") + + def remove_streaming_pg_hba(self, ignore_error=False): + """ + Remove remote ips from pg hba of streaming disaster + """ + self.logger.log("Start remove pg_hba config.") + remove_ips = [] + shards = self.params.remoteClusterConf["shards"] + for shard in shards: + for node_info in shard: + data_ip = node_info.get("dataIp") + remove_ips.append(data_ip) + remove_ips = list(set(remove_ips)) + host_names = self.get_all_connection_node_name("remove_streaming_pg_hba") + self.logger.debug("Remove ips:%s from pg_hba on nodes:%s" % ( + str(remove_ips), str(host_names))) + cmd = "%s -U '%s' -l '%s'" % (OMCommand.getLocalScript("Local_Config_Hba"), + self.user, self.log_file) + remove_ips_str = "" + for node_ip in 
remove_ips: + remove_ips_str += " --remove-ip %s" % node_ip + cmd += remove_ips_str + self.logger.debug("Command for updating pg_hba:%s." % cmd) + try: + self.ssh_tool.executeCommand(cmd, DefaultValue.SUCCESS, host_names) + except Exception as error: + self.logger.debug("Failed updating pg_hba with error:%s." % error) + if not ignore_error: + raise error + self.logger.log("Finished remove pg_hba config.") + + def streaming_drop_replication_slot(self, dn_inst, drop_slots): + """ + Delete dn_xxx_hadr on all dn nodes if dn_xxx_hadr exists when the disaster tolerance + relationship is lifted + """ + if not drop_slots: + self.logger.debug("WARNING:Not found dn_xxx_hadr on %s node, No need to " + "delete." % dn_inst.instanceId) + else: + for slot in drop_slots: + self.logger.debug("starting drop inst %s %s" % (dn_inst.instanceId, slot.strip())) + sql = "select * from pg_catalog.pg_drop_replication_slot('%s');" % slot.strip() + status_dr, output_dr = ClusterCommand.remoteSQLCommand( + sql, self.user, dn_inst.hostname, dn_inst.port, maintenance_mode=True) + self.logger.debug("get %s need drop replication_slots, status=%d, " + "output: %s." % (dn_inst.hostname, status_dr, + SensitiveMask.mask_pwd(output_dr))) + if status_dr != 0: + self.logger.debug("Failed to remove inst %s %s with error: %s" % ( + dn_inst.instanceId, slot.strip(), output_dr)) + self.logger.debug( + "Successfully drop node %s %s" % (dn_inst.instanceId, slot.strip())) + + def concurrent_drop_slot(self, dn_inst): + """ + concurrent drop all dn replication slots + """ + sql_check = "select * from pg_catalog.pg_get_replication_slots();" + self.logger.debug("Starting concurrent drop node %s instance [%s] replication slots" % + (dn_inst.hostname, dn_inst.instanceId)) + status, output = ClusterCommand.remoteSQLCommand( + sql_check, self.user, dn_inst.hostname, dn_inst.port, maintenance_mode=True) + self.logger.debug("get %s all replication slots, status=%d, output: %s." % + (dn_inst.instanceId, status, SensitiveMask.mask_pwd(output))) + if status == 0 and output.strip(): + drop_slots = [] + if str(dn_inst.instanceId).startswith("6"): + drop_slots = re.findall(r"dn_\d+_hadr", output.strip()) + if str(dn_inst.instanceId).startswith("5"): + drop_slots = re.findall(r"cn_\d+_\d+\.\d+\.\d+\.\d+_\d+", output.strip()) + self.logger.debug("Waiting to delete instance [%s] replication slots is: %s" % + (dn_inst.instanceId, drop_slots)) + self.streaming_drop_replication_slot(dn_inst, drop_slots) + else: + self.logger.debug("Obtain all replication slot results:%s." % output) + + def streaming_clean_replication_slot(self): + """ + Delete dn_xxx_hadr on all dn nodes if dn_xxx_hadr exists when the disaster tolerance + relationship is lifted + """ + self.logger.log("Starting drop all node replication slots") + params = [dn_inst for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes if dn_inst.instanceId in self.normal_dn_ids] + self.logger.debug("need drop all node replication slots: %s" % + [inst.instanceId for inst in params]) + parallelTool.parallelExecute(self.concurrent_drop_slot, params) + self.logger.log("Finished drop all node replication slots") + + def update_streaming_info(self, key, value, only_mode=None): + """ + Update info for streaming status + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Update query status [%s] to [%s] " + "not for mode:%s." % (key, value, self.params.mode)) + return + self.logger.debug("Update query [%s] to [%s]." 
% (key, value)) + try: + if key == "cluster": + key_stat = DoradoDisasterRecoveryConstants.HADR_CLUSTER_STAT + elif key == DoradoDisasterRecoveryConstants.ACTION_FAILOVER: + key_stat = DoradoDisasterRecoveryConstants.HADR_FAILOVER_STAT + elif key == DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER: + key_stat = DoradoDisasterRecoveryConstants.HADR_SWICHOVER_STAT + elif key == DoradoDisasterRecoveryConstants.ACTION_ESTABLISH: + key_stat = DoradoDisasterRecoveryConstants.HADR_ESTABLISH_STAT + else: + self.logger.debug("key error.") + return + file_path = os.path.realpath(os.path.join(self.streaming_file_dir, key_stat)) + with os.fdopen(os.open(file_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, + DefaultValue.KEY_FILE_MODE_IN_OS), "w") as fp_write: + fp_write.write(value) + host_names = self.get_all_connection_node_name( + action_flag="update_streaming_info", no_update=True) + self.ssh_tool.scpFiles(file_path, self.streaming_file_dir, host_names) + except Exception as error: + self.logger.debug("Failed write info, key:%s, value:%s, " + "error:%s." % (key, value, error)) + + def create_cluster_maintance_file(self, value): + """ + add cluster_maintance file for streaming failover and switchover disaster_standby + """ + self.logger.debug("Start create cluster_maintance file.") + try: + cluster_maintance_file = os.path.realpath(os.path.join(self.gauss_home, + "bin/cluster_maintance")) + with os.fdopen(os.open(cluster_maintance_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, + DefaultValue.KEY_FILE_MODE_IN_OS), "w") as fp_write: + fp_write.write(value) + host_names = self.get_all_connection_node_name("create_cluster_maintance_file") + self.ssh_tool.scpFiles(cluster_maintance_file, + os.path.join(self.gauss_home, "bin"), host_names) + except Exception as error: + self.logger.debug("WARNING: Failed create cluster_maintance file, value:%s, " + "error:%s." % (value, str(error))) + self.logger.debug("Successfully create cluster_maintance file.") + + def streaming_failover_single_inst(self, stream_disaster_step, action_flag=None): + """ + streaming disaster recovery failover for single_inst cluster + """ + self.create_cluster_maintance_file("streaming failover") + if action_flag != DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER: + self.update_streaming_info("cluster", "promote") + # 0. 
check cluster status and get normal instance list + if stream_disaster_step < 0: + if action_flag == DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER: + self.update_streaming_info(DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER, "10%") + else: + self.update_streaming_info(DoradoDisasterRecoveryConstants.ACTION_FAILOVER, "10%") + self.init_cluster_status() + self.parse_cluster_status() + self.write_streaming_step("0_check_cluster_status_done_for_failover") + # 1.Specify max xid and max ter to start etcd + max_term_record = os.path.join(self.streaming_file_dir, ".max_term_record") + if stream_disaster_step < 1: + max_term = self.get_term_info() + term_key = "/%s/CMServer/status_key/term" % self.user + para_dict = {term_key: max_term, self.backup_open_key: "0"} + ClusterInstanceConfig.set_data_on_dcc(self.cluster_info, + self.logger, self.user, para_dict) + DefaultValue.write_content_on_file(max_term_record, max_term) + self.write_streaming_step("1_start_etcd_done_for_failover") + self._failover_config_step(stream_disaster_step, action_flag) + self._failover_start_step(stream_disaster_step, action_flag, max_term_record) + + def _failover_start_step(self, stream_disaster_step, action_flag, max_term_record): + """ + Failover step 5 & 6 + """ + if stream_disaster_step < 5: + if action_flag == DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER: + self.update_streaming_info(DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER, "80%") + else: + self.update_streaming_info(DoradoDisasterRecoveryConstants.ACTION_FAILOVER, "80%") + if not os.path.isfile(max_term_record): + raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] % max_term_record) + _, dn_infos = self.get_specified_dn_infos() + max_term_list = DefaultValue.obtain_file_content(max_term_record) + if not max_term_list: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "read max term") + params = [(dn_info, max_term_list[0]) for dn_info in dn_infos] + if params: + parallelTool.parallelExecute(self.start_primary_dn, params) + self.write_streaming_step("5_start_primary_dn_done") + if stream_disaster_step < 6: + self.start_cluster() + cluster_normal_status = [DefaultValue.CLUSTER_STATUS_NORMAL, + DefaultValue.CLUSTER_STATUS_DEGRADED] + self.check_cluster_status(cluster_normal_status, check_current=True) + cluster_info = self.query_cluster_info() + self.parse_cluster_status(current_status=cluster_info) + if action_flag != DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER: + self.clean_global_config() + self.restore_guc_params() + self.streaming_clean_archive_slot() + if action_flag != DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER: + self.update_streaming_info(DoradoDisasterRecoveryConstants.ACTION_FAILOVER, "100%") + self.update_streaming_info("cluster", "normal") + else: + self.update_streaming_info("cluster", "archive") + + def streaming_clean_archive_slot(self): + """ + drop lot_type is physical and slot_name not contain (gs_roach_full,gs_roach_inc, + cn_xxx,dn_xxx, dn_xxx_hadr) on all cn node and all primary dn node if the + slot_name exists when the disaster cluster become primary cluster + """ + self.logger.debug("Starting drop archive slots") + params = [dn_inst for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] + self.logger.debug("need drop all node archive slots: %s" % + [inst.instanceId for inst in params]) + parallelTool.parallelExecute(self.parallel_drop_archive_slot, params) + self.logger.debug("Successfully drop all node archive slots") + + def 
parallel_drop_archive_slot(self, dn_inst): + """ + concurrent drop all primary dn and all cn archive slots + """ + sql_check = "select slot_name from pg_catalog.pg_get_replication_slots() " \ + "where slot_type='physical' and slot_name not in " \ + "('gs_roach_full', 'gs_roach_inc') and slot_name not like 'cn_%' and " \ + "slot_name not like 'dn_%';" + self.logger.debug("Starting concurrent drop node %s instance [%s] archive slots" % + (dn_inst.hostname, dn_inst.instanceId)) + (status, output) = ClusterCommand.remoteSQLCommand( + sql_check, self.user, dn_inst.hostname, dn_inst.port) + self.logger.debug("get %s all archive slots, status=%d, output: %s." % + (dn_inst.instanceId, status, output)) + if status == 0 and output.strip(): + archive_slots = output.strip().split('\n') + self.logger.debug("Waiting to delete instance [%s] archive slots is: %s" % + (dn_inst.instanceId, archive_slots)) + self.streaming_drop_replication_slot(dn_inst, archive_slots) + + def get_specified_dn_infos(self, update=False, dn_status="Primary"): + + """ + Get specified dn infos + """ + tmp_file = os.path.join(self.streaming_file_dir, "cluster_state_tmp") + if not os.path.isfile(tmp_file) or update: + cmd = ClusterCommand.getQueryStatusCmd(self.user, 0, tmp_file) + self.logger.debug("Update cluster state with cmd: %s" % cmd) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] + % "obtain primary dn infos" + "Error:%s" % output) + cluster_info = DbClusterStatus() + cluster_info.initFromFile(tmp_file) + dn_infos = [] + dn_ids = [] + dn_instances = [(inst, db_node.name) for db_node in cluster_info.dbNodes + for inst in db_node.datanodes] + for data_inst, db_node_name in dn_instances: + if data_inst.status == dn_status: + one_dn_info = dict() + one_dn_info["node_ip"] = db_node_name + one_dn_info["instance_id"] = data_inst.instanceId + one_dn_info["data_dir"] = data_inst.datadir + dn_ids.append(data_inst.instanceId) + dn_infos.append(one_dn_info) + self.logger.debug("Got primary dn infos: %s:%s" % (dn_ids, dn_infos)) + return dn_ids, dn_infos + + def start_primary_dn(self, params): + """ + Start main standby as primary dn in streaming failover. + """ + dn_info, max_term = params + opt_type = " -Z datanode" if not self.cluster_info.isSingleInstCluster() else "" + self.logger.debug("Starting primary dn %s, max term:%s." 
% + (dn_info["instance_id"], max_term)) + bin_path = "%s/bin" % self.cluster_info.appPath + instance_id = dn_info["instance_id"] + hostname = dn_info["node_ip"] + data_dir = dn_info["data_dir"] + if self.local_ip == hostname: + cmd_start = "source %s; %s/gs_ctl start%s -D %s -M pending -t 600" % \ + (self.mpp_file, bin_path, opt_type, data_dir) + else: + cmd_start = "source %s; pssh -s -t 900 -H %s \"source %s; " \ + "%s/gs_ctl start%s -D %s -M pending" \ + " -t 600\"" % (self.mpp_file, hostname, self.mpp_file, + bin_path, opt_type, data_dir) + self.logger.debug("Start primary dn with cmd:%s" % cmd_start) + status, output = CmdUtil.retryGetstatusoutput(cmd_start) + if status != 0: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] + % "start primary dn %s with error:%s" + % (instance_id, output)) + self.logger.debug("Successfully start primary dn %s" % instance_id) + if self.local_ip == hostname: + cmd_config = "source %s; %s/gs_ctl notify%s -D %s -M primary -T %s -t 600" \ + % (self.mpp_file, bin_path, opt_type, data_dir, max_term) + else: + cmd_config = "source %s; pssh -s -t 900 -H %s \"source %s; %s/gs_ctl notify%s -D %s " \ + "-M primary -T %s -t 600\"" % (self.mpp_file, self.mpp_file, hostname, + bin_path, opt_type, data_dir, max_term) + self.logger.debug("Config primary dn with cmd:%s" % cmd_config) + status, output = CmdUtil.retryGetstatusoutput(cmd_config) + if status != 0: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] + % "config primary dn %s with error:%s" + % (instance_id, output)) + self.logger.debug("Successfully start and config primary dn:%s" % instance_id) + + def stream_disaster_set_cmserver_guc(self, guc_parameter, guc_value, guc_type): + """ + set cmserver guc param + :param guc_parameter: guc param + :param guc_value: value + :param guc_type: init type + :return: NA + """ + self.logger.debug("Starting set cm server for streaming disaster.") + cmd = "source %s && gs_guc %s -Z cmserver -D 'cm_instance_data_path' -c \"%s=%s\" " \ + % (self.mpp_file, guc_type, guc_parameter, guc_value) + self.logger.debug("streaming disaster calling set cms, cmd=[%s]" % cmd) + self.ssh_tool.executeCommand(cmd, hostList=self.normal_cm_ips) + self.logger.debug("Successfully set cm server for streaming disaster.") + + def stream_disaster_set_cmagent_guc(self, guc_parameter, guc_value, guc_type): + """ + set cmagent guc param + :param guc_parameter: guc param + :param guc_value: value + :param guc_type: init type + :return: NA + """ + self.logger.debug("Starting set cm agent for streaming disaster.") + cmd = "source %s && gs_guc %s -Z cmagent -D 'cm_instance_data_path' -c \"%s=%s\" " \ + % (self.mpp_file, guc_type, guc_parameter, guc_value) + self.logger.debug("streaming disaster calling set cma, cmd=[%s]" % cmd) + self.ssh_tool.executeCommand(cmd, hostList=self.normal_node_list) + self.logger.debug("Successfully set cm agent for streaming disaster.") + + def _failover_config_step(self, stream_disaster_step, action_flag): + """ + Failover step 2 - 4 + """ + # 2.Stop the cluster by node + if stream_disaster_step < 2: + if action_flag != DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER: + self.streaming_clean_replication_slot() + self.update_streaming_info(DoradoDisasterRecoveryConstants.ACTION_FAILOVER, "30%") + self.stop_cluster_by_node() + self.write_streaming_step("2_stop_cluster_done_for_failover") + # 3.Start the cluster in the main cluster mode + if stream_disaster_step < 3: + self.set_cmserver_guc("backup_open", "0", "set") + 
self.stream_disaster_set_cmagent_guc("agent_backup_open", "0", "set") + self.write_streaming_step("3_set_backup_open_for_failover") + # 4.Delete the relevant guc parameters and remove the disaster tolerance relationship + # based on streaming disaster recovery cluster, No need to delete for switchover. + if not action_flag: + if stream_disaster_step < 4: + self.update_streaming_info(DoradoDisasterRecoveryConstants.ACTION_FAILOVER, "50%") + self.remove_all_stream_repl_infos() + self.remove_streaming_pg_hba(True) + self.update_streaming_info(DoradoDisasterRecoveryConstants.ACTION_FAILOVER, "70%") + self.write_streaming_step("4_remove_hba_repl_done_for_failover") + + def get_term_info(self): + """get_term_info""" + # get max term from dns + return self.get_term() + + def get_term(self, normal_dn=True): + """ + get etcd term + """ + max_term = 0 + sql_cmd = "select term from pg_last_xlog_replay_location();" + params_list = [(inst, sql_cmd, max_term, normal_dn) for db_node in + self.cluster_info.dbNodes for inst in db_node.datanodes] + if params_list: + term_list = parallelTool.parallelExecute(self.get_max_term_by_compare, params_list) + self.logger.debug("Get term list: %s." % term_list) + if not term_list: + max_term = 0 + else: + max_term = int(max(term_list)) + if int(max_term) == 0: + raise Exception("Failed get term") + max_term = int(max_term) + 100 + self.logger.debug("Get max term %s in dns" % max_term) + return max_term + + def streaming_switchover_roll_back_condition(self): + """ + check need rollback or not by Main Standby dn status + output: return True means need rollback + """ + self.logger.debug("Starting check switchover rollback condition.") + cluster_status = self.query_cluster_info(cm_check=True) + if not cluster_status: + raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] + % "query cluster status when check rollback condition") + + rollback_check_list = ["Main Standby Need repair(Disconnected)", + "Main Standby Need repair(Connecting)"] + need_rollback = False + for check_status in rollback_check_list: + if check_status in cluster_status: + need_rollback = True + self.logger.debug("Successfully check rollback condition: %s." % need_rollback) + self.logger.debug("Cluster status: %s." 
% cluster_status) + return need_rollback + + def get_max_term_by_compare(self, params): + """ + get max term by compare + """ + instance, sql_cmd, max_term, normal_dn = params + if (normal_dn is True and instance.instanceId in self.normal_dn_ids) or \ + (normal_dn is False and instance.instanceType == DefaultValue.MASTER_INSTANCE): + (status, output) = ClusterCommand.remoteSQLCommand( + sql_cmd, self.user, instance.hostname, instance.port, maintenance_mode=True) + if status != 0 or self.find_error(output): + raise Exception(ErrorCode.GAUSS_513["GAUSS_51300"] % + sql_cmd + "\nError: %s" % output) + self.logger.debug("TERM %s, Instance %s" % (output, instance.instanceId)) + term = output.strip() + if int(term) > int(max_term): + max_term = term + return int(max_term) + + def remove_cluster_maintance_file(self): + """ + function: remove the cluster_maintance file + :return: NA + """ + self.logger.debug("Start remove cluster_maintance file.") + cluster_maintance_file = os.path.realpath(os.path.join(self.gauss_home, + "bin/cluster_maintance")) + cmd = g_file.SHELL_CMD_DICT["deleteFile"] % (cluster_maintance_file, cluster_maintance_file) + host_names = self.get_all_connection_node_name("remove_cluster_maintance_file") + try: + self.ssh_tool.executeCommand(cmd, hostList=host_names) + except Exception as error: + self.logger.debug( + "Failed to remove cluster_maintance file with error: %s" % str(error)) + self.logger.debug("Successfully remove %s cluster_maintance file." % host_names) + + def get_node_sship_from_nodeid(self, node_id): + """ + get node sship from nodeid + :param node_id: node id + :return: + """ + for nodename in self.cluster_info.dbNodes: + if int(node_id) == int(nodename.id): + return nodename.sshIps[0] + + def delivery_file_to_other_node(self, path_name, file_name, node_list=None): + """delivery_file_to_other_node""" + send_file = "%s/%s" % (path_name, file_name) + send_file_bak = "%s/%s_bak" % (path_name, file_name) + if not os.path.isfile(send_file): + raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] % send_file) + + if node_list: + p_node_list = " -H ".join(node_list) + elif self.cluster_info.getClusterNodeNames(): + p_node_list = " -H ".join(self.cluster_info.getClusterNodeNames()) + else: + raise Exception("Failed to delivery file: %s, node information does not exits" + % file_name) + pscp_cmd = "cp %s %s && source %s && pscp -t 60 -H %s %s %s && rm -f %s" % \ + (send_file, send_file_bak, self.mpp_file, p_node_list, + send_file_bak, send_file, send_file_bak) + status, output = CmdUtil.retryGetstatusoutput(pscp_cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % pscp_cmd + " Error:\n%s" % output) + else: + self.logger.debug("Successfully send %s to all nodes" % send_file) + + @staticmethod + def find_error(output): + """ + error rule + :param output: error info + :return:bool + """ + error_msg_flag = "(ERROR|FATAL|PANIC)" + error_pattern = "^%s:.*" % error_msg_flag + pattern = re.compile(error_pattern) + for line in output.split("\n"): + line = line.strip() + result = pattern.match(line) + if result is not None: + return True + return False + + def set_stream_cluster_run_mode_guc(self, guc_mode, fail_over=False): + """ + function: set cluster run mode guc + :return: + """ + cluster_run_mode = "cluster_primary" if self.params.mode == "primary" \ + else "cluster_standby" + if fail_over: + cluster_run_mode = "cluster_primary" + guc_cmd = "source %s && gs_guc %s -Z datanode -N all -I all -c " \ + "\"stream_cluster_run_mode = '%s'\"" % \ + 
(self.mpp_file, guc_mode, cluster_run_mode) + host_names = self.cluster_info.getClusterNodeNames() + ignore_node = [node for node in host_names if node not in self.normal_node_list] + if ignore_node: + self.logger.debug( + "WARNING: cluster_run_mode for datanode ignore nodes:%s" % ignore_node) + nodes = ",".join(ignore_node) + guc_cmd = guc_cmd + " --ignore-node %s" % nodes + self.logger.debug("Set dn stream_cluster_run_mode with cmd:%s" % guc_cmd) + (status, output) = CmdUtil.retryGetstatusoutput(guc_cmd) + if status != 0: + self.logger.debug("Warning: Failed %s dn stream_cluster_run_mode=%s, output: %s" % + (guc_mode, cluster_run_mode, str(output))) + else: + self.logger.debug("Successfully %s streaming cluster run mode for " + "datanode param %s" % (guc_mode, cluster_run_mode)) + + guc_cmd_cn = "source %s && gs_guc %s -Z coordinator -N all -I all -c " \ + "\"stream_cluster_run_mode = '%s'\"" % \ + (self.mpp_file, guc_mode, cluster_run_mode) + if ignore_node: + self.logger.debug( + "WARNING: cluster_run_mode for coordinator ignore nodes:%s" % ignore_node) + nodes = ",".join(ignore_node) + guc_cmd_cn = guc_cmd_cn + " --ignore-node %s" % nodes + self.logger.debug("Set cn stream_cluster_run_mode with cmd:%s" % guc_cmd_cn) + (status, output) = CmdUtil.retryGetstatusoutput(guc_cmd_cn) + if status != 0: + self.logger.debug("Warning: Failed %s cn stream_cluster_run_mode=%s, output: %s" % + (guc_mode, cluster_run_mode, str(output))) + else: + self.logger.debug("Successfully %s streaming cluster run mode for " + "coordinator param %s" % (guc_mode, cluster_run_mode)) + + def set_data_in_dcc(self, key, value, only_mode=None): + """ + Set data in dcc + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("set [%s][%s] not for mode:%s." % (key, value, self.params.mode)) + return + self.logger.debug("Start set data: [%s][%s] in dcc." % (key, value)) + ClusterInstanceConfig.set_data_on_dcc(self.cluster_info, + self.logger, self.user, + {key: value}) + self.logger.log("Successfully set [%s][%s]." % (key, value)) + + def stop_cluster(self, action=None): + """ + stop the cluster + """ + self.logger.log("Stopping the cluster.") + static_config = "%s/bin/cluster_static_config" % self.cluster_info.appPath + cm_ctl_file = "%s/bin/cm_ctl" % self.cluster_info.appPath + if not os.path.isfile(static_config) or not os.path.isfile(cm_ctl_file): + raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] % + (static_config + " or " + cm_ctl_file)) + cmd = ClusterCommand.getStopCmd(0, "i", 1800) + if action: + cmd = ClusterCommand.getStopCmd(0, timeout=1800) + self.logger.debug("disaster cluster calling cm_ctl to stop cluster, cmd=[%s]" % cmd) + status, output = CmdUtil.retryGetstatusoutput(cmd, retry_time=0) + if status != 0: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51610"] % + ("the cluster" + " Error:\n%s." % output)) + self.logger.log("Successfully stopped the cluster.") diff --git a/script/impl/dorado_disaster_recovery/ddr_constants.py b/script/impl/dorado_disaster_recovery/ddr_constants.py new file mode 100644 index 00000000..6e185b35 --- /dev/null +++ b/script/impl/dorado_disaster_recovery/ddr_constants.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +############################################################################# +# Copyright (c) 2020 Huawei Technologies Co.,Ltd. +# +# openGauss is licensed under Mulan PSL v2. +# You can use this software according to the terms +# and conditions of the Mulan PSL v2. 
+# You may obtain a copy of Mulan PSL v2 at: +# +# http://license.coscl.org.cn/MulanPSL2 +# +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, +# WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. +# ---------------------------------------------------------------------------- +# Description : streaming_constants.py is utility for defining constants +# of streaming disaster recovery. +############################################################################# + + +class DoradoDisasterRecoveryConstants: + + # streaming files + DDR_LOG_FILE = "gs_ddr.log" + DDR_FILES_DIR = 'ddr_cabin' + DDR_CLUSTER_STATUS_TMP_FILE = "cluster_state_tmp" + WAL_KEEP_SEGMENTS = ".wal_keep_segments_record" + DDR_CLUSTER_CONF_RECORD = "cluster_conf_record" + GS_SECURE_FILES = "gs_secure_files" + HADR_KEY_CIPHER = "hadr.key.cipher" + HADR_KEY_RAND = "hadr.key.rand" + STREAM_SWITCHOVER_STATE = ".switchover_cluster_state" + MAX_TERM_RECORD = ".max_term_record" + PROCESS_LOCK_FILE = 'ddr_lock_' + STREAMING_CONFIG_XML = "ddr_config.xml" + GUC_BACKUP_FILE = ".ddr_guc_backup" + CLUSTER_USER_RECORD = ".cluster_user_record" + + ACTION_START = "start" + ACTION_SWITCHOVER = "switchover" + ACTION_FAILOVER = "failover" + + ACTION_ESTABLISH = "establish" + + # streaming query temp file + HADR_CLUSTER_STAT = ".hadr_cluster_stat" + HADR_FAILOVER_STAT = ".hadr_failover_stat" + HADR_SWICHOVER_STAT = ".hadr_switchover_stat" + HADR_ESTABLISH_STAT = ".hadr_establish_stat" + + STREAM_DISTRIBUTE_ACTION = "distribute_stream_failover" + + # GUC CHANGE MAP + GUC_CHANGE_MAP = {"most_available_sync": "on", "synchronous_commit": "on"} + + # params in json file for each module + STREAMING_JSON_PARAMS = { + "start": ["localClusterConf", "remoteClusterConf"], + "stop": ["localClusterConf", "remoteClusterConf"], + "switchover": [], + "failover": [] + } + + # step file of each module + DDR_STEP_FILES = { + "start_primary": ".ddr_start_primary.step", + "start_standby": ".ddr_start_standby.step", + "stop": ".ddr_stop.step", + "switchover_primary": ".ddr_switchover_primary.step", + "switchover_standby": ".ddr_switchover_standby.step", + "failover": ".ddr_failover.step", + "query": ".ddr_query.step", + } + # task need check process is exist + TASK_EXIST_CHECK = ["start", "stop", "switchover", "failover"] + + # default values + MAX_WAL_KEEP_SEGMENTS = 16384 + MAX_REPLICATION_NUMS = 8 + MAX_BUILD_TIMEOUT = 1209600 + STANDBY_START_TIMEOUT = 3600 * 24 * 7 + CHECK_PROCESS_WAIT_TIME = 3 + + # backup open key + BACKUP_OPEN = "/%s/CMServer/backup_open" + + # log remark + LOG_REMARK = "-" * 80 diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/__init__.py b/script/impl/dorado_disaster_recovery/ddr_modules/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py new file mode 100644 index 00000000..ee341be5 --- /dev/null +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py @@ -0,0 +1,246 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +############################################################################# +# Copyright (c) 2020 Huawei Technologies Co.,Ltd. +# +# openGauss is licensed under Mulan PSL v2. 
+# You can use this software according to the terms
+# and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+#
+#          http://license.coscl.org.cn/MulanPSL2
+#
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS,
+# WITHOUT WARRANTIES OF ANY KIND,
+# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+# See the Mulan PSL v2 for more details.
+# ----------------------------------------------------------------------------
+# Description : dorado_diaster_recovery_start.py is a utility for creating the
+#               disaster recovery relationship between the primary cluster and
+#               the standby cluster.
+
+import os
+
+from base_utils.security.sensitive_mask import SensitiveMask
+from gspylib.common.ErrorCode import ErrorCode
+from gspylib.common.Common import DefaultValue, ClusterCommand
+from impl.dorado_disaster_recovery.ddr_base import DoradoDisasterRecoveryBase
+from impl.dorado_disaster_recovery.ddr_constants import DoradoDisasterRecoveryConstants
+
+
+class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def _first_step_for_ddr_start(self, step):
+        """
+        First step for ddr start
+        """
+        if step >= 2:
+            return
+        self.logger.debug("Start first step of DisasterRecovery start.")
+        # Create the temporary directory used during the disaster recovery process
+        self.create_disaster_recovery_dir(self.streaming_file_dir)
+        # Check the flag file of this execution
+        self.check_action_and_mode()
+        self.init_cluster_status()
+
+    def _second_step_for_ddr_start(self, step):
+        """
+        Second step for ddr start
+        """
+        if step >= 2:
+            return
+        self.logger.debug("Start second step of ddr start.")
+        self.check_cluster_status(status_allowed=['Normal'])
+        self.check_cluster_is_common()
+        cm_exist = DefaultValue.check_is_cm_cluster(self.logger)
+        if not cm_exist:
+            self.logger.logExit(ErrorCode.GAUSS_516["GAUSS_51632"] %
+                                "check cm_ctl is available for current cluster")
+        self.check_is_under_upgrade()
+        # Check the GUC parameters of the datanodes
+        #self.check_dn_instance_params()
+        self.write_streaming_step("2_check_cluster_step")
+
+    def _third_step_for_ddr_start(self, step):
+        """
+        Third step for ddr start
+        """
+        if step >= 3:
+            return
+        self.logger.debug("Start third step of streaming start.")
+        #self.drop_replication_slot_on_dr_cluster(only_mode="disaster_standby")
+        #self.prepare_gs_secure_files(only_mode='primary')
+        #self.build_and_distribute_key_files(only_mode='disaster_standby')
+        #self.get_default_wal_keep_segments(only_mode='primary')
+        self.write_streaming_step("3_set_wal_segments_step")
+
+    def drop_replication_slot_on_dr_cluster(self, only_mode=None):
+        """
+        Drop replication slot on dr cluster
+        """
+        if only_mode and self.params.mode != only_mode:
+            self.logger.debug("Drop replication slot opts not for mode:%s." % self.params.mode)
+            return
+        sql_check = "select slot_name from pg_get_replication_slots() where slot_type='logical'"
+        primary_dns = DefaultValue.get_primary_dn_instance_id("Primary", ignore=True)
+        if not primary_dns:
+            return
+        primary_insts = [inst for node in self.cluster_info.dbNodes
+                         for inst in node.datanodes if str(inst.instanceId) in primary_dns]
+        dn_inst = primary_insts[0]
+        self.logger.debug("Start drop node %s [%s] slots" % (dn_inst.hostname, dn_inst.instanceId))
+        status, output = ClusterCommand.remoteSQLCommand(
+            sql_check, self.user, dn_inst.hostname, dn_inst.port)
+        self.logger.debug("Get %s all replication slots, status=%d, output: %s."
% + (dn_inst.instanceId, status, SensitiveMask.mask_pwd(output))) + if status == 0 and output.strip(): + drop_slots = output.strip().split('\n') + for slot in drop_slots: + self.logger.debug("Starting drop node %s %s" % (dn_inst.instanceId, slot.strip())) + sql = "select * from pg_drop_replication_slot('%s');" % slot.strip() + status_dr, output_dr = ClusterCommand.remoteSQLCommand( + sql, self.user, dn_inst.hostname, dn_inst.port) + if status_dr != 0: + self.logger.debug("Failed to remove node %s %s with error: %s" % ( + dn_inst.hostname, slot.strip(), SensitiveMask.mask_pwd(output_dr))) + self.logger.debug( + "Successfully drop node %s %s" % (dn_inst.instanceId, slot.strip())) + + def _fourth_step_for_ddr_start(self, step): + """ + Fourth step for streaming start + """ + if step >= 4: + return + self.logger.debug("Start fourth step of streaming start.") + self.set_wal_keep_segments( + "reload", DoradoDisasterRecoveryConstants.MAX_WAL_KEEP_SEGMENTS, only_mode='primary') + self.write_streaming_step("4_set_wal_segments_step") + + def _fifth_step_for_ddr_start(self, step): + """ + Fifth step for streaming start + """ + if step >= 5: + return + self.logger.debug("Start fifth step of streaming start.") + self.set_data_in_dcc(self.backup_open_key, "0", only_mode='primary') + self.set_data_in_dcc(self.backup_open_key, "1", only_mode='disaster_standby') + #self.set_most_available(mode="reload", raise_error=False) + self.stop_cluster_by_node(only_mode='disaster_standby') + self.write_streaming_step("5_set_wal_segments_step") + + def common_step_for_ddr_start(self): + """ + Common step for ddr start between step 1 and 2 + """ + self.logger.debug("Start common config step of ddr start.") + self.distribute_cluster_conf() + self.update_streaming_pg_hba() + self.config_cross_cluster_repl_info() + + def _sixth_step_for_ddr_start(self, step): + """ + Sixth step for streaming start + """ + if step >= 6: + return + self.logger.debug("Start sixth step of streaming start.") + self.set_cmserver_guc("backup_open", "1", "set", only_mode='disaster_standby') + self.set_cmagent_guc("agent_backup_open", "1", "set", only_mode='disaster_standby') + self.write_streaming_step("6_set_guc_step") + + def _seventh_step_for_ddr_start(self, step): + """ + Seventh step for streaming start + """ + if step >= 7: + return + self.logger.debug("Start seventh step of streaming start.") + self.update_streaming_info("cluster", "restore", only_mode='disaster_standby') + try: + self.start_dss_instance(only_mode='disaster_standby') + self.build_dn_instance(only_mode='disaster_standby') + self.kill_dss_instance(only_mode='disaster_standby') + except Exception as error: + self.update_streaming_info("cluster", "restore_fail", only_mode='disaster_standby') + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "build dns" + "Error:%s" % error) + self.write_streaming_step("7_build_dn_instance_step") + + def _eighth_step_for_ddr_start(self, step): + """ + Eighth step for streaming start + """ + if step >= 8: + return + self.logger.debug("Start eighth step of streaming start.") + self.start_cluster(cm_timeout=DoradoDisasterRecoveryConstants.STANDBY_START_TIMEOUT, + only_mode='disaster_standby') + self.update_streaming_info("cluster", "full_backup", only_mode='primary') + try: + self.wait_main_standby_connection(only_mode='primary') + except Exception as error: + self.update_streaming_info("cluster", "backup_fail", only_mode='primary') + raise Exception(str(error)) + ret = self.check_cluster_status(status_allowed=['Normal'], + only_check=True, 
                                        check_current=True)
+        query_status = "recovery" if ret else "recovery_fail"
+        self.update_streaming_info("cluster", query_status, only_mode='disaster_standby')
+        self.update_streaming_info("cluster", "archive", only_mode='primary')
+        self.write_streaming_step("8_start_cluster_step")
+
+    def _ninth_step_for_ddr_start(self, step):
+        """
+        Ninth step for ddr start
+        """
+        if step >= 9:
+            return
+        self.logger.debug("Start ninth step of streaming start.")
+        #self.restore_wal_keep_segments(only_mode='primary')
+        self.clean_gs_secure_dir()
+        self.clean_step_file()
+
+    def _check_and_refresh_disaster_user_permission(self):
+        """check and refresh disaster user permission"""
+        if self.params.mode != "primary":
+            return
+        self.check_hadr_user(only_mode='primary')
+        self.check_hadr_pwd(only_mode='primary')
+        self.logger.debug("Encrypt hadr user info to database not "
+                          "for mode:%s." % self.params.mode)
+        hadr_cipher_path = os.path.join(self.bin_path, "hadr.key.cipher")
+        hadr_rand_path = os.path.join(self.bin_path, "hadr.key.rand")
+        if not os.path.isfile(hadr_cipher_path) or not os.path.isfile(hadr_rand_path):
+            self.hadr_key_generator('hadr')
+        user_info = DefaultValue.obtain_hadr_user_encrypt_str(self.cluster_info, self.user,
+                                                              self.logger, False, True)
+        if user_info:
+            self.clean_global_config()
+        pass_str = self.encrypt_hadr_user_info(
+            'hadr', self.params.hadrUserName, self.params.hadrUserPassword)
+        self.keep_hadr_user_info(pass_str)
+
+    def run(self):
+        self.logger.log("Start creating the dorado storage disaster recovery relationship.")
+        step = self.query_streaming_step()
+        self._first_step_for_ddr_start(step)
+        # 1. Check that the cluster status is normal
+        self.parse_cluster_status()
+        # Dorado storage replication does not use a streaming replication user
+        #self._check_and_refresh_disaster_user_permission()
+        self._second_step_for_ddr_start(step)
+        # Update pg_hba and the cross-cluster replication info
+        self.common_step_for_ddr_start()
+        self._third_step_for_ddr_start(step)
+        self._fourth_step_for_ddr_start(step)
+        self._fifth_step_for_ddr_start(step)
+        # Set the CM backup_open parameter: backup_open=1 on the disaster standby cluster, backup_open=0 on the primary cluster
+        self._sixth_step_for_ddr_start(step)
+        # Start DSS and build the main standby
+        self._seventh_step_for_ddr_start(step)
+        self._eighth_step_for_ddr_start(step)
+        self._ninth_step_for_ddr_start(step)
+        self.logger.log("Successfully completed dorado disaster recovery start.")
+
\ No newline at end of file
diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_failover.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_failover.py
new file mode 100644
index 00000000..77bdacc4
--- /dev/null
+++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_failover.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+#############################################################################
+# Copyright (c) 2020 Huawei Technologies Co.,Ltd.
+#
+# openGauss is licensed under Mulan PSL v2.
+# You can use this software according to the terms
+# and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+#
+#          http://license.coscl.org.cn/MulanPSL2
+#
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS,
+# WITHOUT WARRANTIES OF ANY KIND,
+# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+# See the Mulan PSL v2 for more details.
+# ----------------------------------------------------------------------------
+# Description : dorado_disaster_recovery_failover.py is a utility for failing
+#               the standby cluster over so that it becomes the primary cluster.
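+#               The handler checks that it is running on a disaster_standby
+#               cluster and that no upgrade is in progress, promotes the
+#               cluster through streaming_failover_single_inst(), records the
+#               result for later query, and finally removes the maintenance
+#               file and the ddr working directory.
+#############################################################################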
+ + +from gspylib.common.Common import DefaultValue +from gspylib.common.ErrorCode import ErrorCode +from impl.dorado_disaster_recovery.ddr_base import DoradoDisasterRecoveryBase + + +class DisasterRecoveryFailoverHandler(DoradoDisasterRecoveryBase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def run(self): + self.logger.log("Start streaming disaster failover.") + self.check_action_and_mode() + step = self.check_streaming_failover_workable(check_type_step=3, check_status_step=0) + self.check_is_under_upgrade() + self.init_cluster_conf() + try: + self.streaming_failover_single_inst(step) + self.update_streaming_info("cluster", "normal") + self.clean_step_file() + except Exception as error: + self.update_streaming_info("cluster", "promote_fail") + raise Exception( + ErrorCode.GAUSS_516["GAUSS_51632"] % "centralize failover" + "Error:%s" % error) + finally: + self.remove_cluster_maintance_file() + self.clean_streaming_dir() + self.logger.log("Successfully do streaming disaster recovery failover.") + + def check_streaming_failover_workable(self, check_type_step=0, check_status_step=0): + """ + Check streaming failover is workable. + """ + self.logger.debug("Streaming disaster distribute cluster failover...") + stream_disaster_step = self.query_streaming_step() + if not DefaultValue.is_disaster_cluster(self.cluster_info) \ + and stream_disaster_step < check_type_step: + self.logger.debug("The primary dn exist, do nothing except record the result file.") + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % + "streaming disaster cluster failover, Because the primary cluster " + "does not support failover") + cluster_normal_status = [DefaultValue.CLUSTER_STATUS_NORMAL, + DefaultValue.CLUSTER_STATUS_DEGRADED] + if stream_disaster_step < check_status_step: + self.init_cluster_status() + self.parse_cluster_status() + if stream_disaster_step < check_status_step: + self.check_cluster_status(cluster_normal_status) + return stream_disaster_step diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_query.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_query.py new file mode 100644 index 00000000..dc7ffea3 --- /dev/null +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_query.py @@ -0,0 +1,168 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +############################################################################# +# Copyright (c) 2020 Huawei Technologies Co.,Ltd. +# +# openGauss is licensed under Mulan PSL v2. +# You can use this software according to the terms +# and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# +# http://license.coscl.org.cn/MulanPSL2 +# +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, +# WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. +# ---------------------------------------------------------------------------- +# Description : streaming_disaster_recovery_query.py is utility for +# query streaming disaster recovery condition. 
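+#               The query handler is read-only: it derives the overall cluster
+#               state from the status files kept in the ddr working directory,
+#               re-checks archive/recovery progress against the primary
+#               datanodes, and reports the maximum RPO/RTO taken from
+#               dbe_perf.global_streaming_hadr_rto_and_rpo_stat.
+#############################################################################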
+ +import os + +from base_utils.security.sensitive_mask import SensitiveMask +from impl.dorado_disaster_recovery.ddr_constants import DoradoDisasterRecoveryConstants +from gspylib.common.Common import ClusterCommand +from impl.dorado_disaster_recovery.ddr_base import DoradoDisasterRecoveryBase + + +class StreamingQueryHandler(DoradoDisasterRecoveryBase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_streaming_cluster_query_value(self, file_name): + """ + Query infos from files. + """ + file_path = os.path.realpath(os.path.join(self.streaming_file_dir, file_name)) + if not os.path.isfile(file_path) and file_name in [DoradoDisasterRecoveryConstants.HADR_CLUSTER_STAT]: + return "normal" + if not os.path.isfile(file_path): + return "0%" + with open(file_path, 'r') as read_file: + value = read_file.read().strip() + return value + + def check_archive(self, former_status, cluster_status): + """ + Check for archive. + """ + self.logger.log("Start check archive.") + if former_status.strip() not in ["archive", "archive_fail"]: + self.logger.debug("Ignore for status:%s" % former_status) + return + archive_status = "archive_fail" + if cluster_status.lower() not in ["normal", "degraded"]: + self.logger.debug("Cluster status:%s,archive fail." % cluster_status) + return archive_status + if self.main_standby_ids or (not self.primary_dn_ids): + self.logger.debug("Ignore update archive for disaster_standby cluster.") + return archive_status + sql_check = "select 1 from pg_catalog.pg_stat_get_wal_senders() where sync_state" \ + "='Async' and peer_role='Standby' and peer_state='Normal';" + dn_instances = [inst for node in self.cluster_info.dbNodes for inst in node.datanodes + if inst.instanceId in self.primary_dn_ids] + self.logger.debug("Check archive with cmd:%s." % sql_check) + if dn_instances: + status, output = ClusterCommand.remoteSQLCommand( + sql_check, self.user, dn_instances[0].hostname, + dn_instances[0].port) + if status == 0 and output and output.strip(): + archive_status = "archive" + self.logger.debug("Successfully check archive, results:%s." % + SensitiveMask.mask_pwd(output)) + return archive_status + elif status == 0 and not output.strip(): + self.logger.debug("Check archive fail.") + return archive_status + else: + self.logger.debug("Check archive status:%s, output:%s." + % (status, output)) + self.logger.debug("Check archive result:%s." % archive_status) + return archive_status + + def check_recovery(self, former_status, cluster_status="normal"): + """ + Check for recovery. + """ + self.logger.log("Start check recovery.") + if former_status.strip() not in ["recovery", "recovery_fail"]: + self.logger.debug("Ignore for check recovery status:%s" % former_status) + return + recovery_status = "recovery_fail" + if cluster_status.lower() not in ["normal", "degraded"]: + self.logger.debug("Cluster status:%s,recovery fail." % cluster_status) + return recovery_status + if self.primary_dn_ids or (not self.main_standby_ids): + self.logger.debug("Ignore update recovery for primary cluster.") + return recovery_status + return "recovery" + + def get_max_rpo_rto(self): + """ + Get max rpo and rto. 
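+        Returns a (max_rpo, max_rto) tuple of strings; both values are empty
+        strings when no primary datanode is found or the query output cannot
+        be parsed.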
+ """ + self.logger.log("Start check RPO & RTO.") + rpo_sql = "SELECT current_rpo FROM dbe_perf.global_streaming_hadr_rto_and_rpo_stat;" + rto_sql = "SELECT current_rto FROM dbe_perf.global_streaming_hadr_rto_and_rpo_stat;" + rto_rpo_sql = rpo_sql + rto_sql + if not self.primary_dn_ids: + self.logger.debug("Not found primary dn in cluster, cluster status:%s, " + "main standby:%s." % (self.cluster_status, self.main_standby_ids)) + return "", "" + log_info = "Execute sql [%s] on node [%s: %s] with result:%s" + dn_instances = [inst for node in self.cluster_info.dbNodes for inst in node.datanodes + if inst.instanceId in self.primary_dn_ids] + if dn_instances: + status, output = ClusterCommand.remoteSQLCommand( + rto_rpo_sql, self.user, dn_instances[0].hostname, dn_instances[0].port) + if status == 0 and output: + try: + rets = output.strip().split('\n') + length = len(rets) // 2 + rpo_list = [int(i) for i in rets[:length]] + rto_list = [int(j) for j in rets[length:]] + max_rpo, max_rto = str(max(rpo_list)), str(max(rto_list)) + except ValueError: + return "", "" + self.logger.debug("Successfully get max rpo:%s, rto:%s, output:%s" + % (max_rpo, max_rto, ','.join(output.split('\n')))) + return max_rpo, max_rto + else: + self.logger.debug(log_info % (rto_rpo_sql, dn_instances[0].hostname, + dn_instances[0].port, ','.join(output.split('\n')))) + return "", "" + + def run(self): + self.logger.log("Start streaming disaster query.") + cluster_info = self.query_cluster_info() + if cluster_info: + self.parse_cluster_status(current_status=cluster_info) + self.check_is_under_upgrade() + check_cluster_stat = self.get_streaming_cluster_query_value( + DoradoDisasterRecoveryConstants.HADR_CLUSTER_STAT) + archive_status = self.check_archive(check_cluster_stat, self.cluster_status) + recovery_status = self.check_recovery(check_cluster_stat, self.cluster_status) + hadr_cluster_stat = archive_status or recovery_status or check_cluster_stat + + hadr_failover_stat = self.get_streaming_cluster_query_value( + DoradoDisasterRecoveryConstants.HADR_FAILOVER_STAT) + hadr_switchover_stat = self.get_streaming_cluster_query_value( + DoradoDisasterRecoveryConstants.HADR_SWICHOVER_STAT) + if hadr_cluster_stat != "promote": + hadr_failover_stat = "" + if hadr_cluster_stat != "switchover": + hadr_switchover_stat = "" + + self.logger.debug("Start check max rpo and rto.") + max_rpo, max_rto = self.get_max_rpo_rto() + self.logger.debug("Finished check max rpo and rto.") + values = dict() + values["hadr_cluster_stat"] = hadr_cluster_stat + values["hadr_failover_stat"] = hadr_failover_stat + values["hadr_switchover_stat"] = hadr_switchover_stat + values["RPO"] = max_rpo + values["RTO"] = max_rto + self.logger.log("Successfully executed streaming disaster " + "recovery query, result:\n%s" % values) diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_stop.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_stop.py new file mode 100644 index 00000000..abe08902 --- /dev/null +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_stop.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +############################################################################# +# Copyright (c) 2020 Huawei Technologies Co.,Ltd. +# +# openGauss is licensed under Mulan PSL v2. +# You can use this software according to the terms +# and conditions of the Mulan PSL v2. 
+# You may obtain a copy of Mulan PSL v2 at: +# +# http://license.coscl.org.cn/MulanPSL2 +# +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, +# WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. +# ---------------------------------------------------------------------------- +# Description : streaming_disaster_recovery_stop.py is a utility for stopping +# streaming disaster recovery on primary cluster. + +from impl.dorado_disaster_recovery.ddr_base import DoradoDisasterRecoveryBase + + +class DisasterRecoveryStopHandler(DoradoDisasterRecoveryBase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def _first_step_for_streaming_stop(self, step): + """ + First step for streaming stop + """ + if step >= 2: + return + self.logger.debug("Start first step of streaming stop.") + self.init_cluster_status() + self.check_action_and_mode() + + def _second_step_for_streaming_stop(self, step): + """ + Second step for streaming stop + """ + if step >= 2: + return + self.logger.debug("Start second step of streaming start.") + self.check_cluster_status(status_allowed=['Normal']) + self.check_cluster_type(allowed_type='primary') + self.check_is_under_upgrade() + self.write_streaming_step("2_check_cluster_step") + + def _third_step_for_streaming_stop(self, step): + """ + Third step for streaming stop + """ + if step >= 3: + return + self.logger.debug("Start third step of streaming stop.") + self.remove_all_stream_repl_infos(guc_mode="reload") + self.remove_streaming_cluster_file() + self.write_streaming_step("3_remove_config_step") + + def _fourth_step_for_streaming_stop(self, step): + """ + Fourth step for streaming stop + """ + if step >= 4: + return + self.logger.debug("Start fourth step of streaming stop.") + self.remove_streaming_pg_hba() + self.restore_guc_params() + self.write_streaming_step("4_remove_pg_hba_step") + + def _fifth_step_for_streaming_stop(self, step): + """ + Fifth step for streaming stop + """ + if step >= 5: + return + self.logger.debug("Start fifth step of streaming start.") + self.streaming_clean_replication_slot() + self.write_streaming_step("5_update_config_step") + + def _sixth_step_for_streaming_stop(self, step): + """ + Sixth step for streaming stop + """ + if step >= 6: + return + self.logger.debug("Start sixth step of streaming stop.") + self.check_cluster_status(['Normal']) + self.clean_global_config() + self.update_streaming_info("cluster", "normal") + self.clean_streaming_dir() + + def run(self): + self.logger.log("Start remove streaming disaster relationship.") + step = self.query_streaming_step() + self._first_step_for_streaming_stop(step) + self.parse_cluster_status() + self._second_step_for_streaming_stop(step) + self._third_step_for_streaming_stop(step) + self._fourth_step_for_streaming_stop(step) + self._fifth_step_for_streaming_stop(step) + self._sixth_step_for_streaming_stop(step) + self.logger.log("Successfully do streaming disaster recovery stop.") diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_switchover.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_switchover.py new file mode 100644 index 00000000..2763ae77 --- /dev/null +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_switchover.py @@ -0,0 +1,476 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- 
+############################################################################# +# Copyright (c) 2020 Huawei Technologies Co.,Ltd. +# +# openGauss is licensed under Mulan PSL v2. +# You can use this software according to the terms +# and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# +# http://license.coscl.org.cn/MulanPSL2 +# +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, +# WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. +# ---------------------------------------------------------------------------- +# Description : streaming_disaster_recovery_switchover.py is a utility for +# changing role between primary cluster and standby cluster. + +import os +import time +from datetime import datetime, timedelta + +from base_utils.os.cmd_util import CmdUtil +from base_utils.os.env_util import EnvUtil +from gspylib.common.Common import DefaultValue, ClusterCommand, ClusterInstanceConfig +from gspylib.common.DbClusterStatus import DbClusterStatus +from gspylib.common.ErrorCode import ErrorCode +from gspylib.threads.parallelTool import parallelTool +from impl.dorado_disaster_recovery.ddr_base import DoradoDisasterRecoveryBase +from impl.dorado_disaster_recovery.ddr_constants import DoradoDisasterRecoveryConstants + + +class DisasterRecoverySwitchoverHandler(DoradoDisasterRecoveryBase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def run(self): + """ + streaming disaster recovery switchover + """ + self.logger.log("Start streaming disaster switchover.") + self.check_action_and_mode() + self.check_switchover_workable() + self.init_cluster_conf() + self.check_dn_instance_params() + self.check_is_under_upgrade() + try: + self.streaming_switchover_single_inst() + self.clean_step_file() + except Exception as error: + if self.params.mode == "primary": + self.update_streaming_info("cluster", "promote_fail") + raise Exception( + ErrorCode.GAUSS_516["GAUSS_51632"] % "switchover" + "Error:%s" % str(error)) + finally: + self.remove_cluster_maintance_file_for_switchover() + self.remove_cluster_maintance_file() + self.logger.log("Successfully do streaming disaster recovery switchover.") + + def streaming_switchover_single_inst(self): + """ + streaming disaster recovery switchover for single_inst cluster + disaster_standby: expect primary cluster becomes standby + primary: expect standby cluster becomes primary + """ + self.create_cluster_maintance_file("streaming switchover") + self.update_streaming_info("cluster", DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER) + stream_disaster_step = self.query_streaming_step() + if self.params.mode == "primary": + end_time = datetime.now() + timedelta(seconds=self.params.waitingTimeout) + self.logger.log("Waiting for switchover barrier.") + while True: + switchover_barrier_list = self.check_streaming_disaster_switchover_barrier() + if len(switchover_barrier_list) == len(self.normal_dn_ids): + break + if datetime.now() >= end_time: + self.restart_cluster() + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % + "check switchover_barrier on all main standby dn" + + " Because check timeout: %ss" % + str(self.params.waitingTimeout)) + time.sleep(5) + self.streaming_failover_single_inst(stream_disaster_step, + DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER) + else: + self.add_cluster_maintance_file_for_switchover() + try: + if stream_disaster_step < 1: + 
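+                    # Step 1: restart the current primary cluster and tell it to
+                    # enter switchover; log truncation is confirmed inside
+                    # streaming_disaster_set_master_cluster_in_switchover().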
self.update_streaming_info(DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER, "10%") + self.stop_cluster() + self.start_cluster() + self.streaming_disaster_set_master_cluster_in_switchover() + self.write_streaming_step("1_streaming_disaster_set_master_in_switchover") + if stream_disaster_step < 2: + self.update_streaming_info(DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER, "30%") + ClusterInstanceConfig.set_data_on_dcc(self.cluster_info, + self.logger, self.user, + {self.backup_open_key: "2"}) + self.stop_cluster() + self.write_streaming_step("2_stop_cluster_for_switchover") + if stream_disaster_step < 3: + self.set_cmserver_guc("backup_open", "2", "set") + self.set_cmagent_guc("agent_backup_open", "2", "set") + self.write_streaming_step("3_set_backup_open_2_done") + if stream_disaster_step < 4: + self.update_streaming_info(DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER, "50%") + self.remove_cluster_maintance_file_for_switchover() + self.remove_cluster_maintance_file() + self.start_cluster() + self.write_streaming_step("4_start_cluster_done") + if stream_disaster_step < 5: + self.wait_for_normal(timeout=self.params.waitingTimeout, + streaming_switchover="streaming_switchover") + self.streaming_clean_replication_slot() + self.update_streaming_info("cluster", "recovery") + except Exception as error: + self.logger.error("Failed to do streaming disaster cluster switchover, Error:" + " \n%s" % str(error)) + rollback_step = self.query_streaming_step() + self.logger.debug("Roll back switchover step:%s" % rollback_step) + self.remove_cluster_maintance_file_for_switchover() + self.remove_cluster_maintance_file() + if rollback_step < 4 or (rollback_step >= 4 and + self.streaming_switchover_roll_back_condition()): + self.streaming_switchover_roll_back(update_query=True) + self.clean_step_file() + raise Exception(error) + self.remove_hadr_switchover_process_file() + + def remove_hadr_switchover_process_file(self): + self.logger.debug("Remove hadr switchover process file for switchover.") + process_file = os.path.realpath(os.path.join(self.streaming_file_dir, + ".hadr_switchover_stat")) + cmd = "if [ -f {0} ]; then rm -rf {0}; fi".format(process_file) + self.ssh_tool.executeCommand(cmd, hostList=self.connected_nodes) + self.logger.debug("Successfully remove switchover process on all connected nodes.") + + @staticmethod + def clean_file_on_node(params): + """ + clean file on dest node with path + """ + dest_ip, dest_path, timeout = params + cmd = "source %s && pssh -s -t %s -H %s 'if [ -f %s ]; then rm -f %s; fi'" % ( + EnvUtil.getMpprcFile(), timeout, dest_ip, dest_path, dest_path) + status, output = CmdUtil.getstatusoutput_by_fast_popen(cmd) + return status, output, dest_ip + + def restart_cluster(self, restart_timeout=DefaultValue.TIMEOUT_CLUSTER_START): + """ + Restart cluster + """ + self.logger.log("Restart cluster.") + static_config = "%s/bin/cluster_static_config" % self.bin_path + cm_ctl_file = "%s/bin/cm_ctl" % self.bin_path + if not os.path.isfile(static_config): + self.logger.debug("Checked file %s lost." % static_config) + if not os.path.isfile(cm_ctl_file): + self.logger.debug("Checked file %s lost." % cm_ctl_file) + stop_cmd = ClusterCommand.getStopCmd(0, timeout=restart_timeout) + status, output = CmdUtil.retryGetstatusoutput(stop_cmd, retry_time=0) + self.logger.debug("Stop cluster result:[%s][%s]." 
% (status, output)) + start_cmd = ClusterCommand.getStartCmd(0, timeout=restart_timeout) + status, output = CmdUtil.retryGetstatusoutput(start_cmd, retry_time=0) + self.logger.debug("Start cluster result:[%s][%s]." % (status, output)) + + def remove_cluster_maintance_file_for_switchover(self): + """ + function: remove the cluster_maintance file + :return: NA + """ + self.logger.debug("Remove cluster_maintance file for switchover.") + cluster_maintance_file = os.path.realpath(os.path.join(self.gauss_home, + "bin/cluster_maintance")) + host_names = \ + self.get_all_connection_node_name("remove_cluster_maintance_file_for_switchover") + try: + pscp_params = [] + all_instances = [dn_inst for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes] + if not self.cluster_info.isSingleInstCluster(): + all_instances.extend([dn_inst for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.coordinators]) + for dn_inst in all_instances: + if dn_inst.hostname in host_names: + pscp_params.append([dn_inst.hostname, os.path.join( + dn_inst.datadir, os.path.basename(cluster_maintance_file)), 10]) + if len(pscp_params) > 0: + results = parallelTool.parallelExecute(self.clean_file_on_node, pscp_params) + for ret in results: + if ret[0] != 0: + self.logger.debug("clean maintance file to node[%s] with status[%s], " + "output[%s]" % (ret[-1], ret[0], ret[1])) + except Exception as error: + self.logger.debug( + "Failed to remove cluster_maintance file for switchover with error: %s" + % str(error)) + self.logger.debug("Successfully remove %s cluster_maintance file for switchover." + % host_names) + + def add_cluster_maintance_file_for_switchover(self): + """ + add cluster_maintance file for streaming disaster switchover to disaster_standby + """ + self.logger.debug("Start add cluster_maintance file for switchover.") + try: + cluster_maintance_file = os.path.realpath(os.path.join(self.gauss_home, + "bin/cluster_maintance")) + host_names = \ + self.get_all_connection_node_name("add_cluster_maintance_file_for_switchover", True) + pscp_params = [] + all_instances = [dn_inst for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes] + for dn_inst in all_instances: + if dn_inst.hostname in host_names: + pscp_params.append([dn_inst.hostname, cluster_maintance_file, + os.path.join(dn_inst.datadir, "cluster_maintance"), 10]) + if len(pscp_params) > 0: + results = parallelTool.parallelExecute( + DefaultValue.distribute_file_to_node, pscp_params) + for ret in results: + if ret[0] != 0: + self.logger.debug("Distribute maintance file for switchover to node[%s] " + "with status[%s], output[%s]" % (ret[-1], ret[0], ret[1])) + except Exception as error: + self.logger.debug("WARNING: Failed add cluster_maintance file for switchover, " + "error:%s." 
% (str(error))) + self.logger.debug("Successfully add cluster_maintance file for switchover.") + + def streaming_disaster_set_master_cluster_in_switchover(self): + """ + streaming disaster set master cluster in switchover + """ + self.logger.debug("Starting set streaming master cluster in switchover.") + primary_dns = [dn_inst for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes if + dn_inst.instanceId in self.primary_dn_ids] + if not primary_dns: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] + % "obtain primary dns for switchover") + if self.streaming_dr_in_switchover(primary_dns): + if self.streaming_dr_service_truncation_check(primary_dns): + self.logger.debug("Successfully set streaming master cluster in switchover.") + + def streaming_dr_service_truncation_check(self, primary_dns_list): + """ + streaming dr service truncation check + """ + self.logger.log("Waiting for truncation.") + results = parallelTool.parallelExecute(self.concurrent_check_dr_service_truncation, + primary_dns_list) + return all(results) + + def concurrent_check_dr_service_truncation(self, dn_inst): + """ + Wait for the log playback to complete. + """ + self.logger.debug("Starting check node %s shardNum %s instance %s streaming service " + "truncation." % (dn_inst.hostname, dn_inst.mirrorId, dn_inst.instanceId)) + sql_check = "select * from gs_streaming_dr_service_truncation_check();" + end_time = datetime.now() + timedelta(seconds=1200) + succeed = False + while datetime.now() < end_time: + status, output = ClusterCommand.remoteSQLCommand(sql_check, self.user, dn_inst.hostname, + dn_inst.port) + if status == 0 and output and output.strip() == "t": + succeed = True + break + time.sleep(5) + self.logger.debug("Retry truncation check shardNum %s in node %s instance %s." % + (dn_inst.mirrorId, dn_inst.hostname, dn_inst.instanceId)) + if not succeed: + self.logger.error("Failed to execute the command: %s, Error:\n%s" % (sql_check, output)) + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % + "check truncate service before switchover") + self.logger.debug("Successfully check node %s shardNum %s instance %s streaming service " + "truncation." % (dn_inst.hostname, dn_inst.mirrorId, dn_inst.instanceId)) + return True + + def streaming_dr_in_switchover(self, primary_dns_list): + """ + set steaming dr in switchover + """ + results = parallelTool.parallelExecute(self.concurrent_set_dr_in_switchover, + primary_dns_list) + return all(results) + + def concurrent_set_dr_in_switchover(self, dn_inst): + """ + Switchover requires log truncation first + """ + self.logger.debug("Starting set shardNum %s node %s streaming dr in switchover." % + (dn_inst.mirrorId, dn_inst.hostname)) + sql_cmd = "select * from gs_streaming_dr_in_switchover();" + # We need to use the normal port to transmit service truncation, + # not the OM port. + port = int(dn_inst.port) - 1 + (status, output) = ClusterCommand.remoteSQLCommand(sql_cmd, + self.user, dn_inst.hostname, str(port)) + self.logger.debug("check streaming in switchover, status=%d, output: %s." + % (status, output)) + if status != 0 or self.find_error(output) or output.strip() != "t": + self.logger.error("Failed to execute the command: %s, Error:\n%s" % (sql_cmd, output)) + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % + "generate switchover barrier before switchover") + self.logger.debug("Successfully set shardNum %s node %s streaming dr in switchover." 
% + (dn_inst.mirrorId, dn_inst.hostname)) + return True + + def wait_for_normal(self, timeout=DefaultValue.TIMEOUT_CLUSTER_START, + streaming_switchover=None): + """ + function:Wait the cluster become Normal or Degraded + input:NA + output:NA + """ + self.logger.debug("Waiting for cluster status being satisfied.") + end_time = None if timeout <= 0 else datetime.now() + timedelta(seconds=timeout) + + check_status = 0 + while True: + time.sleep(10) + if end_time is not None and datetime.now() >= end_time: + check_status = 1 + self.logger.debug("Timeout. The cluster is not available.") + break + # View the cluster status + status_file = "/home/%s/gauss_check_status_%d.dat" % (self.user, os.getpid()) + cmd = ClusterCommand.getQueryStatusCmd(outFile=status_file) + (status, output) = CmdUtil.retryGetstatusoutput(cmd, retry_time=0) + if status != 0: + if os.path.exists(status_file): + os.remove(status_file) + self.logger.debug("Failed to obtain the cluster status. Error: \n%s" % output) + continue + # Determine whether the cluster status is normal or degraded + cluster_status = DbClusterStatus() + cluster_status.initFromFile(status_file) + if os.path.exists(status_file): + os.remove(status_file) + if cluster_status.clusterStatus == "Normal": + self.logger.log("The cluster status is Normal.") + break + else: + self.logger.debug("Cluster status is %s(%s)." % ( + cluster_status.clusterStatus, cluster_status.clusterStatusDetail)) + + if check_status != 0: + if streaming_switchover == "streaming_switchover": + raise Exception( + ErrorCode.GAUSS_528["GAUSS_52800"] % (cluster_status.clusterStatus, + cluster_status.clusterStatusDetail)) + self.logger.logExit(ErrorCode.GAUSS_528["GAUSS_52800"] % ( + cluster_status.clusterStatus, cluster_status.clusterStatusDetail)) + self.logger.debug("Successfully wait for cluster status become Normal.", "constant") + + def set_auto_csn_barrier_guc(self, guc_mode, action_flag=False, roll_back=False): + """ + auto_csn_barrier : 0 / 1 + """ + guc_value = 1 if self.params.mode == "primary" else 0 + if action_flag: + guc_value = 0 + if roll_back: + guc_value = 1 + self.logger.debug("Starting %s auto_csn_barrier is %s." % (guc_mode, guc_value)) + cmd = 'source %s && gs_guc %s -Z coordinator -N all -I all ' \ + '-c "auto_csn_barrier=%s"' % (self.mpp_file, guc_mode, guc_value) + host_names = self.cluster_info.getClusterNodeNames() + ignore_node = [node for node in host_names if node not in self.normal_node_list] + if ignore_node: + self.logger.debug( + "WARNING: auto_csn_barrier need ignore host name is %s" % ignore_node) + nodes = ",".join(ignore_node) + cmd = cmd + " --ignore-node %s" % nodes + self.logger.debug("Set auto_csn_barrier with cmd:%s" % cmd) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] + % "set auto_csn_barrier" + "Error:%s" % output) + self.logger.debug("Successfully %s auto_csn_barrier is %s." 
% (guc_mode, guc_value)) + + def streaming_switchover_roll_back(self, update_query=False): + """ + streaming disaster cluster roll back in switchover + """ + self.logger.log("Roll back streaming disaster cluster switchover...") + ClusterInstanceConfig.set_data_on_dcc(self.cluster_info, + self.logger, self.user, + {self.backup_open_key: "0"}) + self.stop_cluster() + self.set_cmserver_guc("backup_open", "0", "set") + self.set_cmagent_guc("agent_backup_open", "0", "set") + self.logger.log("Successfully modify cma and cms parameters to start according to primary " + "cluster mode") + if update_query: + self.update_streaming_info("cluster", "archive") + self.start_cluster() + self.logger.log("Successfully Roll back streaming disaster cluster switchover.") + + def check_streaming_disaster_switchover_barrier(self): + """ + check whether get switchover_barrier on all dn + """ + self.logger.debug("check streaming disaster switchover barrier...") + sql_cmd = "select * from gs_streaming_dr_get_switchover_barrier();" + switchover_barrier_list = [] + for db_node in self.cluster_info.dbNodes: + for dn_inst in db_node.datanodes: + if dn_inst.instanceId not in self.normal_dn_ids: + self.logger.debug("Warning: Not check for abnormal instance %s %s" % ( + dn_inst.instanceType, dn_inst.instanceId)) + continue + (status, output) = ClusterCommand.remoteSQLCommand( + sql_cmd, self.user, dn_inst.hostname, dn_inst.port, maintenance_mode=True) + self.logger.debug("Check inst has switchover barrier, status=%d, " + "output: %s." % (status, output)) + if status == 0 and output.strip() == "t": + self.logger.debug("Successfully check instance %s %s has switchover " + "barrier." % (dn_inst.instanceType, dn_inst.instanceId)) + switchover_barrier_list.append(dn_inst.instanceId) + return switchover_barrier_list + + def check_switchover_workable(self): + """ + Check switchover is workable + """ + if not DefaultValue.is_disaster_cluster(self.cluster_info) \ + and self.params.mode == "primary": + self.logger.debug("The primary dn exist, do nothing except record the result file.") + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % + "streaming disaster cluster switchover, Because the primary cluster " + "[drClusterMode] parameter must be disaster_standby") + if DefaultValue.is_disaster_cluster(self.cluster_info) and \ + self.params.mode == "disaster_standby": + self.logger.debug("The primary dn not exist, do nothing except record the result file.") + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % + "streaming disaster cluster switchover, Because the disaster_standby " + "cluster [drClusterMode] parameter must be primary") + self.logger.log("Waiting for cluster and all instances normal.") + if self.params.mode == "primary": + end_time = datetime.now() + timedelta(seconds=600) + while True: + self.init_cluster_status() + self.parse_cluster_status() + if self.check_cluster_status(status_allowed=['Normal'], only_check=True, + is_log=False) and self.check_instances_ready_for_switchover(): + break + if datetime.now() >= end_time: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] + % "check cluster and instances status" + " with timeout: %ss" % str(600)) + time.sleep(5) + self.logger.debug("Retry check stream disaster standby cluster status...") + else: + self.init_cluster_status() + self.parse_cluster_status() + if (not self.check_cluster_status(status_allowed=['Normal'], only_check=True, + is_log=False)) \ + or (not self.check_instances_ready_for_switchover()): + raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] % 
"check cluster status") + + def check_instances_ready_for_switchover(self): + """ + Check cns and dns is ready for switchover + """ + dn_instances = [dn_inst.instanceId for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes] + if len(dn_instances) != len(self.normal_dn_ids): + self.logger.debug("Not all dn instances is normal.") + return False + self.logger.debug("Successfully check cn and dn instances are normal.") + return True diff --git a/script/impl/dorado_disaster_recovery/params_handler.py b/script/impl/dorado_disaster_recovery/params_handler.py new file mode 100644 index 00000000..530d7d6a --- /dev/null +++ b/script/impl/dorado_disaster_recovery/params_handler.py @@ -0,0 +1,346 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +############################################################################# +# Copyright (c) 2020 Huawei Technologies Co.,Ltd. +# +# openGauss is licensed under Mulan PSL v2. +# You can use this software according to the terms +# and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# +# http://license.coscl.org.cn/MulanPSL2 +# +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, +# WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. +# ---------------------------------------------------------------------------- +# Description : params_handler.py is a utility for parsing and verifying streaming +# disaster recovery params. +############################################################################# + +import os +import sys +import json +import optparse +import getpass + +from impl.streaming_disaster_recovery.streaming_constants import DoradoDisasterRecoveryConstants +from gspylib.common.DbClusterInfo import dbClusterInfo +from gspylib.common.ErrorCode import ErrorCode +from base_utils.security.security_checker import SecurityChecker, ValidationError +from domain_utils.cluster_file.version_info import VersionInfo + + +def check_streaming_start_mode(mode): + """ + Check start mode + """ + if mode not in ["primary", "disaster_standby"]: + raise ValidationError(ErrorCode.GAUSS_500["GAUSS_50011"] % ('-m', mode)) + + +def check_xml_file(file): + """ + Check xml file param + """ + if not file: + raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50001'] % 'X') + SecurityChecker.check_is_string('xml file path', file) + if not os.path.isfile(file): + raise ValidationError(ErrorCode.GAUSS_502["GAUSS_50201"] % file) + + +def check_hadr_user(value): + """ + Check disaster user + """ + description = "disaster username" + SecurityChecker.check_db_user(description, value) + + +def check_hadr_pwd(value): + """ + Check disaster user password + """ + description = "disaster user password" + # check_db_password will be used in cloud scene + SecurityChecker.check_db_user(description, value) + + +def check_wait_timeout(value): + """ + Check wait timeout + """ + description = "wait timeout" + SecurityChecker.check_is_digit(description, value) + + +def check_local_cluster_conf(value): + """ + Check local cluster conf + """ + SecurityChecker.check_is_dict("localClusterConf", value) + port = value.get('port') + SecurityChecker.check_port_valid('port of localClusterConf', port) + shards = value.get('shards') + SecurityChecker.check_is_list('shards of localClusterConf', shards) + for shard in shards: + for node in shard: + ip = node.get('ip') + data_ip = node.get('dataIp') + 
SecurityChecker.check_ip_valid('ip of localClusterConf', ip) + SecurityChecker.check_ip_valid('dataIp of localClusterConf', data_ip) + + +def check_remote_cluster_conf(value): + """ + Check local cluster conf + """ + SecurityChecker.check_is_dict("remoteClusterConf", value) + port = value.get('port') + SecurityChecker.check_port_valid('port of remoteClusterConf', port) + shards = value.get('shards') + SecurityChecker.check_is_list('shards of remoteClusterConf', shards) + for shard in shards: + for node in shard: + ip = node.get('ip') + data_ip = node.get('dataIp') + SecurityChecker.check_ip_valid('ip of remoteClusterConf', ip) + SecurityChecker.check_ip_valid('dataIp of remoteClusterConf', data_ip) + + +STREAMING_PARAMS_FOR_MODULE = { + "start": { + "mode": check_streaming_start_mode, + "xml_path": check_xml_file, + "hadrUserName": check_hadr_user, + "hadrUserPassword": check_hadr_pwd, + "waitingTimeout": check_wait_timeout, + "localClusterConf": check_local_cluster_conf, + "remoteClusterConf": check_remote_cluster_conf + }, + "stop": { + "xml_path": check_xml_file, + "waitingTimeout": check_wait_timeout, + "localClusterConf": check_local_cluster_conf, + "remoteClusterConf": check_remote_cluster_conf + }, + "switchover": { + "mode": check_streaming_start_mode, + "waitingTimeout": check_wait_timeout + }, + "failover": { + "waitingTimeout": check_wait_timeout, + }, + "query": {} +} + +HELP_MSG = """ +gs_sdr is a utility for streaming disaster recovery fully options. + +Usage: + gs_sdr -? | --help + gs_sdr -V | --version + gs_sdr -t start -m [primary|disaster_standby] -X XMLFILE [-U DR_USERNAME] [-W DR_PASSWORD] [--json JSONFILE] [--time-out=SECS] [-l LOGFILE] + gs_sdr -t stop -X XMLFILE|--json JSONFILE [-l LOGFILE] + gs_sdr -t switchover -m [primary|disaster_standby] [--time-out=SECS] [-l LOGFILE] + gs_sdr -t failover [-l LOGFILE] + gs_sdr -t query [-l LOGFILE] +General options: + -?, --help Show help information for this utility, + and exit the command line mode. + -V, --version Show version information. + -t Task name, it could be: + "start", "stop", "switchover", "failover", "query". + -m Option mode, it could be: + "primary", "disaster_standby". + -U Disaster recovery user name. + -W Disaster recovery user password. + -X Path of the XML configuration file. + -l Path of log file. + --json Path of params file for streaming options. + --time-out=SECS Maximum waiting time when Main standby connect to the primary dn, + default value is 1200s. +""" + + +class ParamsHandler(object): + """ + Parse and check params. + """ + def __init__(self, logger, trace_id): + self.params = None + self.logger = logger + self.trace_id = trace_id + + @staticmethod + def option_parser(): + """ + parsing parameters + :return: param obj + """ + parser = optparse.OptionParser(conflict_handler='resolve') + parser.disable_interspersed_args() + parser.epilog = "Example: gs_sdr -t " \ + "start -m primary -X clusterConfig.xml " \ + "--time-out=1200." + parser.add_option('-V', "--version", dest='version_info', action='store_true', + help='-V|--version show version info.') + parser.add_option('-?', "--help", dest='help_info', action='store_true', + help='-?|--help show help message and exist.') + parser.add_option('-t', dest='task', type='string', + help='Task name. It could be "start", "stop", ' + '"switchover", "failover", "query"') + parser.add_option('-m', dest='mode', type='string', + help='Cluster run mode. 
It could be ["primary", "disaster_standby"].') + parser.add_option('-U', dest='hadrusername', type='string', + help='hadr user name.') + parser.add_option('-W', dest='hadruserpasswd', type='string', + help='hadr user password.') + parser.add_option('-X', dest='xml_path', type='string', + help='Cluster config xml path.') + parser.add_option('--json', dest='json_path', type='string', + help='Config json file of streaming options') + parser.add_option('--time-out=', dest='timeout', default="1200", type='string', + help='time out.') + parser.add_option("-l", dest='logFile', type='string', + help='Path of log file.') + parser.add_option("--dorado-info", dest='dorado_info', type='string', + help='Path of dorado xlog share disk.') + return parser + + def __print_usage(self): + """ + Print help message + """ + if self.params.help_info: + print(HELP_MSG) + sys.exit(0) + + def __print_version_info(self): + """ + Print version info + """ + if self.params.version_info: + print("%s %s" % (sys.argv[0].split("/")[-1], + VersionInfo.COMMON_VERSION)) + sys.exit(0) + + def __cluster_conf_parser(self, file_path): + """ + Parse params in json file + """ + if self.params.json_path: + if not os.path.isfile(file_path): + raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50010'] + % '--json' + " Json file is not exist.") + with open(file_path, 'r') as read_fp: + param_dict = json.load(read_fp) + for key, value in param_dict.items(): + if key not in DoradoDisasterRecoveryConstants.STREAMING_JSON_PARAMS[self.params.task]: + continue + setattr(self.params, key, value) + return + cluster_info = dbClusterInfo() + if not self.params.xml_path or not os.path.isfile(self.params.xml_path): + raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50010'] + % '-X' + " XML file and json file are all not exist.") + cluster_info.initFromXml(self.params.xml_path) + remote_cluster_conf = dict() + remote_cluster_conf.setdefault("port", cluster_info.remote_dn_base_port) + remote_cluster_conf.setdefault("shards", cluster_info.remote_stream_ip_map) + setattr(self.params, "remoteClusterConf", remote_cluster_conf) + self.logger.debug("Remote stream cluster conf: %s." % str(remote_cluster_conf)) + + local_cluster_conf = dict() + local_cluster_conf.setdefault("port", cluster_info.local_dn_base_port) + local_cluster_conf.setdefault("shards", cluster_info.local_stream_ip_map) + setattr(self.params, "localClusterConf", local_cluster_conf) + self.logger.debug("Local stream cluster conf: %s." 
% str(local_cluster_conf)) + if not remote_cluster_conf["shards"] or len(remote_cluster_conf["shards"])\ + != len(local_cluster_conf["shards"]): + raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50026'] % "streaming DR") + + def __init_default_params(self): + """ + Init params if need default value + """ + if not self.params.timeout.isdigit(): + raise ValidationError(ErrorCode.GAUSS_500["GAUSS_50004"] % "--time-out") + self.params.waitingTimeout = int(self.params.timeout) + + def __parse_args(self): + """ + Parse arguments + """ + parser = ParamsHandler.option_parser() + self.params, _ = parser.parse_args() + self.__print_usage() + self.__print_version_info() + if not hasattr(self.params, 'task') or not self.params.task: + raise ValidationError(ErrorCode.GAUSS_500["GAUSS_50001"] % 't' + ".") + if self.params.task not in DoradoDisasterRecoveryConstants.STREAMING_JSON_PARAMS.keys(): + raise ValidationError(ErrorCode.GAUSS_500["GAUSS_50004"] % 't') + # parse arguments in json/xml file + if DoradoDisasterRecoveryConstants.STREAMING_JSON_PARAMS[self.params.task]: + self.__cluster_conf_parser(self.params.json_path) + + def __reload_hadr_user_info(self): + """ + Input hadr user info + """ + if self.params.task not in ["start"]: + return + if self.params.hadrusername and self.params.hadruserpasswd: + self.params.hadrUserName = self.params.hadrusername + self.params.hadrUserPassword = self.params.hadruserpasswd + del self.params.hadruserpasswd + return + user_name = "" + if not self.params.hadrusername: + user_name = input("Please enter disaster user name:") + self.params.hadrUserName = user_name if user_name else self.params.hadrusername + if self.params.hadruserpasswd: + self.params.hadrUserPassword = self.params.hadruserpasswd + del self.params.hadruserpasswd + return + for i in range(3): + user_passwd = getpass.getpass("Please enter password for [%s]:" % + self.params.hadrUserName) + user_passwd_check = getpass.getpass("Please repeat enter for password for [%s]:" + % self.params.hadrUserName) + if user_passwd == user_passwd_check: + break + if i == 2: + self.logger.logExit("The two passwords entered for too many " + "times are inconsistent. 
Authentication failed.") + self.logger.error( + ErrorCode.GAUSS_503["GAUSS_50306"] % user_name + + "The two passwords are different, please enter password again.") + self.params.hadrUserPassword = user_passwd + del user_passwd + del user_passwd_check + self.logger.debug("The hadr user information is successfully loaded.") + + def get_valid_params(self): + """ + Check params + """ + try: + self.__parse_args() + self.logger.log(DoradoDisasterRecoveryConstants.LOG_REMARK) + self.logger.log('Streaming disaster recovery ' + self.params.task + ' ' + self.trace_id) + self.logger.log(DoradoDisasterRecoveryConstants.LOG_REMARK) + self.__init_default_params() + self.__reload_hadr_user_info() + for param_name, validate in STREAMING_PARAMS_FOR_MODULE[self.params.task].items(): + check_value = getattr(self.params, param_name) + if self.params.task == "stop": + if param_name == "xml_path" and not check_value: + check_value = getattr(self.params, 'json_path') + validate(check_value) + except ValidationError as error: + self.logger.logExit(str(error)) + return self.params -- Gitee From 6dd7237a9e1e4b8f0157ae17642bc30243adde2c Mon Sep 17 00:00:00 2001 From: chuanglichuangwai Date: Fri, 4 Aug 2023 10:57:52 +0800 Subject: [PATCH 02/23] =?UTF-8?q?gs=5Fddr=E5=B7=A5=E5=85=B7=E7=9A=84switch?= =?UTF-8?q?over=E5=92=8Cfailover=E7=9A=84=E5=AD=90=E5=91=BD=E4=BB=A4?= =?UTF-8?q?=E7=9A=84=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- script/gspylib/common/Common.py | 4 +- script/gspylib/common/DbClusterInfo.py | 12 + script/gspylib/common/ErrorCode.py | 7 +- .../impl/dorado_disaster_recovery/ddr_base.py | 385 ++++++++++++------ .../dorado_disaster_recovery/ddr_constants.py | 2 +- .../dorado_diaster_recovery_start.py | 30 +- .../dorado_disaster_recovery_failover.py | 34 +- .../dorado_disaster_recovery_query.py | 2 +- .../dorado_disaster_recovery_stop.py | 14 +- .../dorado_disaster_recovery_switchover.py | 167 ++++---- .../params_handler.py | 5 +- 11 files changed, 418 insertions(+), 244 deletions(-) diff --git a/script/gspylib/common/Common.py b/script/gspylib/common/Common.py index 2d1016fa..4f91eb9b 100644 --- a/script/gspylib/common/Common.py +++ b/script/gspylib/common/Common.py @@ -3027,7 +3027,7 @@ class DefaultValue(): if os.path.isfile(cm_agent_conf_temp_file): with open(cm_agent_conf_temp_file, "r") as cma_conf_file: content = cma_conf_file.read() - ret = re.findall(r'agent_backup_open *= *1|agent_backup_open *= *2', content) + ret = re.findall(r'agent_backup_open *= *1', content) g_file.removeFile(cm_agent_conf_temp_file) if ret: return True @@ -3037,7 +3037,7 @@ class DefaultValue(): raise Exception(ErrorCode.GAUSS_502['GAUSS_50201'] % cm_agent_conf_file) with open(cm_agent_conf_file, "r") as cma_conf_file: content = cma_conf_file.read() - ret = re.findall(r'agent_backup_open *= *1|agent_backup_open *= *2', content) + ret = re.findall(r'agent_backup_open *= *1', content) if ret: return True else: diff --git a/script/gspylib/common/DbClusterInfo.py b/script/gspylib/common/DbClusterInfo.py index 91564fca..305a00c4 100644 --- a/script/gspylib/common/DbClusterInfo.py +++ b/script/gspylib/common/DbClusterInfo.py @@ -1689,6 +1689,18 @@ class dbClusterInfo(): def get_staic_conf_path(self, user, ignore_err=False): return self.__getStaticConfigFilePath(user=user, ignore_err=ignore_err) + def get_mpprc_file(self, user): + """ + get mpprc file + """ + mpprcFile = EnvUtil.getEnvironmentParameterValue('MPPDB_ENV_SEPARATE_PATH', user) + if 
mpprcFile is not None and mpprcFile != "": + mpprcFile = mpprcFile.replace("\\", "\\\\").replace('"', '\\"\\"') + checkPathVaild(mpprcFile) + userProfile = mpprcFile + else: + userProfile = ClusterConstants.BASHRC + return userProfile def __getEnvironmentParameterValue(self, environmentParameterName, user): """ diff --git a/script/gspylib/common/ErrorCode.py b/script/gspylib/common/ErrorCode.py index 7c0f0392..8d82f5f0 100644 --- a/script/gspylib/common/ErrorCode.py +++ b/script/gspylib/common/ErrorCode.py @@ -133,6 +133,7 @@ class ErrorCode(): 'GAUSS_50110': "[GAUSS-50110] : Cannot execute this script on %s.", 'GAUSS_50111': "[GAUSS-50111] : The %s directory has no permission.", 'GAUSS_50112': "[GAUSS-50112] : Failed to get the permission of %s.", + 'GAUSS_50113': "[GAUSS-50113] : The %s is not writable and readable for %s.", } ########################################################################### @@ -333,7 +334,8 @@ class ErrorCode(): 'GAUSS_50621': "[GAUSS-50621] : Failed to check network care speed.\n", 'GAUSS_50622': "[GAUSS-50622] : Failed to obtain network card " "interrupt count numbers. Commands for getting " - "interrupt count numbers: %s." + "interrupt count numbers: %s.", + 'GAUSS_50623': "[GAUSS-50623] : Ping cluster nodes failed. Successfully ping node: %s." } @@ -631,7 +633,8 @@ class ErrorCode(): "the %s parameter is not needed.", 'GAUSS_51656': "[GAUSS-51656] : Waiting for udev trigger to end timeout", 'GAUSS_51657': "[GAUSS-51657] : Waiting for start %s to end timeout", - 'GAUSS_51658': "[GAUSS-51658] : The azName is different, and the value of azPriority must be different. " + 'GAUSS_51658': "[GAUSS-51658] : The azName is different, and the value of azPriority must be different. ", + 'GAUSS_51659': "[GAUSS-51659] : The cluster status detected by the \"%s\" command is abnormal. 
" } ########################################################################### diff --git a/script/impl/dorado_disaster_recovery/ddr_base.py b/script/impl/dorado_disaster_recovery/ddr_base.py index 0424c911..8fd4b74d 100644 --- a/script/impl/dorado_disaster_recovery/ddr_base.py +++ b/script/impl/dorado_disaster_recovery/ddr_base.py @@ -21,6 +21,7 @@ import json import os import re +import sys import time from datetime import datetime from datetime import timedelta @@ -62,8 +63,8 @@ class DoradoDisasterRecoveryBase(object): self.local_host = None self.local_ip = None self.is_single_inst = None - self.streaming_file_dir = None - self.streaming_xml = None + self.dorado_file_dir = None + self.dorado_xml = None self.cluster_node_names = None self.normal_cm_ips = [] self.normal_node_list = [] @@ -96,8 +97,8 @@ class DoradoDisasterRecoveryBase(object): self.local_ip = DefaultValue.getIpByHostName() self.is_single_inst = True if self.cluster_info.isSingleInstCluster() else None self.cluster_node_names = self.cluster_info.getClusterNodeNames() - self.streaming_file_dir = os.path.join(self.pg_host, DoradoDisasterRecoveryConstants.DDR_FILES_DIR) - self.streaming_xml = os.path.join(self.streaming_file_dir, + self.dorado_file_dir = os.path.join(self.pg_host, DoradoDisasterRecoveryConstants.DDR_FILES_DIR) + self.streaming_xml = os.path.join(self.dorado_file_dir, DoradoDisasterRecoveryConstants.STREAMING_CONFIG_XML) self.ssh_tool = SshTool(self.cluster_node_names, self.log_file) self.mpp_file = EnvUtil.getMpprcFile() @@ -139,14 +140,14 @@ class DoradoDisasterRecoveryBase(object): raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "init step file path") else: step_file_name = DoradoDisasterRecoveryConstants.DDR_STEP_FILES[self.params.task] - self.step_file_path = os.path.join(self.streaming_file_dir, step_file_name) + self.step_file_path = os.path.join(self.dorado_file_dir, step_file_name) self.logger.debug("Init step file:%s." % self.step_file_path) def read_cluster_conf_record(self, check_file_exist=True): """ Read cluster conf from file """ - cluster_conf_record = os.path.join(self.streaming_file_dir, + cluster_conf_record = os.path.join(self.dorado_file_dir, DoradoDisasterRecoveryConstants.DDR_CLUSTER_CONF_RECORD) if not os.path.isfile(cluster_conf_record): if check_file_exist: @@ -354,7 +355,7 @@ class DoradoDisasterRecoveryBase(object): self.logger.log("Start prepare secure files.") secure_dir_name = DoradoDisasterRecoveryConstants.GS_SECURE_FILES temp_secure_dir_path = os.path.realpath( - os.path.join(self.streaming_file_dir, secure_dir_name)) + os.path.join(self.dorado_file_dir, secure_dir_name)) if os.path.isdir(temp_secure_dir_path): self.logger.debug("Secure file dir exist, cleaning...") FileUtil.removeDirectory(temp_secure_dir_path) @@ -406,13 +407,13 @@ class DoradoDisasterRecoveryBase(object): """ Remove streaming files dir """ - cmd = "if [ -d %s ]; then rm %s -rf;fi" % (dir_path, self.streaming_file_dir) + cmd = "if [ -d %s ]; then rm %s -rf;fi" % (dir_path, self.dorado_file_dir) self.ssh_tool.executeCommand(cmd) self.logger.debug("Successfully remove dir [%s] on all nodes." 
% dir_path) - def query_streaming_step(self): + def query_dorado_step(self): """ - Streaming step + dorado step """ step = -1 if os.path.isfile(self.step_file_path): @@ -426,7 +427,7 @@ class DoradoDisasterRecoveryBase(object): (step, self.params.task)) return step - def write_streaming_step(self, step): + def write_dorado_step(self, step): """ write streaming step :return: NA @@ -441,7 +442,7 @@ class DoradoDisasterRecoveryBase(object): """ Generate cluster status file """ - tmp_file = os.path.join(self.streaming_file_dir, + tmp_file = os.path.join(self.dorado_file_dir, DoradoDisasterRecoveryConstants.DDR_CLUSTER_STATUS_TMP_FILE) cmd = ClusterCommand.getQueryStatusCmd("", tmp_file) self.logger.debug("Command for checking cluster state: %s" % cmd) @@ -533,7 +534,7 @@ class DoradoDisasterRecoveryBase(object): """ Parse cluster status """ - tmp_file = os.path.join(self.streaming_file_dir, + tmp_file = os.path.join(self.dorado_file_dir, DoradoDisasterRecoveryConstants.DDR_CLUSTER_STATUS_TMP_FILE) if (not os.path.isfile(tmp_file)) and (not current_status): raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] @@ -612,25 +613,22 @@ class DoradoDisasterRecoveryBase(object): def check_dn_instance_params(self): """set_dn_instance_params""" - check_dick = {"enable_dcf": "off", "synchronous_commit": "on"} + check_dick = {"enable_dcf": "off"} dn_insts = [dn_inst for db_node in self.cluster_info.dbNodes for dn_inst in db_node.datanodes] - if len(dn_insts) <= 2: - self.logger.debug("Need set most available for current cluster.") - check_dick.update({"most_available_sync": "on"}) primary_dn_insts = [inst for inst in dn_insts if inst.instanceId in self.primary_dn_ids] if not primary_dn_insts: self.logger.debug("The primary dn not exist, do not need check dn inst params.") return execute_dn = primary_dn_insts[0] param_list = [] - guc_backup_file = os.path.join(self.streaming_file_dir, DoradoDisasterRecoveryConstants.GUC_BACKUP_FILE) + guc_backup_file = os.path.join(self.dorado_file_dir, DoradoDisasterRecoveryConstants.GUC_BACKUP_FILE) if not os.path.isfile(guc_backup_file): FileUtil.createFileInSafeMode(guc_backup_file, DefaultValue.KEY_FILE_MODE_IN_OS) for peer_check, idx in list(check_dick.items()): param_list.append((execute_dn, {peer_check: idx})) ret = parallelTool.parallelExecute(self._check_dn_inst_param, param_list) - self.ssh_tool.scpFiles(guc_backup_file, self.streaming_file_dir, self.cluster_node_names) + self.ssh_tool.scpFiles(guc_backup_file, self.dorado_file_dir, self.cluster_node_names) if any(ret): self.logger.logExit('\n'.join(filter(bool, ret))) self.logger.debug("Successfully check dn inst default value.") @@ -641,7 +639,7 @@ class DoradoDisasterRecoveryBase(object): if len(param) != 2: error_msg = ErrorCode.GAUSS_521["GAUSS_52102"] % param return error_msg - guc_backup_file = os.path.join(self.streaming_file_dir, DoradoDisasterRecoveryConstants.GUC_BACKUP_FILE) + guc_backup_file = os.path.join(self.dorado_file_dir, DoradoDisasterRecoveryConstants.GUC_BACKUP_FILE) for sql_key, value in list(param[1].items()): sql = "show %s;" % sql_key (status, output) = ClusterCommand.remoteSQLCommand(sql, @@ -668,7 +666,7 @@ class DoradoDisasterRecoveryBase(object): Restore guc params in .streaming_guc_backup """ self.logger.debug("Start restore guc params.") - guc_backup_file = os.path.join(self.streaming_file_dir, DoradoDisasterRecoveryConstants.GUC_BACKUP_FILE) + guc_backup_file = os.path.join(self.dorado_file_dir, DoradoDisasterRecoveryConstants.GUC_BACKUP_FILE) if not 
os.path.isfile(guc_backup_file): self.logger.debug("Not found guc backup file, no need restore guc params.") params_record = DefaultValue.obtain_file_content(guc_backup_file) @@ -731,10 +729,10 @@ class DoradoDisasterRecoveryBase(object): """ data = {"remoteClusterConf": self.params.remoteClusterConf, "localClusterConf": self.params.localClusterConf} - file_path = os.path.join(self.streaming_file_dir, + file_path = os.path.join(self.dorado_file_dir, DoradoDisasterRecoveryConstants.DDR_CLUSTER_CONF_RECORD) FileUtil.write_update_file(file_path, data, DefaultValue.KEY_FILE_MODE_IN_OS) - self.ssh_tool.scpFiles(file_path, self.streaming_file_dir, self.cluster_node_names) + self.ssh_tool.scpFiles(file_path, self.dorado_file_dir, self.cluster_node_names) def __record_wal_keep_segments(self, param_list): """ @@ -766,7 +764,7 @@ class DoradoDisasterRecoveryBase(object): return self.logger.debug("Starting get wal_keep_segments default value.") wal_keep_segments = os.path.join( - self.streaming_file_dir, DoradoDisasterRecoveryConstants.WAL_KEEP_SEGMENTS) + self.dorado_file_dir, DoradoDisasterRecoveryConstants.WAL_KEEP_SEGMENTS) sql_check = "show wal_keep_segments;" param_list = [(dn_inst, sql_check, wal_keep_segments) for db_node in self.cluster_info.dbNodes for dn_inst in db_node.datanodes @@ -874,7 +872,7 @@ class DoradoDisasterRecoveryBase(object): self.logger.debug("Command for changing instance pg_hba.conf file: %s" % cmd) self.get_all_connection_node_name("update_streaming_pg_hba") try: - self.ssh_tool.scpFiles(self.streaming_xml, self.streaming_file_dir) + self.ssh_tool.scpFiles(self.streaming_xml, self.dorado_file_dir) self.ssh_tool.executeCommand(cmd, hostList=self.connected_nodes) except Exception as error: msg = ErrorCode.GAUSS_516['GAUSS_51632'] \ @@ -1061,6 +1059,24 @@ class DoradoDisasterRecoveryBase(object): self.logger.debug( "Successfully set all datanode guc param in postgres conf for streaming cluster.") + def set_datanode_guc(self, guc_parameter, guc_value, guc_type, only_mode=None): + """ + set datanode guc param + :return: NA + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Set datanode guc [%s] to [%s] not for mode:%s." + % (guc_parameter, guc_value, self.params.mode)) + return + cmd = "gs_guc %s -Z datanode -N all -I all -c \"%s=%s\" " % \ + (guc_type, guc_parameter, guc_value) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + msg = ErrorCode.GAUSS_516['GAUSS_51632'] \ + % "set datanode guc [%s] to [%s], output:%s" \ + % (guc_parameter, guc_value, output) + self.logger.debug(msg) + def set_cmserver_guc(self, guc_parameter, guc_value, guc_type, only_mode=None): """ set cmserver guc param @@ -1126,7 +1142,7 @@ class DoradoDisasterRecoveryBase(object): """ file_path = os.path.join(dir_name, "pg_hba.conf") old_file_path = os.path.join(dir_name, "pg_hba.conf.old") - dest_file = os.path.join(self.streaming_file_dir, "%s_pg_hba.conf" % inst_id) + dest_file = os.path.join(self.dorado_file_dir, "%s_pg_hba.conf" % inst_id) if self.local_host == node_name: if mode == "backup" and not os.path.isfile(dest_file): if os.path.isfile(file_path): @@ -1165,7 +1181,7 @@ class DoradoDisasterRecoveryBase(object): Backup or restore pg_ident file. 
""" file_path = os.path.join(dir_name, "pg_ident.conf") - dest_file = os.path.join(self.streaming_file_dir, "%s_pg_ident.conf" % inst_id) + dest_file = os.path.join(self.dorado_file_dir, "%s_pg_ident.conf" % inst_id) if self.local_host == node_name: if mode == "backup" and not os.path.isfile(dest_file): if os.path.isfile(file_path): @@ -1531,7 +1547,7 @@ class DoradoDisasterRecoveryBase(object): return self.logger.debug("Starting restore wal_keep_segments default value.") default_value_dict = {} - wal_keep_segments = os.path.join(self.streaming_file_dir, + wal_keep_segments = os.path.join(self.dorado_file_dir, DoradoDisasterRecoveryConstants.WAL_KEEP_SEGMENTS) if not os.path.isfile(wal_keep_segments): self.logger.debug("Not found wal keep segments record file, no need restore.") @@ -1545,33 +1561,42 @@ class DoradoDisasterRecoveryBase(object): self.set_wal_keep_segments("reload", default_value_dict, True) self.logger.debug("Successfully restore wal_keep_segments default value.") - def __clean_streaming_files_on_local_node(self, file_name_list): + def __clean_dorado_files_on_local_node(self, file_name_list): file_name_list = [file_name_list] \ if not isinstance(file_name_list, list) else file_name_list for file_name in file_name_list: - file_path = os.path.join(self.streaming_file_dir, file_name) + file_path = os.path.join(self.dorado_file_dir, file_name) if os.path.isfile(file_path): FileUtil.removeFile(file_path) self.logger.debug("Successfully removed file:[%s]" % file_path) + def clean_flag_file(self): + """ + Clean flag file + """ + flag_file = os.path.join(self.step_file_path, "remote_replication_pairs_done") + if os.path.exists(flag_file): + self.logger.debug("Successfully removed flag file %s." % flag_file) + os.remove(flag_file) + def clean_step_file(self): """ Clean step file for each action """ step_file = os.path.basename(self.step_file_path) - self.__clean_streaming_files_on_local_node(step_file) + self.__clean_dorado_files_on_local_node(step_file) self.logger.log("Successfully removed step file.") def check_action_and_mode(self): """ Check action and mode if step file exist. - if any streaming options not finished(step file exist), + if any dorado options not finished(step file exist), not allowed doing any other streaming options except query. """ self.logger.debug("Checking action and mode.") exist_step_file_names = [] for file_name in DoradoDisasterRecoveryConstants.DDR_STEP_FILES.values(): - step_file_path = os.path.join(self.streaming_file_dir, file_name) + step_file_path = os.path.join(self.dorado_file_dir, file_name) if os.path.isfile(step_file_path) and file_name != ".ddr_query.step": exist_step_file_names.append(file_name) if exist_step_file_names and set(exist_step_file_names) ^ {os.path.basename( @@ -1584,19 +1609,19 @@ class DoradoDisasterRecoveryBase(object): "doing current options" % (exist_step_file_names, exist_action)) self.logger.debug("Successfully checked action and mode.") - def clean_streaming_dir(self): + def clean_dorado_dir(self): """ - Clean streaming dir when stop or failover + Clean dorado dir when stop or failover """ - self.logger.debug("Start clean streaming dir:%s." % self.streaming_file_dir) - cmd = g_file.SHELL_CMD_DICT["deleteDir"] % (self.streaming_file_dir, - self.streaming_file_dir) + self.logger.debug("Start clean dorado dir:%s." 
% self.dorado_file_dir) + cmd = g_file.SHELL_CMD_DICT["deleteDir"] % (self.dorado_file_dir, + self.dorado_file_dir) try: self.ssh_tool.executeCommand(cmd, hostList=self.cluster_info.getClusterNodeNames()) except Exception as error: self.logger.debug( - "Failed to remove streaming dir with error:%s" % error) - self.logger.log("Finished remove streaming dir.") + "Failed to remove dorado dir with error:%s" % error) + self.logger.log("Finished remove dorado dir.") def clean_global_config(self): """ @@ -1706,7 +1731,7 @@ class DoradoDisasterRecoveryBase(object): Find and copy key file dir from all dn dir """ local_temp_secure_path = os.path.join( - self.streaming_file_dir, DoradoDisasterRecoveryConstants.GS_SECURE_FILES) + self.dorado_file_dir, DoradoDisasterRecoveryConstants.GS_SECURE_FILES) if os.path.isdir(local_temp_secure_path): FileUtil.removeDirectory(local_temp_secure_path) rand_path = os.path.join(local_temp_secure_path, DoradoDisasterRecoveryConstants.HADR_KEY_RAND) @@ -1721,7 +1746,7 @@ class DoradoDisasterRecoveryBase(object): dn_inst.datadir, DoradoDisasterRecoveryConstants.GS_SECURE_FILES)) cmd_copy_dir = cmd_tep % (key_file_path, self.mpp_file, self.trace_id, self.local_host, key_file_path, - self.streaming_file_dir, + self.dorado_file_dir, key_file_path, dn_inst.hostname) status, output = CmdUtil.getstatusoutput_by_fast_popen(cmd_copy_dir) self.logger.debug("Copy cmd:%s" % cmd_copy_dir) @@ -1754,7 +1779,7 @@ class DoradoDisasterRecoveryBase(object): # check cluster user consistency self.__check_cluster_user() # distribute key files to all node - secure_dir_path = os.path.join(self.streaming_file_dir, DoradoDisasterRecoveryConstants.GS_SECURE_FILES) + secure_dir_path = os.path.join(self.dorado_file_dir, DoradoDisasterRecoveryConstants.GS_SECURE_FILES) self.__copy_hadr_user_key(secure_dir_path, update=True) FileUtil.removeDirectory(secure_dir_path) self.logger.log("Successfully build and distribute key files to all nodes.") @@ -1764,7 +1789,7 @@ class DoradoDisasterRecoveryBase(object): function: Check whether the version numbers of the host cluster and the disaster recovery cluster are the same """ - gs_secure_version = os.path.realpath(os.path.join(self.streaming_file_dir, + gs_secure_version = os.path.realpath(os.path.join(self.dorado_file_dir, "gs_secure_files/version.cfg")) master_commit_id = VersionInfo.get_version_info(gs_secure_version)[-1] local_version_file = VersionInfo.get_version_file() @@ -1781,7 +1806,7 @@ class DoradoDisasterRecoveryBase(object): function: Check whether the version numbers of the host cluster and the disaster recovery cluster are the same """ - user_file = os.path.realpath(os.path.join(self.streaming_file_dir, + user_file = os.path.realpath(os.path.join(self.dorado_file_dir, DoradoDisasterRecoveryConstants.GS_SECURE_FILES, DoradoDisasterRecoveryConstants.CLUSTER_USER_RECORD)) remote_user = DefaultValue.obtain_file_content(user_file, is_list=False) @@ -1896,7 +1921,7 @@ class DoradoDisasterRecoveryBase(object): params = [] dn_instances = [inst for node in self.cluster_info.dbNodes for inst in node.datanodes] - cluster_conf = os.path.join(self.streaming_file_dir, + cluster_conf = os.path.join(self.dorado_file_dir, DoradoDisasterRecoveryConstants.DDR_CLUSTER_CONF_RECORD) dn_num = DefaultValue.get_all_dn_num_for_dr(cluster_conf, dn_instances[0], self.cluster_info, self.logger) @@ -1916,7 +1941,7 @@ class DoradoDisasterRecoveryBase(object): :return: NA """ self.logger.log("Start remove cluster file.") - cluster_info_file = 
os.path.join(self.streaming_file_dir, + cluster_info_file = os.path.join(self.dorado_file_dir, DoradoDisasterRecoveryConstants.DDR_CLUSTER_CONF_RECORD) cmd = g_file.SHELL_CMD_DICT["deleteFile"] % (cluster_info_file, cluster_info_file) try: @@ -2015,9 +2040,9 @@ class DoradoDisasterRecoveryBase(object): parallelTool.parallelExecute(self.concurrent_drop_slot, params) self.logger.log("Finished drop all node replication slots") - def update_streaming_info(self, key, value, only_mode=None): + def update_dorado_info(self, key, value, only_mode=None): """ - Update info for streaming status + Update info for dorado status """ if only_mode and self.params.mode != only_mode: self.logger.debug("Update query status [%s] to [%s] " @@ -2036,20 +2061,20 @@ class DoradoDisasterRecoveryBase(object): else: self.logger.debug("key error.") return - file_path = os.path.realpath(os.path.join(self.streaming_file_dir, key_stat)) + file_path = os.path.realpath(os.path.join(self.dorado_file_dir, key_stat)) with os.fdopen(os.open(file_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, DefaultValue.KEY_FILE_MODE_IN_OS), "w") as fp_write: fp_write.write(value) host_names = self.get_all_connection_node_name( - action_flag="update_streaming_info", no_update=True) - self.ssh_tool.scpFiles(file_path, self.streaming_file_dir, host_names) + action_flag="update_dorado_info", no_update=True) + self.ssh_tool.scpFiles(file_path, self.dorado_file_dir, host_names) except Exception as error: self.logger.debug("Failed write info, key:%s, value:%s, " "error:%s." % (key, value, error)) def create_cluster_maintance_file(self, value): """ - add cluster_maintance file for streaming failover and switchover disaster_standby + add cluster_maintance file for dorado failover and switchover disaster_standby """ self.logger.debug("Start create cluster_maintance file.") try: @@ -2066,70 +2091,199 @@ class DoradoDisasterRecoveryBase(object): "error:%s." % (value, str(error))) self.logger.debug("Successfully create cluster_maintance file.") - def streaming_failover_single_inst(self, stream_disaster_step, action_flag=None): + + def check_datanode_query_info(self, params): """ - streaming disaster recovery failover for single_inst cluster + check datanode info by "gs_ctl query" command. 
""" - self.create_cluster_maintance_file("streaming failover") + state, dest_ip, datadir = params + # get mpprc file + mpprcFile = self.cluster_info.get_mpprc_file(self.user) + if dest_ip == self.local_host: + cmd = "source %s && gs_ctl query -D %s" % (mpprcFile, datadir) + else: + cmd = "pssh -H %s \"source %s && gs_ctl query -D %s \"" % (dest_ip, + mpprcFile, + datadir) + (status, output) = subprocess.getstatusoutput(cmd) + dbState = re.findall(r"db_state.*: (.*?)\n", output) + localRole = re.findall(r"local_role.*: (.*?)\n", output) + peerRole = re.findall(r"peer_role.*: (.*?)\n", output) + preeState = re.findall(r"pree_state.*: (.*?)\n", output) + channel = re.findall(r"channel.*: (.*?)\n", output) + if status == 0: + check_ok = 0 + if state == "Primary": + if (len(dbState) != 1 or dbState[0] != "Normal") or \ + (len(localRole) != 2 or localRole[0] != "Primary" or localRole[1] != "Primary") or \ + (len(peerRole) != 1 or peerRole[0] != "StandbyCluster_Standby") or \ + (len(preeState) != 1 or preeState[0] != "Normal") or \ + (len(channel) != 1 or "-->" not in channel[0]): + check_ok = -1 + elif state == "Main Standby": + if (len(dbState) != 1 or dbState[0] != "Normal") or \ + (len(localRole) != 2 or localRole[0] != "Main Standby" or localRole[1] != "Standby") or \ + (len(peerRole) != 1 or peerRole[0] != "Primary") or \ + (len(preeState) != 1 or preeState[0] != "Normal") or \ + (len(channel) != 1 or "<--" not in channel[0]): + check_ok = -1 + elif state == "Standby": + # 不管是主集群,还是灾难备集群仅仅检查 local_role 只有一个元素 Standby 和 db_state 为 Normal + if (len(dbState) != 1 or dbState[0] != "Normal") or \ + (len(localRole) != 1 or localRole[0] != "Standby"): + check_ok = -1 + else: + raise Exception(ErrorCode.GAUSS_521["GAUSS_52102"] % state) + else: + check_ok = status + + return check_ok, output, dest_ip + + def check_dorado_datanode_query_info(self, timeout=DefaultValue.TIMEOUT_CLUSTER_START, + dorado_switchover=None): + """ + check gs_ctl query info + """ + self.logger.debug("Waiting for gs_ctl query status being satisfied.") + end_time = None if timeout <= 0 else datetime.now() + timedelta(seconds=timeout) + + self.init_cluster_status() + self.parse_cluster_status() + host_names = self.get_all_connection_node_name() + if len(host_names) != len(self.cluster_node_names): + raise Exception(ErrorCode.GAUSS_506["GAUSS_50623"] % host_names) + check_params = [] + all_instances = [dn_inst for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes] + for dn_inst in all_instances: + check_params.append([dn_inst.state, dn_inst.hostname, dn_inst.datadir]) + if len(check_params) <= 0: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51620"] % "cluster") + while True: + check_status = 0 + time.sleep(10) + if end_time is not None and datetime.now() >= end_time: + check_status = 1 + self.logger.debug("Timeout. 
The gs_ctl query command cannot obtain the expected status.") + break + results = parallelTool.parallelExecute( + self.check_datanode_query_info, check_params) + for ret in results: + if ret[0] != 0: + self.logger.debug("Failed to check node[%s] info using \"gs_ctl query\" command " + "with status[%s], output[%s]" % (ret[-1], ret[0], ret[1])) + check_status = 1 + if check_status == 0: + break + if check_status != 0: + if dorado_switchover == "dorado_switchover": + raise Exception( + ErrorCode.GAUSS_516["GAUSS_51659"] % "gs_ctl query") + self.logger.logExit( + ErrorCode.GAUSS_516["GAUSS_51659"] % "gs_ctl query") + self.logger.debug("Successfully wait for gs_ctl query status become Normal.", "constant") + + def dorado_failover_single_inst(self, dorado_disaster_step, action_flag=None): + """ + dorado disaster recovery failover for single_inst cluster + """ + self.create_cluster_maintance_file("dorado failover") if action_flag != DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER: - self.update_streaming_info("cluster", "promote") + self.update_dorado_info("cluster", "promote") # 0. check cluster status and get normal instance list - if stream_disaster_step < 0: + if dorado_disaster_step < 0: if action_flag == DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER: - self.update_streaming_info(DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER, "10%") + self.update_dorado_info(DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER, "10%") else: - self.update_streaming_info(DoradoDisasterRecoveryConstants.ACTION_FAILOVER, "10%") + self.update_dorado_info(DoradoDisasterRecoveryConstants.ACTION_FAILOVER, "10%") self.init_cluster_status() self.parse_cluster_status() - self.write_streaming_step("0_check_cluster_status_done_for_failover") - # 1.Specify max xid and max ter to start etcd - max_term_record = os.path.join(self.streaming_file_dir, ".max_term_record") - if stream_disaster_step < 1: - max_term = self.get_term_info() - term_key = "/%s/CMServer/status_key/term" % self.user - para_dict = {term_key: max_term, self.backup_open_key: "0"} - ClusterInstanceConfig.set_data_on_dcc(self.cluster_info, - self.logger, self.user, para_dict) - DefaultValue.write_content_on_file(max_term_record, max_term) - self.write_streaming_step("1_start_etcd_done_for_failover") - self._failover_config_step(stream_disaster_step, action_flag) - self._failover_start_step(stream_disaster_step, action_flag, max_term_record) - - def _failover_start_step(self, stream_disaster_step, action_flag, max_term_record): + self.stop_cluster() + self.write_dorado_step("0_dorado_disaster_stop_cluster_for_failover") + flag_file = os.path.join(self.step_file_path, "remote_replication_pairs_done") + if os.path.exists(flag_file): + self.logger.debug("Delete file %s." 
% flag_file) + os.remove(flag_file) + self.logger.debug(self.remote_replication_pairs_log_message % flag_file) + sys.exit(0) + if dorado_disaster_step < 1: + # 标志文件存在,检查远程复制的lun设备权限,更新进度,代表 "远程复制Pair"任务完成 + flag_file = os.path.join(self.step_file_path, "remote_replication_pairs_done") + if not os.path.exists(flag_file) or not self.check_xlog_file_path(): + self.logger.debug(self.remote_replication_pairs_log_message % flag_file) + sys.exit(0) + self.write_dorado_step("1_set_remote_replication_pairs_for_failover") + self._failover_config_step(dorado_disaster_step, action_flag) + self._failover_start_step(dorado_disaster_step, action_flag) + + def check_dorado_datanode_query_info(self, timeout=DefaultValue.TIMEOUT_CLUSTER_START, + dorado_switchover=None): + """ + check gs_ctl query info + """ + self.logger.debug("Waiting for gs_ctl query status being satisfied.") + end_time = None if timeout <= 0 else datetime.now() + timedelta(seconds=timeout) + + host_names = self.get_all_connection_node_name() + if len(host_names) != len(self.cluster_node_names): + raise Exception(ErrorCode.GAUSS_506["GAUSS_50623"] % host_names) + check_params = [] + all_instances = [dn_inst for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes] + for dn_inst in all_instances: + check_params.append([dn_inst.state, dn_inst.hostname, dn_inst.datadir]) + if len(check_params) <= 0: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51620"] % "cluster") + while True: + check_status = 0 + time.sleep(10) + if end_time is not None and datetime.now() >= end_time: + check_status = 1 + self.logger.debug("Timeout. The gs_ctl query command cannot obtain the expected status.") + break + results = parallelTool.parallelExecute( + self.check_datanode_query_info, check_params) + for ret in results: + if ret[0] != 0: + self.logger.debug("Failed to check node[%s] info using \"gs_ctl query\" command " + "with status[%s], output[%s]" % (ret[-1], ret[0], ret[1])) + check_status = 1 + if check_status == 0: + break + if check_status != 0: + if dorado_switchover == "dorado_switchover": + raise Exception( + ErrorCode.GAUSS_516["GAUSS_51659"] % "gs_ctl query") + self.logger.logExit( + ErrorCode.GAUSS_516["GAUSS_51659"] % "gs_ctl query") + self.logger.debug("Successfully wait for gs_ctl query status become Normal.", "constant") + + def _failover_start_step(self, dorado_disaster_step, action_flag): """ Failover step 5 & 6 """ - if stream_disaster_step < 5: + if dorado_disaster_step < 3: if action_flag == DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER: - self.update_streaming_info(DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER, "80%") + self.update_dorado_info(DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER, "80%") else: - self.update_streaming_info(DoradoDisasterRecoveryConstants.ACTION_FAILOVER, "80%") - if not os.path.isfile(max_term_record): - raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] % max_term_record) - _, dn_infos = self.get_specified_dn_infos() - max_term_list = DefaultValue.obtain_file_content(max_term_record) - if not max_term_list: - raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "read max term") - params = [(dn_info, max_term_list[0]) for dn_info in dn_infos] - if params: - parallelTool.parallelExecute(self.start_primary_dn, params) - self.write_streaming_step("5_start_primary_dn_done") - if stream_disaster_step < 6: + self.update_dorado_info(DoradoDisasterRecoveryConstants.ACTION_FAILOVER, "80%") + self.remove_cluster_maintance_file_for_switchover() + self.remove_cluster_maintance_file() 
self.start_cluster() - cluster_normal_status = [DefaultValue.CLUSTER_STATUS_NORMAL, - DefaultValue.CLUSTER_STATUS_DEGRADED] + self.write_dorado_step("3_start_cluster_done") + if dorado_disaster_step < 4: + cluster_normal_status = [DefaultValue.CLUSTER_STATUS_NORMAL] self.check_cluster_status(cluster_normal_status, check_current=True) - cluster_info = self.query_cluster_info() - self.parse_cluster_status(current_status=cluster_info) if action_flag != DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER: - self.clean_global_config() - self.restore_guc_params() - self.streaming_clean_archive_slot() - if action_flag != DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER: - self.update_streaming_info(DoradoDisasterRecoveryConstants.ACTION_FAILOVER, "100%") - self.update_streaming_info("cluster", "normal") + self.check_dorado_datanode_query_info(timeout=30, + dorado_switchover="dorado_failover") + self.update_dorado_info(DoradoDisasterRecoveryConstants.ACTION_FAILOVER, "100%") + self.update_dorado_info("cluster", "normal") else: - self.update_streaming_info("cluster", "archive") + self.check_dorado_datanode_query_info(timeout=30, + dorado_switchover="dorado_switchover") + self.update_dorado_info(DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER, "100%") + self.update_dorado_info("cluster", "archive") def streaming_clean_archive_slot(self): """ @@ -2170,7 +2324,7 @@ class DoradoDisasterRecoveryBase(object): """ Get specified dn infos """ - tmp_file = os.path.join(self.streaming_file_dir, "cluster_state_tmp") + tmp_file = os.path.join(self.dorado_file_dir, "cluster_state_tmp") if not os.path.isfile(tmp_file) or update: cmd = ClusterCommand.getQueryStatusCmd(self.user, 0, tmp_file) self.logger.debug("Update cluster state with cmd: %s" % cmd) @@ -2267,31 +2421,20 @@ class DoradoDisasterRecoveryBase(object): self.ssh_tool.executeCommand(cmd, hostList=self.normal_node_list) self.logger.debug("Successfully set cm agent for streaming disaster.") - def _failover_config_step(self, stream_disaster_step, action_flag): + def _failover_config_step(self, dorado_disaster_step, action_flag): """ Failover step 2 - 4 """ # 2.Stop the cluster by node - if stream_disaster_step < 2: - if action_flag != DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER: - self.streaming_clean_replication_slot() - self.update_streaming_info(DoradoDisasterRecoveryConstants.ACTION_FAILOVER, "30%") - self.stop_cluster_by_node() - self.write_streaming_step("2_stop_cluster_done_for_failover") - # 3.Start the cluster in the main cluster mode - if stream_disaster_step < 3: + if dorado_disaster_step < 2: + if action_flag == DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER: + self.update_dorado_info(DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER, "30%") + else: + self.update_dorado_info(DoradoDisasterRecoveryConstants.ACTION_FAILOVER, "30%") + self.set_datanode_guc("cluster_run_mode", "cluster_primary", "set") self.set_cmserver_guc("backup_open", "0", "set") - self.stream_disaster_set_cmagent_guc("agent_backup_open", "0", "set") - self.write_streaming_step("3_set_backup_open_for_failover") - # 4.Delete the relevant guc parameters and remove the disaster tolerance relationship - # based on streaming disaster recovery cluster, No need to delete for switchover. 
- if not action_flag: - if stream_disaster_step < 4: - self.update_streaming_info(DoradoDisasterRecoveryConstants.ACTION_FAILOVER, "50%") - self.remove_all_stream_repl_infos() - self.remove_streaming_pg_hba(True) - self.update_streaming_info(DoradoDisasterRecoveryConstants.ACTION_FAILOVER, "70%") - self.write_streaming_step("4_remove_hba_repl_done_for_failover") + self.set_cmagent_guc("agent_backup_open", "0", "set") + self.write_dorado_step("2_set_cluster_guc_for_failover_done") def get_term_info(self): """get_term_info""" diff --git a/script/impl/dorado_disaster_recovery/ddr_constants.py b/script/impl/dorado_disaster_recovery/ddr_constants.py index 6e185b35..60d8b3da 100644 --- a/script/impl/dorado_disaster_recovery/ddr_constants.py +++ b/script/impl/dorado_disaster_recovery/ddr_constants.py @@ -54,7 +54,7 @@ class DoradoDisasterRecoveryConstants: STREAM_DISTRIBUTE_ACTION = "distribute_stream_failover" # GUC CHANGE MAP - GUC_CHANGE_MAP = {"most_available_sync": "on", "synchronous_commit": "on"} + GUC_CHANGE_MAP = {} # params in json file for each module STREAMING_JSON_PARAMS = { diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py index ee341be5..7d57043b 100644 --- a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py @@ -40,7 +40,7 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): return self.logger.debug("Start first step of DisasterRecovery start.") #创建容灾过程使用的临时目录 - self.create_disaster_recovery_dir(self.streaming_file_dir) + self.create_disaster_recovery_dir(self.dorado_file_dir) #检查执行的标志文件 self.check_action_and_mode() self.init_cluster_status() @@ -61,7 +61,7 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): self.check_is_under_upgrade() #检查dn的GUC参数 #self.check_dn_instance_params() - self.write_streaming_step("2_check_cluster_step") + self.write_dorado_step("2_check_cluster_step") def _third_step_for_ddr_start(self, step): """ @@ -74,7 +74,7 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): #self.prepare_gs_secure_files(only_mode='primary') #self.build_and_distribute_key_files(only_mode='disaster_standby') #self.get_default_wal_keep_segments(only_mode='primary') - self.write_streaming_step("3_set_wal_segments_step") + self.write_dorado_step("3_set_wal_segments_step") def drop_replication_slot_on_dr_cluster(self, only_mode=None): """ @@ -117,7 +117,7 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): self.logger.debug("Start fourth step of streaming start.") self.set_wal_keep_segments( "reload", DoradoDisasterRecoveryConstants.MAX_WAL_KEEP_SEGMENTS, only_mode='primary') - self.write_streaming_step("4_set_wal_segments_step") + self.write_dorado_step("4_set_wal_segments_step") def _fifth_step_for_ddr_start(self, step): """ @@ -130,7 +130,7 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): self.set_data_in_dcc(self.backup_open_key, "1", only_mode='disaster_standby') #self.set_most_available(mode="reload", raise_error=False) self.stop_cluster_by_node(only_mode='disaster_standby') - self.write_streaming_step("5_set_wal_segments_step") + self.write_dorado_step("5_set_wal_segments_step") def common_step_for_ddr_start(self): """ @@ -150,7 +150,7 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): self.logger.debug("Start sixth step of streaming start.") 
self.set_cmserver_guc("backup_open", "1", "set", only_mode='disaster_standby') self.set_cmagent_guc("agent_backup_open", "1", "set", only_mode='disaster_standby') - self.write_streaming_step("6_set_guc_step") + self.write_dorado_step("6_set_guc_step") def _seventh_step_for_ddr_start(self, step): """ @@ -159,15 +159,15 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): if step >= 7: return self.logger.debug("Start seventh step of streaming start.") - self.update_streaming_info("cluster", "restore", only_mode='disaster_standby') + self.update_dorado_info("cluster", "restore", only_mode='disaster_standby') try: self.start_dss_instance(only_mode='disaster_standby') self.build_dn_instance(only_mode='disaster_standby') self.kill_dss_instance(only_mode='disaster_standby') except Exception as error: - self.update_streaming_info("cluster", "restore_fail", only_mode='disaster_standby') + self.update_dorado_info("cluster", "restore_fail", only_mode='disaster_standby') raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "build dns" + "Error:%s" % error) - self.write_streaming_step("7_build_dn_instance_step") + self.write_dorado_step("7_build_dn_instance_step") def _eighth_step_for_ddr_start(self, step): """ @@ -178,18 +178,18 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): self.logger.debug("Start eighth step of streaming start.") self.start_cluster(cm_timeout=DoradoDisasterRecoveryConstants.STANDBY_START_TIMEOUT, only_mode='disaster_standby') - self.update_streaming_info("cluster", "full_backup", only_mode='primary') + self.update_dorado_info("cluster", "full_backup", only_mode='primary') try: self.wait_main_standby_connection(only_mode='primary') except Exception as error: - self.update_streaming_info("cluster", "backup_fail", only_mode='primary') + self.update_dorado_info("cluster", "backup_fail", only_mode='primary') raise Exception(str(error)) ret = self.check_cluster_status(status_allowed=['Normal'], only_check=True, check_current=True) query_status = "recovery" if ret else "recovery_fail" - self.update_streaming_info("cluster", query_status, only_mode='disaster_standby') - self.update_streaming_info("cluster", "archive", only_mode='primary') - self.write_streaming_step("8_start_cluster_step") + self.update_dorado_info("cluster", query_status, only_mode='disaster_standby') + self.update_dorado_info("cluster", "archive", only_mode='primary') + self.write_dorado_step("8_start_cluster_step") def _ninth_step_for_ddr_start(self, step): """ @@ -224,7 +224,7 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): def run(self): self.logger.log("Start create dorado storage disaster relationship.") - step = self.query_streaming_step() + step = self.query_dorado_step() self._first_step_for_ddr_start(step) #1.检查集群状态正常 self.parse_cluster_status() diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_failover.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_failover.py index 77bdacc4..4f12b804 100644 --- a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_failover.py +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_failover.py @@ -30,41 +30,41 @@ class DisasterRecoveryFailoverHandler(DoradoDisasterRecoveryBase): super().__init__(*args, **kwargs) def run(self): - self.logger.log("Start streaming disaster failover.") + self.logger.log("Start dorado disaster failover.") self.check_action_and_mode() - step = 
self.check_streaming_failover_workable(check_type_step=3, check_status_step=0) + step = self.check_dorado_failover_workable(check_type_step=3, check_status_step=0) self.check_is_under_upgrade() - self.init_cluster_conf() try: - self.streaming_failover_single_inst(step) - self.update_streaming_info("cluster", "normal") + self.dorado_failover_single_inst(step) + self.update_dorado_info("cluster", "normal") self.clean_step_file() + self.clean_flag_file() except Exception as error: - self.update_streaming_info("cluster", "promote_fail") + self.update_dorado_info("cluster", "promote_fail") raise Exception( ErrorCode.GAUSS_516["GAUSS_51632"] % "centralize failover" + "Error:%s" % error) finally: self.remove_cluster_maintance_file() - self.clean_streaming_dir() - self.logger.log("Successfully do streaming disaster recovery failover.") + self.clean_dorado_dir() + self.logger.log("Successfully do dorado disaster recovery failover.") - def check_streaming_failover_workable(self, check_type_step=0, check_status_step=0): + def check_dorado_failover_workable(self, check_type_step=0, check_status_step=0): """ - Check streaming failover is workable. + Check dorado failover is workable. """ - self.logger.debug("Streaming disaster distribute cluster failover...") - stream_disaster_step = self.query_streaming_step() + self.logger.debug("dorado disaster distribute cluster failover...") + dorado_disaster_step = self.query_dorado_step() if not DefaultValue.is_disaster_cluster(self.cluster_info) \ - and stream_disaster_step < check_type_step: + and dorado_disaster_step < check_type_step: self.logger.debug("The primary dn exist, do nothing except record the result file.") raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % - "streaming disaster cluster failover, Because the primary cluster " + "dorado disaster cluster failover, Because the primary cluster " "does not support failover") cluster_normal_status = [DefaultValue.CLUSTER_STATUS_NORMAL, DefaultValue.CLUSTER_STATUS_DEGRADED] - if stream_disaster_step < check_status_step: + if dorado_disaster_step < check_status_step: self.init_cluster_status() self.parse_cluster_status() - if stream_disaster_step < check_status_step: + if dorado_disaster_step < check_status_step: self.check_cluster_status(cluster_normal_status) - return stream_disaster_step + return dorado_disaster_step diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_query.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_query.py index dc7ffea3..a2825fe9 100644 --- a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_query.py +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_query.py @@ -35,7 +35,7 @@ class StreamingQueryHandler(DoradoDisasterRecoveryBase): """ Query infos from files. 
""" - file_path = os.path.realpath(os.path.join(self.streaming_file_dir, file_name)) + file_path = os.path.realpath(os.path.join(self.dorado_file_dir, file_name)) if not os.path.isfile(file_path) and file_name in [DoradoDisasterRecoveryConstants.HADR_CLUSTER_STAT]: return "normal" if not os.path.isfile(file_path): diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_stop.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_stop.py index abe08902..be1c289e 100644 --- a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_stop.py +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_stop.py @@ -46,7 +46,7 @@ class DisasterRecoveryStopHandler(DoradoDisasterRecoveryBase): self.check_cluster_status(status_allowed=['Normal']) self.check_cluster_type(allowed_type='primary') self.check_is_under_upgrade() - self.write_streaming_step("2_check_cluster_step") + self.write_dorado_step("2_check_cluster_step") def _third_step_for_streaming_stop(self, step): """ @@ -57,7 +57,7 @@ class DisasterRecoveryStopHandler(DoradoDisasterRecoveryBase): self.logger.debug("Start third step of streaming stop.") self.remove_all_stream_repl_infos(guc_mode="reload") self.remove_streaming_cluster_file() - self.write_streaming_step("3_remove_config_step") + self.write_dorado_step("3_remove_config_step") def _fourth_step_for_streaming_stop(self, step): """ @@ -68,7 +68,7 @@ class DisasterRecoveryStopHandler(DoradoDisasterRecoveryBase): self.logger.debug("Start fourth step of streaming stop.") self.remove_streaming_pg_hba() self.restore_guc_params() - self.write_streaming_step("4_remove_pg_hba_step") + self.write_dorado_step("4_remove_pg_hba_step") def _fifth_step_for_streaming_stop(self, step): """ @@ -78,7 +78,7 @@ class DisasterRecoveryStopHandler(DoradoDisasterRecoveryBase): return self.logger.debug("Start fifth step of streaming start.") self.streaming_clean_replication_slot() - self.write_streaming_step("5_update_config_step") + self.write_dorado_step("5_update_config_step") def _sixth_step_for_streaming_stop(self, step): """ @@ -89,12 +89,12 @@ class DisasterRecoveryStopHandler(DoradoDisasterRecoveryBase): self.logger.debug("Start sixth step of streaming stop.") self.check_cluster_status(['Normal']) self.clean_global_config() - self.update_streaming_info("cluster", "normal") - self.clean_streaming_dir() + self.update_dorado_info("cluster", "normal") + self.clean_dorado_dir() def run(self): self.logger.log("Start remove streaming disaster relationship.") - step = self.query_streaming_step() + step = self.query_dorado_step() self._first_step_for_streaming_stop(step) self.parse_cluster_status() self._second_step_for_streaming_stop(step) diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_switchover.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_switchover.py index 2763ae77..3a2c077f 100644 --- a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_switchover.py +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_switchover.py @@ -18,8 +18,11 @@ # ---------------------------------------------------------------------------- # Description : streaming_disaster_recovery_switchover.py is a utility for # changing role between primary cluster and standby cluster. 
-
+import json
 import os
+import re
+import subprocess
+import sys
 import time
 from datetime import datetime, timedelta
 
@@ -36,103 +39,113 @@ from impl.dorado_disaster_recovery.ddr_constants import DoradoDisasterRecoveryCo
 class DisasterRecoverySwitchoverHandler(DoradoDisasterRecoveryBase):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
+        self.remote_replication_pairs_log_message = \
+            "Please configure \"Remote Replication Pairs\" correctly on the storage management interface.\n"\
+            "And check and grant appropriate permissions to the corresponding device files.\n"\
+            "After completing these steps, create the flag file %s to inform the tool and execute the tool again."
 
     def run(self):
         """
-        streaming disaster recovery switchover
+        dorado disaster recovery switchover
        """
-        self.logger.log("Start streaming disaster switchover.")
+        self.logger.log("Start dorado disaster switchover.")
         self.check_action_and_mode()
         self.check_switchover_workable()
-        self.init_cluster_conf()
         self.check_dn_instance_params()
         self.check_is_under_upgrade()
         try:
-            self.streaming_switchover_single_inst()
+            self.dorado_switchover_single_inst()
             self.clean_step_file()
+            self.clean_flag_file()
         except Exception as error:
             if self.params.mode == "primary":
-                self.update_streaming_info("cluster", "promote_fail")
+                self.update_dorado_info("cluster", "promote_fail")
             raise Exception(
                 ErrorCode.GAUSS_516["GAUSS_51632"] % "switchover" + "Error:%s" % str(error))
         finally:
             self.remove_cluster_maintance_file_for_switchover()
             self.remove_cluster_maintance_file()
-        self.logger.log("Successfully do streaming disaster recovery switchover.")
+        self.logger.log("Successfully do dorado disaster recovery switchover.")
+
+    def check_xlog_file_path(self):
+        """
+        Get and check xlog_file_path.
+        """
+        linkDev = self.dorado_info
+        if os.path.islink(linkDev):
+            linkDev = os.readlink(self.dorado_info)
+        if not os.access(linkDev, os.R_OK | os.W_OK):
+            self.logger.debug(ErrorCode.GAUSS_501["GAUSS_50113"] % (linkDev, self.user))
+            return False
+        return True
 
-    def streaming_switchover_single_inst(self):
+    def dorado_switchover_single_inst(self):
         """
-        streaming disaster recovery switchover for single_inst cluster
+        dorado disaster recovery switchover for single_inst cluster
         disaster_standby: expect primary cluster becomes standby
         primary: expect standby cluster becomes primary
         """
-        self.create_cluster_maintance_file("streaming switchover")
-        self.update_streaming_info("cluster", DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER)
-        stream_disaster_step = self.query_streaming_step()
+        self.create_cluster_maintance_file("dorado switchover")
+        self.update_dorado_info("cluster", DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER)
+        dorado_disaster_step = self.query_dorado_step()
         if self.params.mode == "primary":
-            end_time = datetime.now() + timedelta(seconds=self.params.waitingTimeout)
-            self.logger.log("Waiting for switchover barrier.")
-            while True:
-                switchover_barrier_list = self.check_streaming_disaster_switchover_barrier()
-                if len(switchover_barrier_list) == len(self.normal_dn_ids):
-                    break
-                if datetime.now() >= end_time:
-                    self.restart_cluster()
-                    raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] %
-                                    "check switchover_barrier on all main standby dn" +
-                                    " Because check timeout: %ss" %
-                                    str(self.params.waitingTimeout))
-                time.sleep(5)
-            self.streaming_failover_single_inst(stream_disaster_step,
-                                                DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER)
+            # Here we could wait for the "Remote Copy Pairs" synchronization
+            # status to complete before promoting.
+            self.dorado_failover_single_inst(dorado_disaster_step,
+                                             DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER)
         else:
self.add_cluster_maintance_file_for_switchover() try: - if stream_disaster_step < 1: + if dorado_disaster_step < 1: self.update_streaming_info(DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER, "10%") self.stop_cluster() - self.start_cluster() - self.streaming_disaster_set_master_cluster_in_switchover() - self.write_streaming_step("1_streaming_disaster_set_master_in_switchover") - if stream_disaster_step < 2: - self.update_streaming_info(DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER, "30%") - ClusterInstanceConfig.set_data_on_dcc(self.cluster_info, - self.logger, self.user, - {self.backup_open_key: "2"}) - self.stop_cluster() - self.write_streaming_step("2_stop_cluster_for_switchover") - if stream_disaster_step < 3: - self.set_cmserver_guc("backup_open", "2", "set") - self.set_cmagent_guc("agent_backup_open", "2", "set") - self.write_streaming_step("3_set_backup_open_2_done") - if stream_disaster_step < 4: - self.update_streaming_info(DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER, "50%") + self.write_dorado_step("1_dorado_disaster_stop_cluster_for_switchover") + flag_file = os.path.join(self.step_file_path, "remote_replication_pairs_done") + if os.path.exists(flag_file): + self.logger.debug("Delete file %s." % flag_file) + os.remove(flag_file) + self.logger.debug(self.remote_replication_pairs_log_message % flag_file) + sys.exit(0) + if dorado_disaster_step < 2: + self.update_dorado_info(DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER, "30%") + flag_file = os.path.join(self.step_file_path, "remote_replication_pairs_done") + if not os.path.exists(flag_file) or not self.check_xlog_file_path(): + self.logger.debug(self.remote_replication_pairs_log_message % flag_file) + sys.exit(0) + self.write_dorado_step("2_set_remote_replication_pairs_for_switchover") + if dorado_disaster_step < 3: + self.set_datanode_guc("cluster_run_mode", "cluster_standby", "set") + self.set_cmserver_guc("backup_open", "1", "set") + self.set_cmagent_guc("agent_backup_open", "1", "set") + self.write_dorado_step("3_set_cluster_guc_done") + if dorado_disaster_step < 4: + self.update_dorado_info(DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER, "50%") self.remove_cluster_maintance_file_for_switchover() self.remove_cluster_maintance_file() self.start_cluster() - self.write_streaming_step("4_start_cluster_done") - if stream_disaster_step < 5: + self.write_dorado_step("4_start_cluster_done") + if dorado_disaster_step < 5: self.wait_for_normal(timeout=self.params.waitingTimeout, - streaming_switchover="streaming_switchover") - self.streaming_clean_replication_slot() - self.update_streaming_info("cluster", "recovery") + dorado_switchover="dorado_switchover") + self.check_dorado_datanode_query_info(timeout=self.params.waitingTimeout, + dorado_switchover="dorado_switchover") + self.update_dorado_info("cluster", "recovery") except Exception as error: - self.logger.error("Failed to do streaming disaster cluster switchover, Error:" + self.logger.error("Failed to do dorado disaster cluster switchover, Error:" " \n%s" % str(error)) - rollback_step = self.query_streaming_step() + rollback_step = self.query_dorado_step() self.logger.debug("Roll back switchover step:%s" % rollback_step) self.remove_cluster_maintance_file_for_switchover() self.remove_cluster_maintance_file() - if rollback_step < 4 or (rollback_step >= 4 and - self.streaming_switchover_roll_back_condition()): - self.streaming_switchover_roll_back(update_query=True) + self.dorado_switchover_roll_back(update_query=True) self.clean_step_file() + 
self.clean_flag_file() raise Exception(error) self.remove_hadr_switchover_process_file() def remove_hadr_switchover_process_file(self): self.logger.debug("Remove hadr switchover process file for switchover.") - process_file = os.path.realpath(os.path.join(self.streaming_file_dir, + process_file = os.path.realpath(os.path.join(self.dorado_file_dir, ".hadr_switchover_stat")) cmd = "if [ -f {0} ]; then rm -rf {0}; fi".format(process_file) self.ssh_tool.executeCommand(cmd, hostList=self.connected_nodes) @@ -203,7 +216,7 @@ class DisasterRecoverySwitchoverHandler(DoradoDisasterRecoveryBase): def add_cluster_maintance_file_for_switchover(self): """ - add cluster_maintance file for streaming disaster switchover to disaster_standby + add cluster_maintance file for dorado disaster switchover to disaster_standby """ self.logger.debug("Start add cluster_maintance file for switchover.") try: @@ -311,7 +324,7 @@ class DisasterRecoverySwitchoverHandler(DoradoDisasterRecoveryBase): return True def wait_for_normal(self, timeout=DefaultValue.TIMEOUT_CLUSTER_START, - streaming_switchover=None): + dorado_switchover=None): """ function:Wait the cluster become Normal or Degraded input:NA @@ -346,10 +359,10 @@ class DisasterRecoverySwitchoverHandler(DoradoDisasterRecoveryBase): break else: self.logger.debug("Cluster status is %s(%s)." % ( - cluster_status.clusterStatus, cluster_status.clusterStatusDetail)) + cluster_status.clusterStatus, cluster_status.clusterStatusDetail)) if check_status != 0: - if streaming_switchover == "streaming_switchover": + if dorado_switchover == "dorado_switchover": raise Exception( ErrorCode.GAUSS_528["GAUSS_52800"] % (cluster_status.clusterStatus, cluster_status.clusterStatusDetail)) @@ -383,23 +396,29 @@ class DisasterRecoverySwitchoverHandler(DoradoDisasterRecoveryBase): % "set auto_csn_barrier" + "Error:%s" % output) self.logger.debug("Successfully %s auto_csn_barrier is %s." 
% (guc_mode, guc_value))
 
-    def streaming_switchover_roll_back(self, update_query=False):
+    def dorado_switchover_roll_back(self, update_query=False):
         """
-        streaming disaster cluster roll back in switchover
+        dorado disaster cluster roll back in switchover
         """
-        self.logger.log("Roll back streaming disaster cluster switchover...")
-        ClusterInstanceConfig.set_data_on_dcc(self.cluster_info,
-                                              self.logger, self.user,
-                                              {self.backup_open_key: "0"})
+        self.logger.log("Roll back dorado disaster cluster switchover...")
         self.stop_cluster()
-        self.set_cmserver_guc("backup_open", "0", "set")
-        self.set_cmagent_guc("agent_backup_open", "0", "set")
-        self.logger.log("Successfully modify cma and cms parameters to start according to primary "
+        if self.params.mode == "primary":
+            self.set_datanode_guc("cluster_run_mode", "cluster_standby", "set")
+            self.set_cmserver_guc("backup_open", "1", "set")
+            self.set_cmagent_guc("agent_backup_open", "1", "set")
+        else:
+            self.set_datanode_guc("cluster_run_mode", "cluster_primary", "set")
+            self.set_cmserver_guc("backup_open", "0", "set")
+            self.set_cmagent_guc("agent_backup_open", "0", "set")
+        self.logger.log("Successfully modified cma and cms parameters to start according to the original "
                        "cluster mode")
         if update_query:
-            self.update_streaming_info("cluster", "archive")
-        self.start_cluster()
-        self.logger.log("Successfully Roll back streaming disaster cluster switchover.")
+            self.update_dorado_info("cluster", "archive")
+        self.logger.debug("Please restore the original \"Remote Replication Pairs\" correctly on "
+                          "the storage management interface.\n"
+                          "And check and grant appropriate permissions to the corresponding device files.\n"
+                          "After completing these steps, start the cluster manually!")
+        self.logger.log("Successfully rolled back dorado disaster cluster switchover.")
 
     def check_streaming_disaster_switchover_barrier(self):
         """
@@ -432,13 +451,13 @@
                 and self.params.mode == "primary":
             self.logger.debug("The primary dn exist, do nothing except record the result file.")
             raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] %
-                            "streaming disaster cluster switchover, Because the primary cluster "
+                            "dorado disaster cluster switchover, because the primary cluster "
                             "[drClusterMode] parameter must be disaster_standby")
         if DefaultValue.is_disaster_cluster(self.cluster_info) and \
                 self.params.mode == "disaster_standby":
             self.logger.debug("The primary dn not exist, do nothing except record the result file.")
             raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] %
-                            "streaming disaster cluster switchover, Because the disaster_standby "
+                            "dorado disaster cluster switchover, because the disaster_standby "
                             "cluster [drClusterMode] parameter must be primary")
         self.logger.log("Waiting for cluster and all instances normal.")
         if self.params.mode == "primary":
@@ -447,7 +466,7 @@
             self.init_cluster_status()
             self.parse_cluster_status()
             if self.check_cluster_status(status_allowed=['Normal'], only_check=True,
-                                         is_log=False) and self.check_instances_ready_for_switchover():
+                    is_log=False) and self.check_instances_ready_for_switchover():
                 break
             if datetime.now() >= end_time:
                 raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"]
diff --git a/script/impl/dorado_disaster_recovery/params_handler.py b/script/impl/dorado_disaster_recovery/params_handler.py
index 530d7d6a..eb316755 100644
--- a/script/impl/dorado_disaster_recovery/params_handler.py
+++ 
b/script/impl/dorado_disaster_recovery/params_handler.py @@ -143,7 +143,7 @@ gs_sdr is a utility for streaming disaster recovery fully options. Usage: gs_sdr -? | --help gs_sdr -V | --version - gs_sdr -t start -m [primary|disaster_standby] -X XMLFILE [-U DR_USERNAME] [-W DR_PASSWORD] [--json JSONFILE] [--time-out=SECS] [-l LOGFILE] + gs_sdr -t start -m [primary|disaster_standby] -X XMLFILE [-U DR_USERNAME] [-W DR_PASSWORD] [--time-out=SECS] [-l LOGFILE] gs_sdr -t stop -X XMLFILE|--json JSONFILE [-l LOGFILE] gs_sdr -t switchover -m [primary|disaster_standby] [--time-out=SECS] [-l LOGFILE] gs_sdr -t failover [-l LOGFILE] @@ -160,7 +160,6 @@ General options: -W Disaster recovery user password. -X Path of the XML configuration file. -l Path of log file. - --json Path of params file for streaming options. --time-out=SECS Maximum waiting time when Main standby connect to the primary dn, default value is 1200s. """ @@ -201,8 +200,6 @@ class ParamsHandler(object): help='hadr user password.') parser.add_option('-X', dest='xml_path', type='string', help='Cluster config xml path.') - parser.add_option('--json', dest='json_path', type='string', - help='Config json file of streaming options') parser.add_option('--time-out=', dest='timeout', default="1200", type='string', help='time out.') parser.add_option("-l", dest='logFile', type='string', -- Gitee From 5c392b92a6dfde1837e2f4ff4ca992c89cbf5977 Mon Sep 17 00:00:00 2001 From: Hao Date: Mon, 14 Aug 2023 16:01:02 +0800 Subject: [PATCH 03/23] dorado diaster recovery start comit --- script/gs_ddr | 5 +- .../impl/dorado_disaster_recovery/ddr_base.py | 400 +++++++++--------- .../dorado_disaster_recovery/ddr_constants.py | 16 +- .../dorado_diaster_recovery_start.py | 89 ++-- .../dorado_disaster_recovery_query.py | 8 +- .../params_handler.py | 30 +- 6 files changed, 270 insertions(+), 278 deletions(-) diff --git a/script/gs_ddr b/script/gs_ddr index f7c3793f..120726bd 100644 --- a/script/gs_ddr +++ b/script/gs_ddr @@ -58,7 +58,6 @@ class DoradoStorageDisasterRecoveryBase(object): self.logger = None self.trace_id = uuid.uuid1().hex self.dorado_info = None - DoradoStorageDisasterRecoveryBase.mock_process_user_sensitive_info() self.__init_globals() @staticmethod @@ -69,11 +68,11 @@ class DoradoStorageDisasterRecoveryBase(object): def __init_globals(self): self.user = UserUtil.getUserInfo()['name'] - tmp_logger_file = ClusterLog.getOMLogPath(DoradoDisasterRecoveryConstants.STREAMING_LOG_FILE, self.user) + tmp_logger_file = ClusterLog.getOMLogPath(DoradoDisasterRecoveryConstants.DDR_LOG_FILE, self.user) tmp_logger = GaussLog(tmp_logger_file, 'parse_and_validate_params', trace_id=self.trace_id) self.params = ParamsHandler(tmp_logger, self.trace_id).get_valid_params() self.log_file = self.params.logFile if self.params.logFile else \ - ClusterLog.getOMLogPath(DoradoDisasterRecoveryConstants.STREAMING_LOG_FILE, self.user) + ClusterLog.getOMLogPath(DoradoDisasterRecoveryConstants.DDR_LOG_FILE, self.user) self.logger = GaussLog(self.log_file, self.params.task, trace_id=self.trace_id) diff --git a/script/impl/dorado_disaster_recovery/ddr_base.py b/script/impl/dorado_disaster_recovery/ddr_base.py index 8fd4b74d..70792908 100644 --- a/script/impl/dorado_disaster_recovery/ddr_base.py +++ b/script/impl/dorado_disaster_recovery/ddr_base.py @@ -432,7 +432,7 @@ class DoradoDisasterRecoveryBase(object): write streaming step :return: NA """ - self.logger.debug("Streaming action:[%s] record current step:[%s]" + self.logger.debug("Dorado disaster recovery action:[%s] record current 
step:[%s]" % (self.params.task, step)) with os.fdopen(os.open(self.step_file_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, DefaultValue.KEY_FILE_MODE_IN_OS), "w") as fp_write: @@ -792,6 +792,25 @@ class DoradoDisasterRecoveryBase(object): % ("set wal_keep_segments for inst:%s" % inst.instanceId, str(output))) self.logger.debug("Successfully [%s] shardNum [%s] node [%s] wal_keep_segments " "value [%s]." % (opt_type, inst.mirrorId, inst.hostname, value)) + + + def __set_dn_xlog_file_path(self, params_list): + """ + Set xlog_file_path value in primary dn + """ + (inst, opt_type, value, mpprc_file) = params_list + self.logger.debug("Start [%s] shardNum [%s] node [%s] wal_keep_segments value [%s]." + % (opt_type, inst.mirrorId, inst.hostname, value)) + cmd = "source %s; gs_guc %s " \ + "-N %s -D %s -c \"xlog_file_path = '%s'\" " % \ + (mpprc_file, opt_type, inst.node, inst.datadir, value) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + + "Options:%s, Error: \n%s " + % ("set xlog_file_path for inst:%s" % inst.instanceId, str(output))) + self.logger.debug("Successfully [%s] shardNum [%s] node [%s] wal_keep_segments " + "value [%s]." % (opt_type, inst.mirrorId, inst.hostname, value)) def set_wal_keep_segments(self, opt_type, value, restore_flag=False, only_mode=None): """ @@ -815,6 +834,78 @@ class DoradoDisasterRecoveryBase(object): parallelTool.parallelExecute(self.__set_wal_keep_segments_each_inst, params_list) self.logger.log("Successfully %s wal_keep_segments value: %s." % (opt_type, value)) + def set_xlog_file_path(self, xlog_file_path): + """ + guc set xlog_file_path value in primary dn + """ + self.__set_guc_param("xlog_file_path", xlog_file_path) + self.set_xlog_lock_file_path() + + def __set_xlog_lock_file_each_inst(self, params_list): + """ + Set xlog_lock_file_path value in each dn + """ + (inst, opt_type, value, mpprc_file) = params_list + self.logger.debug("Start [%s] shardNum [%s] node [%s] xlog_lock_file value [%s]." + % (opt_type, inst.mirrorId, inst.hostname, value)) + cmd = "source %s; pssh -H %s \"source %s ; gs_guc %s " \ + "-Z datanode -D %s -c \\\"xlog_lock_file_path = '%s'\\\"\"" % \ + (mpprc_file, inst.hostname, mpprc_file, opt_type, inst.datadir, value) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + + "Options:%s, Error: \n%s " + % ("set xlog_lock_file_path for inst:%s" % inst.instanceId, str(output))) + self.logger.debug("Successfully [%s] shardNum [%s] node [%s] xlog_lock_file_path " + "value [%s]." % (opt_type, inst.mirrorId, inst.hostname, value)) + + def set_xlog_lock_file_path(self, opt_type="set"): + """ + guc set xlog_lock_file_path value in primary dn + """ + self.logger.log("Starting %s xlog_lock_file_path param" % (opt_type)) + params_list=[] + for dbnode in self.cluster_info.dbNodes: + for inst in dbnode.datanodes: + lock_file = os.path.join(inst.datadir, "xlog_lock_file") + params_list.append((inst, opt_type, lock_file, self.mpp_file)) + + if not params_list: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] + % "obtain param list for set xlog_lock_file_path") + + parallelTool.parallelExecute(self.__set_xlog_lock_file_each_inst, params_list) + self.logger.log("Successfully %s xlog_lock_file_path param." 
% (opt_type)) + + def set_application_name(self): + """ + guc set application_name value + """ + self.logger.log("Starting set application_name param" ) + app_name_prefix = "dn_master" if self.params.mode == "primary" \ + else "dn_standby" + params_list=[] + for dbnode in self.cluster_info.dbNodes: + for inst in dbnode.datanodes: + app_name = "%s_%s" % (app_name_prefix, inst.instanceId) + params_list.append((inst, "set", app_name, self.mpp_file)) + + if not params_list: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] + % "obtain param list for set application_name") + + parallelTool.parallelExecute(self.__set_xlog_lock_file_each_inst, params_list) + self.logger.log("Successfully %s application_name param." % (opt_type)) + + def set_cluster_run_mode(self): + """ + guc set xlog_file_path value in primary dn + """ + cluster_run_mode = "cluster_primary" if self.params.mode == "primary" \ + else "cluster_standby" + self.__set_guc_param("cluster_run_mode", cluster_run_mode) + self.__set_guc_param("ha_module_debug", "off") + def __stop_one_node(self, node_id): """ Stop one node by node id @@ -859,7 +950,7 @@ class DoradoDisasterRecoveryBase(object): self.connected_nodes = connected_hosts return self.connected_nodes - def update_streaming_pg_hba(self): + def update_pg_hba(self): """ update pg_hba.conf, read config_param.json file and set other cluster ip :return:NA @@ -870,7 +961,7 @@ class DoradoDisasterRecoveryBase(object): self.mpp_file, OMCommand.getLocalScript( "Local_Config_Hba"), self.user, self.streaming_xml) self.logger.debug("Command for changing instance pg_hba.conf file: %s" % cmd) - self.get_all_connection_node_name("update_streaming_pg_hba") + self.get_all_connection_node_name("update_pg_hba") try: self.ssh_tool.scpFiles(self.streaming_xml, self.dorado_file_dir) self.ssh_tool.executeCommand(cmd, hostList=self.connected_nodes) @@ -881,75 +972,6 @@ class DoradoDisasterRecoveryBase(object): raise Exception(msg) self.logger.log("Successfully update pg_hba config.") - def __get_repl_info_cmd(self, node_name, ret, dn_inst, opt_mode, idx): - """ - get_repl_info_cmd - """ - if node_name != self.local_host: - set_cmd = "source %s; pssh -H %s \"source %s ; gs_guc %s " \ - "-Z datanode -D %s -c " \ - "\\\"replconninfo%s = 'localhost=%s localport=%s " \ - "localheartbeatport=%s localservice=%s remotehost=%s " \ - "remoteport=%s remoteheartbeatport=%s " \ - "remoteservice=%s iscascade=%s iscrossregion=%s'\\\"\"" - set_cmd = set_cmd % (self.mpp_file, node_name, - self.mpp_file, opt_mode, - dn_inst.datadir, idx, ret.group(1), - ret.group(2), ret.group(3), ret.group(4), - ret.group(5), ret.group(6), ret.group(7), - ret.group(8), "true", "false") - else: - set_cmd = "source %s ; gs_guc %s -Z datanode -D %s -c " \ - "\"replconninfo%s = 'localhost=%s localport=%s " \ - "localheartbeatport=%s localservice=%s remotehost=%s " \ - "remoteport=%s remoteheartbeatport=%s " \ - "remoteservice=%s iscascade=%s iscrossregion=%s'\"" - set_cmd = set_cmd % (self.mpp_file, opt_mode, - dn_inst.datadir, idx, ret.group(1), - ret.group(2), ret.group(3), ret.group(4), - ret.group(5), ret.group(6), ret.group(7), - ret.group(8), "true", "false") - return set_cmd - - def __set_original_repl_info(self, dn_inst, node_name, opt_mode="set"): - """ - Rectify original replconninfos - """ - orignal_ports = None - if not all([dn_inst, node_name]): - raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "obtain dn infos") - for idx in range(1, DoradoDisasterRecoveryConstants.MAX_REPLICATION_NUMS + 1): - if node_name == 
self.local_host: - cmd = "source %s; gs_guc check -Z datanode -D %s " \ - "-c 'cross_cluster_replconninfo%s'" % (self.mpp_file, dn_inst.datadir, idx) - else: - cmd = "source %s; pssh -H %s 'source %s; gs_guc check " \ - "-Z datanode -D %s -c \"cross_cluster_replconninfo%s\"'" \ - % (self.mpp_file, node_name, self.mpp_file, dn_inst.datadir, idx) - self.logger.debug("Check original repl infos with cmd:%s" % cmd) - status, output = CmdUtil.retryGetstatusoutput(cmd) - if status != 0: - raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + - " Error: \n%s " % output) - if output.count("=NULL") > 2 or "iscrossregion=true" in output.lower(): - self.logger.debug("InstanceID:%s, Index:%s" % (dn_inst.instanceId, idx)) - return idx, orignal_ports - ret = re.search( - r"cross_cluster_replconninfo%s='localhost=(\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3})" - r" localport=(\d{4,5}) " - r"remotehost=(\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}) " - r"remoteport=(\d{4,5}) " % idx, output) - if not ret: - raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "search repl infos") - set_cmd = self.__get_repl_info_cmd(node_name, ret, dn_inst, opt_mode, idx) - self.logger.debug("Set original repl infos with cmd:%s" % set_cmd) - status, output = CmdUtil.retryGetstatusoutput(set_cmd) - if status != 0: - raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % set_cmd + - " Error: \n%s " % output) - orignal_ports = (ret.group(2), ret.group(3), ret.group(4)) - self.logger.debug("Successfully rectify original repl infos for instance:%s." - % dn_inst.instanceId) def __get_local_data_ip(self, inst_host): """ @@ -967,115 +989,66 @@ class DoradoDisasterRecoveryBase(object): return data_ip raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] % "obtain shards from local cluster info") - - def __config_one_dn_instance(self, params): + + def __get_remote_ips(self): """ - Config replconninfo for one dn instance + Get remote dn data ip """ - inst, opt_mode, remote_cluster_info = params - local_data_ip = self.__get_local_data_ip(inst.hostname) - base_dn_port = self.params.remoteClusterConf['port'] - self.logger.debug("Start config instance:[%s], got dataIp:[%s], port:[%s]." 
- % (inst.instanceId, local_data_ip, base_dn_port)) - if not all([local_data_ip, base_dn_port]): - raise Exception(ErrorCode.GAUSS_502["GAUSS_50219"] - % "dn port or dataIp for config instance") - inst_index, original_ports = self.__set_original_repl_info( - inst, inst.hostname, opt_mode=opt_mode) - repl_params = [] - shards = remote_cluster_info.get("shards") + remoteClusterConf = self.params.remoteClusterConf + shards = remoteClusterConf["shards"] + indx = 1 + remote_ips = [] for shard in shards: - for node_info in shard: - data_ip = node_info.get("dataIp") - shard_num = node_info.get("shardNum", '1') - if str(inst.mirrorId) == str(shard_num): - repl_params.append(( - shard_num, inst.hostname, local_data_ip, - inst.datadir, data_ip, inst_index, - original_ports, base_dn_port, opt_mode)) - inst_index += 1 - return repl_params - - def __do_config_dn_repl_info(self, params): - """ - function:config postgres conf - :return:NA + for node in shard: + ip = node["ip"] + data_ip = node["dataIp"] + remote_ips.append(data_ip) + + return remote_ips + + def __config_one_dn_instance(self, params): """ - shard_num, host, local_data_ip, data_dir, data_ip, index, \ - original_ports, base_port, opt_mode = params - local_port, local_heartbeat, local_service = original_ports - remote_base = int(base_port) - self.logger.debug("shard num %s base port is %s" % (shard_num, remote_base)) - remote_port = remote_base + 1 - remote_heartbeat = remote_base + 5 - remote_service = remote_base + 4 - is_cascade = "false" - if self.local_host == host: - guc_cmd = "source %s ; gs_guc %s -Z datanode -D %s " \ - "-c \"replconninfo%s = 'localhost=%s localport=%s " \ - "localheartbeatport=%s localservice=%s remotehost=%s " \ - "remoteport=%s remoteheartbeatport=%s remoteservice=%s " \ - "iscascade=%s iscrossregion=true'\"" \ - % (self.mpp_file, opt_mode, data_dir, index, local_data_ip, local_port, - local_heartbeat, local_service, data_ip, remote_port, - remote_heartbeat, remote_service, is_cascade) - self.logger.debug("Set datanode postgres file for streaming " - "disaster cluster with cmd:%s" % guc_cmd) - else: - guc_cmd = "source %s; pssh -s -H %s \"source %s ; gs_guc %s -Z datanode -D %s " \ - "-c \\\"replconninfo%s = 'localhost=%s localport=%s " \ - "localheartbeatport=%s localservice=%s remotehost=%s " \ - "remoteport=%s remoteheartbeatport=%s remoteservice=%s " \ - "iscascade=%s iscrossregion=true'\\\"\"" \ - % (self.mpp_file, host, - self.mpp_file, opt_mode, data_dir, index, - local_data_ip, local_port, local_heartbeat, - local_service, data_ip, remote_port, - remote_heartbeat, remote_service, is_cascade) - self.logger.debug("Set datanode postgres file for streaming " - "disaster cluster with cmd:%s" % guc_cmd) - status, output = CmdUtil.retryGetstatusoutput(guc_cmd) - if status != 0: - raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % guc_cmd + - " Error: \n%s " % output) + Config cross_cluster_replconninfo for one dn instance + """ + inst, opt_mode = params + local_dn_ip = inst.listenIps[0] + local_port = inst.port + remote_port = self.params.remoteClusterConf['port'] + remote_data_ips = self.__get_remote_ips() + + idx = 1 + for remote_ip in remote_data_ips: + set_cmd = "source %s ; gs_guc set -N %s -D %s -c " \ + "\"cross_cluster_replconninfo%s = 'localhost=%s localport=%s " \ + "remotehost=%s remoteport=%s '\"" \ + % (self.mpp_file, inst.hostname, inst.datadir, idx, + local_dn_ip, local_port, remote_ip, remote_port) + self.logger.debug("Set dn cross cluster replinfos with cmd:%s" % set_cmd) + idx += 1 + status, 
output = CmdUtil.retryGetstatusoutput(set_cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % set_cmd + + " Error: \n%s " % output) + self.logger.debug("Successfully rectify original repl infos for instance:%s." + % inst.instanceId) + def config_cross_cluster_repl_info(self): """ - update postgresql.conf for replconninfo + update postgresql.conf for cross_cluster_replconninfo """ - self.logger.debug("set all datanode guc param in postgres conf for ddr cluster.") - repl_params = [] - opt_mode = "reload" if self.params.mode == "primary" else "set" + self.logger.debug("set all datanode guc param in postgres conf for cross_cluster_replconninfo.") + + opt_mode = "set" config_repl_params = [] datanode_instance = [inst for node in self.cluster_info.dbNodes for inst in node.datanodes] for inst in datanode_instance: - config_repl_params.append((inst, opt_mode, self.params.remoteClusterConf)) + config_repl_params.append((inst, opt_mode)) rets = parallelTool.parallelExecute(self.__config_one_dn_instance, config_repl_params) - for param in rets: - repl_params += param - self.logger.debug("Got repl params:%s" % str(repl_params)) - parallelTool.parallelExecute(self.__do_config_dn_repl_info, repl_params) + self.logger.debug( - "Successfully set all datanode guc param in postgres conf for streaming cluster.") - - def set_datanode_guc(self, guc_parameter, guc_value, guc_type, only_mode=None): - """ - set datanode guc param - :return: NA - """ - if only_mode and self.params.mode != only_mode: - self.logger.debug("Set datanode guc [%s] to [%s] not for mode:%s." - % (guc_parameter, guc_value, self.params.mode)) - return - cmd = "gs_guc %s -Z datanode -N all -I all -c \"%s=%s\" " % \ - (guc_type, guc_parameter, guc_value) - status, output = CmdUtil.retryGetstatusoutput(cmd) - if status != 0: - msg = ErrorCode.GAUSS_516['GAUSS_51632'] \ - % "set datanode guc [%s] to [%s], output:%s" \ - % (guc_parameter, guc_value, output) - self.logger.debug(msg) + "Successfully set all datanode guc param in postgres conf for cross_cluster_replconninfo.") def set_cmserver_guc(self, guc_parameter, guc_value, guc_type, only_mode=None): """ @@ -1086,8 +1059,8 @@ class DoradoDisasterRecoveryBase(object): self.logger.debug("Set cms guc [%s] to [%s] not for mode:%s." % (guc_parameter, guc_value, self.params.mode)) return - cmd = "gs_guc %s -Z cmserver -N all -I all -c \"%s=%s\" " % \ - (guc_type, guc_parameter, guc_value) + cmd = "source %s; cm_ctl %s --param --server -k \"%s=%s\" " % \ + (self.mpp_file, guc_type, guc_parameter, guc_value) status, output = CmdUtil.retryGetstatusoutput(cmd) if status != 0: msg = ErrorCode.GAUSS_516['GAUSS_51632'] \ @@ -1104,8 +1077,8 @@ class DoradoDisasterRecoveryBase(object): self.logger.debug("Set cma guc [%s] to [%s] not for mode:%s." 
% (guc_parameter, guc_value, self.params.mode)) return - cmd = "gs_guc %s -Z cmagent -N all -I all -c \"%s=%s\" " % \ - (guc_type, guc_parameter, guc_value) + cmd = "source %s; cm_ctl %s --param --agent -k \"%s=%s\" " % \ + (self.mpp_file, guc_type, guc_parameter, guc_value) status, output = CmdUtil.retryGetstatusoutput(cmd) if status != 0: msg = ErrorCode.GAUSS_516['GAUSS_51632'] \ @@ -1300,18 +1273,65 @@ class DoradoDisasterRecoveryBase(object): raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "full build from remote cluster" + error_detail) self.logger.debug("Successfully build cascade standby dn:%s" % inst.instanceId) - - def start_dss_instance(self, only_mode=None): + + def __start_dss_and_build(self, params): """ Start dss server process """ - cmd = "source %s; export DSS_MAINTAIN=TRUE; dssserver -D %s & " % self.dss_home_dir + inst, mpprc_file = params + self.logger.debug("Start dssserver on node [%s] ." % inst.hostname) + + cmd = "source %s; pssh -H %s \"source %s ; export DSS_MAINTAIN=TRUE; " \ + " dssserver -D %s & \"" % (mpprc_file,inst.hostname, mpprc_file, self.dss_home_dir) status, output = CmdUtil.retryGetstatusoutput(cmd) if status != 0: - self.logger.error(ErrorCode.GAUSS_516["GAUSS_51600"] + - "status(%d), output(%s)" % (status, output)) + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + + "Options:%s, Error: \n%s " + % ("start dssserver on node :%s" % inst.hostname, str(output))) + self.logger.debug("Successfully Start dssserver on node [%s] " % inst.hostname) + + build_cmd = "source %s; pssh -H %s \"source %s ; gs_ctl build -D %s -b cross_cluster_full -g 0 -q " \ + % (mpprc_file,inst.hostname, mpprc_file, inst.datadir) + status, output = CmdUtil.retryGetstatusoutput(build_cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % build_cmd + + "Options:%s, Error: \n%s " + % ("build main_standby on node :%s" % inst.hostname, str(output))) + self.logger.debug("Successfully build main_standby in disaster standby cluster on node [%s] " % inst.hostname) + + kill_cmd = "source %s; pssh -H %s \"source %s ; ps ux | grep dssserver | grep -v grep | awk '{print $2}' | xargs kill -9 " \ + % (mpprc_file,inst.hostname, mpprc_file) + status, output = CmdUtil.retryGetstatusoutput(kill_cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % kill_cmd + + "Options:%s, Error: \n%s " + % ("stop dssserver before start cluster on node :%s" % inst.hostname, str(output))) + self.logger.debug("Successfully stop dssserver before start cluster on node [%s] " % inst.hostname) return output + def start_dss_instance(self, only_mode=None): + """ + Start dss server process + """ + if self.params.mode == "primary" or self.params.mode != only_mode: + self.logger.debug("start dssserver step is not for mode:%s." % self.params.mode) + return + primary_dn = [dn_inst for db_node in self.cluster_info.dbNodes for dn_inst in + db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] + + params_list = [] + for inst in primary_dn: + params_list.append((inst, self.mpp_file)) + + if not params_list: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] + % "obtain param list for start dssserver in disaster_standby") + parallelTool.parallelExecute(self.__start_dss_and_build, params_list) + self.logger.log("Successfully start dssserver and build main_standby inst : %s." 
% primary_dn) + return + + + def kill_dss_instance(self, only_mode=None): """ Start dss server process @@ -1429,26 +1449,19 @@ class DoradoDisasterRecoveryBase(object): self.logger.debug("The primary dn does not exist on current cluster.") return self.primary_dn_ids = p_inst_list - sql_check = "select 1 from pg_catalog.gs_hadr_local_rto_and_rpo_stat();" - sql_check_2 = "select 1 from pg_catalog.pg_stat_get_wal_senders() where " \ + sql_check = "select 1 from pg_catalog.pg_stat_get_wal_senders() where " \ "sync_state='Async' and peer_role='Standby' and peer_state='Normal';" param_list = [(dn_inst, sql_check) for db_node in self.cluster_info.dbNodes - for dn_inst in db_node.datanodes - if dn_inst.instanceId in self.primary_dn_ids] - param_list_2 = [(dn_inst, sql_check_2) for db_node in self.cluster_info.dbNodes - for dn_inst in db_node.datanodes if dn_inst.instanceId - in self.primary_dn_ids] + for dn_inst in db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] + if not param_list: raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "obtain param list for check main standby connection on primary dn") self.logger.debug("Start check main standby connection with sql:%s." % sql_check) results = parallelTool.parallelExecute(self.__check_one_main_standby_connection, param_list) - self.logger.debug("Start check main standby connection with sql:%s." % sql_check_2) - results_2 = parallelTool.parallelExecute(self.__check_one_main_standby_connection, - param_list_2) - return all(results+results_2) + return all(results) def wait_main_standby_connection(self, only_mode=None): if only_mode and self.params.mode != only_mode: @@ -2051,13 +2064,13 @@ class DoradoDisasterRecoveryBase(object): self.logger.debug("Update query [%s] to [%s]." % (key, value)) try: if key == "cluster": - key_stat = DoradoDisasterRecoveryConstants.HADR_CLUSTER_STAT + key_stat = DoradoDisasterRecoveryConstants.DDR_CLUSTER_STAT elif key == DoradoDisasterRecoveryConstants.ACTION_FAILOVER: - key_stat = DoradoDisasterRecoveryConstants.HADR_FAILOVER_STAT + key_stat = DoradoDisasterRecoveryConstants.DDR_FAILOVER_STAT elif key == DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER: - key_stat = DoradoDisasterRecoveryConstants.HADR_SWICHOVER_STAT + key_stat = DoradoDisasterRecoveryConstants.DDR_SWICHOVER_STAT elif key == DoradoDisasterRecoveryConstants.ACTION_ESTABLISH: - key_stat = DoradoDisasterRecoveryConstants.HADR_ESTABLISH_STAT + key_stat = DoradoDisasterRecoveryConstants.DDR_ESTABLISH_STAT else: self.logger.debug("key error.") return @@ -2128,7 +2141,6 @@ class DoradoDisasterRecoveryBase(object): (len(channel) != 1 or "<--" not in channel[0]): check_ok = -1 elif state == "Standby": - # 不管是主集群,还是灾难备集群仅仅检查 local_role 只有一个元素 Standby 和 db_state 为 Normal if (len(dbState) != 1 or dbState[0] != "Normal") or \ (len(localRole) != 1 or localRole[0] != "Standby"): check_ok = -1 @@ -2287,8 +2299,8 @@ class DoradoDisasterRecoveryBase(object): def streaming_clean_archive_slot(self): """ - drop lot_type is physical and slot_name not contain (gs_roach_full,gs_roach_inc, - cn_xxx,dn_xxx, dn_xxx_hadr) on all cn node and all primary dn node if the + drop lot_type is physical and slot_name not contain (gs_roach_full,gs_roach_inc, + cn_xxx,dn_xxx, dn_xxx_hadr) on all cn node and all primary dn node if the slot_name exists when the disaster cluster become primary cluster """ self.logger.debug("Starting drop archive slots") diff --git a/script/impl/dorado_disaster_recovery/ddr_constants.py b/script/impl/dorado_disaster_recovery/ddr_constants.py 
index 60d8b3da..8469e324 100644 --- a/script/impl/dorado_disaster_recovery/ddr_constants.py +++ b/script/impl/dorado_disaster_recovery/ddr_constants.py @@ -16,8 +16,8 @@ # MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. # See the Mulan PSL v2 for more details. # ---------------------------------------------------------------------------- -# Description : streaming_constants.py is utility for defining constants -# of streaming disaster recovery. +# Description : ddr_constants.py is utility for defining constants +# of dorado disaster recovery. ############################################################################# @@ -46,18 +46,18 @@ class DoradoDisasterRecoveryConstants: ACTION_ESTABLISH = "establish" # streaming query temp file - HADR_CLUSTER_STAT = ".hadr_cluster_stat" - HADR_FAILOVER_STAT = ".hadr_failover_stat" - HADR_SWICHOVER_STAT = ".hadr_switchover_stat" - HADR_ESTABLISH_STAT = ".hadr_establish_stat" + DDR_CLUSTER_STAT = ".ddr_cluster_stat" + DDR_FAILOVER_STAT = ".ddr_failover_stat" + DDR_SWICHOVER_STAT = ".ddr_switchover_stat" + DDR_ESTABLISH_STAT = ".ddr_establish_stat" - STREAM_DISTRIBUTE_ACTION = "distribute_stream_failover" + DDR_DISTRIBUTE_ACTION = "distribute_stream_failover" # GUC CHANGE MAP GUC_CHANGE_MAP = {} # params in json file for each module - STREAMING_JSON_PARAMS = { + DDR_JSON_PARAMS = { "start": ["localClusterConf", "remoteClusterConf"], "stop": ["localClusterConf", "remoteClusterConf"], "switchover": [], diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py index 7d57043b..98f70007 100644 --- a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py @@ -63,50 +63,32 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): #self.check_dn_instance_params() self.write_dorado_step("2_check_cluster_step") + def common_step_for_ddr_start(self): + """ + Common step for ddr start between step 1 and 2 + """ + self.logger.debug("Start common config step of ddr start.") + self.distribute_cluster_conf() + #调用local/ConfigHba.py和streaming_xml进行设置,考虑使用gs_guc set适配 + self.update_pg_hba() + self.config_cross_cluster_repl_info() + self.set_xlog_file_path(self.dorado_info) + self.set_application_name() + self.set_cluster_run_mode() + def _third_step_for_ddr_start(self, step): """ Third step for streaming start """ if step >= 3: return - self.logger.debug("Start third step of streaming start.") - #self.drop_replication_slot_on_dr_cluster(only_mode="disaster_standby") + self.logger.debug("Start third step of ddr start.") + #self.prepare_gs_secure_files(only_mode='primary') #self.build_and_distribute_key_files(only_mode='disaster_standby') #self.get_default_wal_keep_segments(only_mode='primary') self.write_dorado_step("3_set_wal_segments_step") - def drop_replication_slot_on_dr_cluster(self, only_mode=None): - """ - Drop replication slot on dr cluster - """ - if only_mode and self.params.mode != only_mode: - self.logger.debug("Drop replication slot opts not for mode:%s." 
% self.params.mode) - return - sql_check = "select slot_name from pg_get_replication_slots() where slot_type='logical'" - primary_dns = DefaultValue.get_primary_dn_instance_id("Primary", ignore=True) - if not primary_dns: - return - primary_insts = [inst for node in self.cluster_info.dbNodes - for inst in node.datanodes if str(inst.instanceId) in primary_dns] - dn_inst = primary_insts[0] - self.logger.debug("Start drop node %s [%s] slots" % (dn_inst.hostname, dn_inst.instanceId)) - status, output = ClusterCommand.remoteSQLCommand( - sql_check, self.user, dn_inst.hostname, dn_inst.port) - self.logger.debug("Get %s all replication slots, status=%d, output: %s." % - (dn_inst.instanceId, status, SensitiveMask.mask_pwd(output))) - if status == 0 and output.strip(): - drop_slots = output.strip().split('\n') - for slot in drop_slots: - self.logger.debug("Starting drop node %s %s" % (dn_inst.instanceId, slot.strip())) - sql = "select * from pg_drop_replication_slot('%s');" % slot.strip() - status_dr, output_dr = ClusterCommand.remoteSQLCommand( - sql, self.user, dn_inst.hostname, dn_inst.port) - if status_dr != 0: - self.logger.debug("Failed to remove node %s %s with error: %s" % ( - dn_inst.hostname, slot.strip(), SensitiveMask.mask_pwd(output_dr))) - self.logger.debug( - "Successfully drop node %s %s" % (dn_inst.instanceId, slot.strip())) def _fourth_step_for_ddr_start(self, step): """ @@ -114,7 +96,7 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): """ if step >= 4: return - self.logger.debug("Start fourth step of streaming start.") + self.logger.debug("Start fourth step of ddr start.") self.set_wal_keep_segments( "reload", DoradoDisasterRecoveryConstants.MAX_WAL_KEEP_SEGMENTS, only_mode='primary') self.write_dorado_step("4_set_wal_segments_step") @@ -125,21 +107,14 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): """ if step >= 5: return - self.logger.debug("Start fifth step of streaming start.") - self.set_data_in_dcc(self.backup_open_key, "0", only_mode='primary') - self.set_data_in_dcc(self.backup_open_key, "1", only_mode='disaster_standby') - #self.set_most_available(mode="reload", raise_error=False) - self.stop_cluster_by_node(only_mode='disaster_standby') - self.write_dorado_step("5_set_wal_segments_step") - - def common_step_for_ddr_start(self): - """ - Common step for ddr start between step 1 and 2 - """ - self.logger.debug("Start common config step of ddr start.") - self.distribute_cluster_conf() - self.update_streaming_pg_hba() - self.config_cross_cluster_repl_info() + self.logger.debug("Start fifth step of ddr start.") + # self.set_data_in_dcc(self.backup_open_key, "0", only_mode='primary') + # self.set_data_in_dcc(self.backup_open_key, "1", only_mode='disaster_standby') + # self.set_most_available(mode="reload", raise_error=False) + #self.stop_cluster_by_node(only_mode='disaster_standby') + self.stop_cluster() + self.start_cluster(only_mode="primary") + self.write_ddr_step("5_set_wal_segments_step") def _sixth_step_for_ddr_start(self, step): """ @@ -158,12 +133,12 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): """ if step >= 7: return - self.logger.debug("Start seventh step of streaming start.") + self.logger.debug("Start seventh step of ddr start.") self.update_dorado_info("cluster", "restore", only_mode='disaster_standby') try: self.start_dss_instance(only_mode='disaster_standby') - self.build_dn_instance(only_mode='disaster_standby') - self.kill_dss_instance(only_mode='disaster_standby') + # 
self.build_dn_instance(only_mode='disaster_standby') + # self.kill_dss_instance(only_mode='disaster_standby') except Exception as error: self.update_dorado_info("cluster", "restore_fail", only_mode='disaster_standby') raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "build dns" + "Error:%s" % error) @@ -175,7 +150,7 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): """ if step >= 8: return - self.logger.debug("Start eighth step of streaming start.") + self.logger.debug("Start eighth step of ddr start.") self.start_cluster(cm_timeout=DoradoDisasterRecoveryConstants.STANDBY_START_TIMEOUT, only_mode='disaster_standby') self.update_dorado_info("cluster", "full_backup", only_mode='primary') @@ -199,7 +174,7 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): return self.logger.debug("Start ninth step of streaming start.") #self.restore_wal_keep_segments(only_mode='primary') - self.clean_gs_secure_dir() + #self.clean_gs_secure_dir() self.clean_step_file() def _check_and_refresh_disaster_user_permission(self): @@ -228,13 +203,11 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): self._first_step_for_ddr_start(step) #1.检查集群状态正常 self.parse_cluster_status() - #dorado存储复制没有流复制user - #self._check_and_refresh_disaster_user_permission() + #检查集群内dn状态和cm服务 self._second_step_for_ddr_start(step) #更新pg_hba和replinfo self.common_step_for_ddr_start() - self._third_step_for_ddr_start(step) - self._fourth_step_for_ddr_start(step) + self._fifth_step_for_ddr_start(step) #设置CM backup_open参数,灾备backup_open=1, 主集群backup_open=0 self._sixth_step_for_ddr_start(step) diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_query.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_query.py index a2825fe9..371582ae 100644 --- a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_query.py +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_query.py @@ -36,7 +36,7 @@ class StreamingQueryHandler(DoradoDisasterRecoveryBase): Query infos from files. 
""" file_path = os.path.realpath(os.path.join(self.dorado_file_dir, file_name)) - if not os.path.isfile(file_path) and file_name in [DoradoDisasterRecoveryConstants.HADR_CLUSTER_STAT]: + if not os.path.isfile(file_path) and file_name in [DoradoDisasterRecoveryConstants.DDR_CLUSTER_STAT]: return "normal" if not os.path.isfile(file_path): return "0%" @@ -141,15 +141,15 @@ class StreamingQueryHandler(DoradoDisasterRecoveryBase): self.parse_cluster_status(current_status=cluster_info) self.check_is_under_upgrade() check_cluster_stat = self.get_streaming_cluster_query_value( - DoradoDisasterRecoveryConstants.HADR_CLUSTER_STAT) + DoradoDisasterRecoveryConstants.DDR_CLUSTER_STAT) archive_status = self.check_archive(check_cluster_stat, self.cluster_status) recovery_status = self.check_recovery(check_cluster_stat, self.cluster_status) hadr_cluster_stat = archive_status or recovery_status or check_cluster_stat hadr_failover_stat = self.get_streaming_cluster_query_value( - DoradoDisasterRecoveryConstants.HADR_FAILOVER_STAT) + DoradoDisasterRecoveryConstants.DDR_FAILOVER_STAT) hadr_switchover_stat = self.get_streaming_cluster_query_value( - DoradoDisasterRecoveryConstants.HADR_SWICHOVER_STAT) + DoradoDisasterRecoveryConstants.DDR_SWICHOVER_STAT) if hadr_cluster_stat != "promote": hadr_failover_stat = "" if hadr_cluster_stat != "switchover": diff --git a/script/impl/dorado_disaster_recovery/params_handler.py b/script/impl/dorado_disaster_recovery/params_handler.py index eb316755..761ab39c 100644 --- a/script/impl/dorado_disaster_recovery/params_handler.py +++ b/script/impl/dorado_disaster_recovery/params_handler.py @@ -26,7 +26,7 @@ import json import optparse import getpass -from impl.streaming_disaster_recovery.streaming_constants import DoradoDisasterRecoveryConstants +from impl.dorado_disaster_recovery.ddr_constants import DoradoDisasterRecoveryConstants from gspylib.common.DbClusterInfo import dbClusterInfo from gspylib.common.ErrorCode import ErrorCode from base_utils.security.security_checker import SecurityChecker, ValidationError @@ -76,6 +76,13 @@ def check_wait_timeout(value): description = "wait timeout" SecurityChecker.check_is_digit(description, value) +def check_dorado_config(value): + """ + Check dorado config + """ + description = "dorado config" + SecurityChecker.check_is_string(description, value) + def check_local_cluster_conf(value): """ @@ -115,8 +122,9 @@ STREAMING_PARAMS_FOR_MODULE = { "start": { "mode": check_streaming_start_mode, "xml_path": check_xml_file, - "hadrUserName": check_hadr_user, - "hadrUserPassword": check_hadr_pwd, + # "hadrUserName": check_hadr_user, + # "hadrUserPassword": check_hadr_pwd, + "doradoConfig": check_wait_timeout, "waitingTimeout": check_wait_timeout, "localClusterConf": check_local_cluster_conf, "remoteClusterConf": check_remote_cluster_conf @@ -194,10 +202,10 @@ class ParamsHandler(object): '"switchover", "failover", "query"') parser.add_option('-m', dest='mode', type='string', help='Cluster run mode. 
It could be ["primary", "disaster_standby"].') - parser.add_option('-U', dest='hadrusername', type='string', - help='hadr user name.') - parser.add_option('-W', dest='hadruserpasswd', type='string', - help='hadr user password.') + # parser.add_option('-U', dest='hadrusername', type='string', + # help='hadr user name.') + # parser.add_option('-W', dest='hadruserpasswd', type='string', + # help='hadr user password.') parser.add_option('-X', dest='xml_path', type='string', help='Cluster config xml path.') parser.add_option('--time-out=', dest='timeout', default="1200", type='string', @@ -236,7 +244,7 @@ class ParamsHandler(object): with open(file_path, 'r') as read_fp: param_dict = json.load(read_fp) for key, value in param_dict.items(): - if key not in DoradoDisasterRecoveryConstants.STREAMING_JSON_PARAMS[self.params.task]: + if key not in DoradoDisasterRecoveryConstants.DDR_JSON_PARAMS[self.params.task]: continue setattr(self.params, key, value) return @@ -278,10 +286,10 @@ class ParamsHandler(object): self.__print_version_info() if not hasattr(self.params, 'task') or not self.params.task: raise ValidationError(ErrorCode.GAUSS_500["GAUSS_50001"] % 't' + ".") - if self.params.task not in DoradoDisasterRecoveryConstants.STREAMING_JSON_PARAMS.keys(): + if self.params.task not in DoradoDisasterRecoveryConstants.DDR_JSON_PARAMS.keys(): raise ValidationError(ErrorCode.GAUSS_500["GAUSS_50004"] % 't') # parse arguments in json/xml file - if DoradoDisasterRecoveryConstants.STREAMING_JSON_PARAMS[self.params.task]: + if DoradoDisasterRecoveryConstants.DDR_JSON_PARAMS[self.params.task]: self.__cluster_conf_parser(self.params.json_path) def __reload_hadr_user_info(self): @@ -331,7 +339,7 @@ class ParamsHandler(object): self.logger.log('Streaming disaster recovery ' + self.params.task + ' ' + self.trace_id) self.logger.log(DoradoDisasterRecoveryConstants.LOG_REMARK) self.__init_default_params() - self.__reload_hadr_user_info() + #self.__reload_hadr_user_info() for param_name, validate in STREAMING_PARAMS_FOR_MODULE[self.params.task].items(): check_value = getattr(self.params, param_name) if self.params.task == "stop": -- Gitee From 2ca0bb5f98cc7e0eec86053b3d053538eaa6c77b Mon Sep 17 00:00:00 2001 From: Hao Date: Mon, 14 Aug 2023 21:57:25 +0800 Subject: [PATCH 04/23] dorado config params --- build.sh | 6 ++++-- build/get_PlatForm_str.sh | 10 +++++----- script/gs_ddr | 1 + .../ddr_modules/dorado_diaster_recovery_start.py | 2 +- script/impl/dorado_disaster_recovery/params_handler.py | 4 ++-- 5 files changed, 13 insertions(+), 10 deletions(-) diff --git a/build.sh b/build.sh index ccf89dda..e4f76dba 100644 --- a/build.sh +++ b/build.sh @@ -54,7 +54,7 @@ done PLAT_FORM_STR=$(sh "${ROOT_DIR}/build/get_PlatForm_str.sh") if [ "${PLAT_FORM_STR}"x == "Failed"x ]; then - echo "We only support openEuler(aarch64), EulerOS(aarch64), FusionOS, CentOS, UnionTech(X86) platform." + echo "We only support kylin(aarch64), EulerOS(aarch64), FusionOS, CentOS, UnionTech(X86) platform." 
exit 1; fi @@ -68,6 +68,8 @@ if [ X$(echo $PLAT_FORM_STR | grep "centos") != X"" ]; then dist_version="CentOS" elif [ X$(echo $PLAT_FORM_STR | grep "openeuler") != X"" ]; then dist_version="openEuler" +elif [ X$(echo $PLAT_FORM_STR | grep "kylin") != X"" ]; then + dist_version="kylin" elif [ X$(echo $PLAT_FORM_STR | grep "fusionos") != X"" ]; then dist_version="FusionOS" elif [ X$(echo $PLAT_FORM_STR | grep "euleros") != X"" ]; then @@ -79,7 +81,7 @@ elif [ X$(echo $PLAT_FORM_STR | grep "asianux") != X"" ]; then elif [ X$(echo $PLAT_FORM_STR | grep "UnionTech") != X"" ]; then dist_version="UnionTech" else - echo "We only support openEuler(aarch64), EulerOS(aarch64), FusionOS, CentOS, Ubuntu(x86), UnionTech(x86) platform." + echo "We only support kylin(aarch64), EulerOS(aarch64), FusionOS, CentOS, Ubuntu(x86), UnionTech(x86) platform." echo "Kernel is $kernel" exit 1 fi diff --git a/build/get_PlatForm_str.sh b/build/get_PlatForm_str.sh index 2bd8af9b..98e3233e 100644 --- a/build/get_PlatForm_str.sh +++ b/build/get_PlatForm_str.sh @@ -19,14 +19,14 @@ function get_os_str() { cpu_arc=$(uname -p) - if [ "$os_name"x = "centos"x ] && [ "$cpu_arc"x = "x86_64"x ]; then + if [ "$os_name"x = "centos"x ] && [ "$cpu_arc"x = "x86_64"x ]; then os_str=centos7.6_x86_64 elif [ "$os_name"x = "euleros"x ] && [ "$cpu_arc"x = "aarch64"x ]; then os_str=euleros2.0_sp8_aarch64 - elif [ "$os_name"x = "openEuler"x ] && [ "$cpu_arc"x = "aarch64"x ]; then - os_str=openeuler_aarch64 - elif [ "$os_name"x = "openEuler"x ] && [ "$cpu_arc"x = "x86_64"x ]; then - os_str=openeuler_x86_64 + elif [ "$os_name"x = "kylin"x ] && [ "$cpu_arc"x = "aarch64"x ]; then + os_str=kylin_aarch64 + elif [ "$os_name"x = "kylin"x ] && [ "$cpu_arc"x = "x86_64"x ]; then + os_str=kylin_x86_64 elif [ "$os_name"x = "fusionos"x ] && [ "$cpu_arc"x = "aarch64"x ]; then os_str=fusionos_aarch64 elif [ "$os_name"x = "fusionos"x ] && [ "$cpu_arc"x = "x86_64"x ]; then diff --git a/script/gs_ddr b/script/gs_ddr index 120726bd..3e699349 100644 --- a/script/gs_ddr +++ b/script/gs_ddr @@ -71,6 +71,7 @@ class DoradoStorageDisasterRecoveryBase(object): tmp_logger_file = ClusterLog.getOMLogPath(DoradoDisasterRecoveryConstants.DDR_LOG_FILE, self.user) tmp_logger = GaussLog(tmp_logger_file, 'parse_and_validate_params', trace_id=self.trace_id) self.params = ParamsHandler(tmp_logger, self.trace_id).get_valid_params() + self.dorado_info = self.params.doradoConfig self.log_file = self.params.logFile if self.params.logFile else \ ClusterLog.getOMLogPath(DoradoDisasterRecoveryConstants.DDR_LOG_FILE, self.user) self.logger = GaussLog(self.log_file, self.params.task, trace_id=self.trace_id) diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py index 98f70007..a6311d89 100644 --- a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py @@ -72,7 +72,7 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): #调用local/ConfigHba.py和streaming_xml进行设置,考虑使用gs_guc set适配 self.update_pg_hba() self.config_cross_cluster_repl_info() - self.set_xlog_file_path(self.dorado_info) + self.set_xlog_file_path(self.params.doradoConfig) self.set_application_name() self.set_cluster_run_mode() diff --git a/script/impl/dorado_disaster_recovery/params_handler.py b/script/impl/dorado_disaster_recovery/params_handler.py index 761ab39c..4857acba 100644 --- 
a/script/impl/dorado_disaster_recovery/params_handler.py +++ b/script/impl/dorado_disaster_recovery/params_handler.py @@ -124,7 +124,7 @@ STREAMING_PARAMS_FOR_MODULE = { "xml_path": check_xml_file, # "hadrUserName": check_hadr_user, # "hadrUserPassword": check_hadr_pwd, - "doradoConfig": check_wait_timeout, + "doradoConfig": check_dorado_config, "waitingTimeout": check_wait_timeout, "localClusterConf": check_local_cluster_conf, "remoteClusterConf": check_remote_cluster_conf @@ -212,7 +212,7 @@ class ParamsHandler(object): help='time out.') parser.add_option("-l", dest='logFile', type='string', help='Path of log file.') - parser.add_option("--dorado-info", dest='dorado_info', type='string', + parser.add_option("--dorado-config", dest='doradoConfig', type='string', help='Path of dorado xlog share disk.') return parser -- Gitee From 121afb4409fb783a033dace6fe447cfca652bb6f Mon Sep 17 00:00:00 2001 From: Hao Date: Tue, 15 Aug 2023 11:47:49 +0800 Subject: [PATCH 05/23] update ddr_base.py --- .../impl/dorado_disaster_recovery/ddr_base.py | 49 ++++++++++++------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/script/impl/dorado_disaster_recovery/ddr_base.py b/script/impl/dorado_disaster_recovery/ddr_base.py index 70792908..cda0fca1 100644 --- a/script/impl/dorado_disaster_recovery/ddr_base.py +++ b/script/impl/dorado_disaster_recovery/ddr_base.py @@ -838,8 +838,24 @@ class DoradoDisasterRecoveryBase(object): """ guc set xlog_file_path value in primary dn """ - self.__set_guc_param("xlog_file_path", xlog_file_path) - self.set_xlog_lock_file_path() + self.set_xlog_path(xlog_file_path) + self.set_xlog_lock_file() + + def set_xlog_path(self, xlog_file_path): + """ + guc set xlog_file_path value + """ + self.logger.log("Starting set xlog_lock_file_path param") + cmd = "source %s && gs_guc set -Z datanode -N all -I all " \ + "-c \"xlog_file_path='%s'\"" \ + % (self.mpp_file, xlog_file_path) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + "Error:%s" % output) + else: + self.logger.debug("Successfully set xlog_file_path %s." % xlog_file_path) + + self.logger.log("Successfully %s xlog_lock_file_path param." % (opt_type)) def __set_xlog_lock_file_each_inst(self, params_list): """ @@ -859,7 +875,7 @@ class DoradoDisasterRecoveryBase(object): self.logger.debug("Successfully [%s] shardNum [%s] node [%s] xlog_lock_file_path " "value [%s]." 
% (opt_type, inst.mirrorId, inst.hostname, value)) - def set_xlog_lock_file_path(self, opt_type="set"): + def set_xlog_lock_file(self, opt_type="set"): """ guc set xlog_lock_file_path value in primary dn """ @@ -956,22 +972,19 @@ class DoradoDisasterRecoveryBase(object): :return:NA """ self.logger.log("Start update pg_hba config.") - FileUtil.cpFile(self.params.xml_path, self.streaming_xml) - cmd = "source %s; %s -U %s -X '%s' --try-reload" % ( - self.mpp_file, OMCommand.getLocalScript( - "Local_Config_Hba"), self.user, self.streaming_xml) - self.logger.debug("Command for changing instance pg_hba.conf file: %s" % cmd) - self.get_all_connection_node_name("update_pg_hba") - try: - self.ssh_tool.scpFiles(self.streaming_xml, self.dorado_file_dir) - self.ssh_tool.executeCommand(cmd, hostList=self.connected_nodes) - except Exception as error: - msg = ErrorCode.GAUSS_516['GAUSS_51632'] \ - % "update streaming pg_hba with error:%s" % error - self.logger.debug(msg) - raise Exception(msg) - self.logger.log("Successfully update pg_hba config.") + remote_ips = self.__get_remote_ips() + for remote_ip in remote_ips: + cmd = "source %s ; gs_guc set -Z datanode -N all -I all -h " \ + "\"host all all %s/32 trust" \ + % (self.mpp_file, remote_ip) + self.logger.debug("Update pg_hba.conf with cmd: %s" % cmd) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + + " Error: \n%s " % output) + self.logger.debug("Successfully update pg_hba config with remote datanode ip:%s." + % remote_ips) def __get_local_data_ip(self, inst_host): """ -- Gitee From c98140857f17cec40f55121271d2aa38ce12445c Mon Sep 17 00:00:00 2001 From: Hao Date: Tue, 15 Aug 2023 15:37:57 +0800 Subject: [PATCH 06/23] gs_ddr -t start bugfix --- .../impl/dorado_disaster_recovery/ddr_base.py | 30 +++++++++++++++---- .../dorado_diaster_recovery_start.py | 2 +- .../params_handler.py | 2 ++ 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/script/impl/dorado_disaster_recovery/ddr_base.py b/script/impl/dorado_disaster_recovery/ddr_base.py index cda0fca1..582ab945 100644 --- a/script/impl/dorado_disaster_recovery/ddr_base.py +++ b/script/impl/dorado_disaster_recovery/ddr_base.py @@ -855,7 +855,7 @@ class DoradoDisasterRecoveryBase(object): else: self.logger.debug("Successfully set xlog_file_path %s." % xlog_file_path) - self.logger.log("Successfully %s xlog_lock_file_path param." % (opt_type)) + self.logger.log("Successfully set xlog_lock_file_path param: %s." % (xlog_file_path)) def __set_xlog_lock_file_each_inst(self, params_list): """ @@ -875,6 +875,24 @@ class DoradoDisasterRecoveryBase(object): self.logger.debug("Successfully [%s] shardNum [%s] node [%s] xlog_lock_file_path " "value [%s]." % (opt_type, inst.mirrorId, inst.hostname, value)) + def __set_app_name_each_inst(self, params_list): + """ + Set xlog_lock_file_path value in each dn + """ + (inst, opt_type, value, mpprc_file) = params_list + self.logger.debug("Start [%s] shardNum [%s] node [%s] application_name value [%s]." 
+ % (opt_type, inst.mirrorId, inst.hostname, value)) + cmd = "source %s; pssh -H %s \"source %s ; gs_guc %s " \ + "-Z datanode -D %s -c \\\"application_name = '%s'\\\"\"" % \ + (mpprc_file, inst.hostname, mpprc_file, opt_type, inst.datadir, value) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + + "Options:%s, Error: \n%s " + % ("set application_name for inst:%s" % inst.instanceId, str(output))) + self.logger.debug("Successfully [%s] shardNum [%s] node [%s] application_name " + "value [%s]." % (opt_type, inst.mirrorId, inst.hostname, value)) + def set_xlog_lock_file(self, opt_type="set"): """ guc set xlog_lock_file_path value in primary dn @@ -910,15 +928,15 @@ class DoradoDisasterRecoveryBase(object): raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "obtain param list for set application_name") - parallelTool.parallelExecute(self.__set_xlog_lock_file_each_inst, params_list) - self.logger.log("Successfully %s application_name param." % (opt_type)) + parallelTool.parallelExecute(self.__set_app_name_each_inst, params_list) + self.logger.log("Successfully set application_name param." ) def set_cluster_run_mode(self): """ guc set xlog_file_path value in primary dn """ - cluster_run_mode = "cluster_primary" if self.params.mode == "primary" \ - else "cluster_standby" + cluster_run_mode = "'cluster_primary'" if self.params.mode == "primary" \ + else "'cluster_standby'" self.__set_guc_param("cluster_run_mode", cluster_run_mode) self.__set_guc_param("ha_module_debug", "off") @@ -976,7 +994,7 @@ class DoradoDisasterRecoveryBase(object): for remote_ip in remote_ips: cmd = "source %s ; gs_guc set -Z datanode -N all -I all -h " \ - "\"host all all %s/32 trust" \ + "\"host all all %s/32 trust\"" \ % (self.mpp_file, remote_ip) self.logger.debug("Update pg_hba.conf with cmd: %s" % cmd) status, output = CmdUtil.retryGetstatusoutput(cmd) diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py index a6311d89..090575b1 100644 --- a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py @@ -114,7 +114,7 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): #self.stop_cluster_by_node(only_mode='disaster_standby') self.stop_cluster() self.start_cluster(only_mode="primary") - self.write_ddr_step("5_set_wal_segments_step") + self.write_dorado_step("5_set_wal_segments_step") def _sixth_step_for_ddr_start(self, step): """ diff --git a/script/impl/dorado_disaster_recovery/params_handler.py b/script/impl/dorado_disaster_recovery/params_handler.py index 4857acba..a2ca5835 100644 --- a/script/impl/dorado_disaster_recovery/params_handler.py +++ b/script/impl/dorado_disaster_recovery/params_handler.py @@ -208,6 +208,8 @@ class ParamsHandler(object): # help='hadr user password.') parser.add_option('-X', dest='xml_path', type='string', help='Cluster config xml path.') + parser.add_option('--json', dest='json_path', type='string', + help='Config json file of streaming options') parser.add_option('--time-out=', dest='timeout', default="1200", type='string', help='time out.') parser.add_option("-l", dest='logFile', type='string', -- Gitee From 5713ba1e44988fa58739d67bee106391c01eb73b Mon Sep 17 00:00:00 2001 From: Hao Date: Tue, 15 Aug 2023 21:08:04 +0800 Subject: [PATCH 07/23] update 
ddr_base.py --- script/impl/dorado_disaster_recovery/ddr_base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/script/impl/dorado_disaster_recovery/ddr_base.py b/script/impl/dorado_disaster_recovery/ddr_base.py index 582ab945..0ac37b1f 100644 --- a/script/impl/dorado_disaster_recovery/ddr_base.py +++ b/script/impl/dorado_disaster_recovery/ddr_base.py @@ -845,7 +845,7 @@ class DoradoDisasterRecoveryBase(object): """ guc set xlog_file_path value """ - self.logger.log("Starting set xlog_lock_file_path param") + self.logger.log("Starting set xlog_file_path param") cmd = "source %s && gs_guc set -Z datanode -N all -I all " \ "-c \"xlog_file_path='%s'\"" \ % (self.mpp_file, xlog_file_path) @@ -855,14 +855,14 @@ class DoradoDisasterRecoveryBase(object): else: self.logger.debug("Successfully set xlog_file_path %s." % xlog_file_path) - self.logger.log("Successfully set xlog_lock_file_path param: %s." % (xlog_file_path)) + self.logger.log("Successfully set xlog_file_path param: %s." % (xlog_file_path)) def __set_xlog_lock_file_each_inst(self, params_list): """ Set xlog_lock_file_path value in each dn """ (inst, opt_type, value, mpprc_file) = params_list - self.logger.debug("Start [%s] shardNum [%s] node [%s] xlog_lock_file value [%s]." + self.logger.debug("Start [%s] shardNum [%s] node [%s] xlog_lock_file_path value [%s]." % (opt_type, inst.mirrorId, inst.hostname, value)) cmd = "source %s; pssh -H %s \"source %s ; gs_guc %s " \ "-Z datanode -D %s -c \\\"xlog_lock_file_path = '%s'\\\"\"" % \ -- Gitee From a7be8e09f864054d3d6eedde94721b15a22d51c4 Mon Sep 17 00:00:00 2001 From: Hao Date: Wed, 16 Aug 2023 14:44:42 +0800 Subject: [PATCH 08/23] bugfix --- .../impl/dorado_disaster_recovery/ddr_base.py | 10 ++++++---- .../dorado_disaster_recovery/params_handler.py | 18 +++++++++--------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/script/impl/dorado_disaster_recovery/ddr_base.py b/script/impl/dorado_disaster_recovery/ddr_base.py index 0ac37b1f..470f0129 100644 --- a/script/impl/dorado_disaster_recovery/ddr_base.py +++ b/script/impl/dorado_disaster_recovery/ddr_base.py @@ -1310,19 +1310,20 @@ class DoradoDisasterRecoveryBase(object): Start dss server process """ inst, mpprc_file = params - self.logger.debug("Start dssserver on node [%s] ." % inst.hostname) - + cmd = "source %s; pssh -H %s \"source %s ; export DSS_MAINTAIN=TRUE; " \ " dssserver -D %s & \"" % (mpprc_file,inst.hostname, mpprc_file, self.dss_home_dir) status, output = CmdUtil.retryGetstatusoutput(cmd) + self.logger.debug("Start dssserver on node [%s],cmd: %s." % inst.hostname, cmd) if status != 0: raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + "Options:%s, Error: \n%s " % ("start dssserver on node :%s" % inst.hostname, str(output))) self.logger.debug("Successfully Start dssserver on node [%s] " % inst.hostname) - build_cmd = "source %s; pssh -H %s \"source %s ; gs_ctl build -D %s -b cross_cluster_full -g 0 -q " \ + build_cmd = "source %s; pssh -H %s \"source %s ; gs_ctl build -D %s -b cross_cluster_full -g 0 -q\"" \ % (mpprc_file,inst.hostname, mpprc_file, inst.datadir) + self.logger.debug("Build main standby datanode on node [%s],cmd: %s." 
% inst.hostname, build_cmd) status, output = CmdUtil.retryGetstatusoutput(build_cmd) if status != 0: raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % build_cmd + @@ -1330,8 +1331,9 @@ class DoradoDisasterRecoveryBase(object): % ("build main_standby on node :%s" % inst.hostname, str(output))) self.logger.debug("Successfully build main_standby in disaster standby cluster on node [%s] " % inst.hostname) - kill_cmd = "source %s; pssh -H %s \"source %s ; ps ux | grep dssserver | grep -v grep | awk '{print $2}' | xargs kill -9 " \ + kill_cmd = "source %s; pssh -H %s \"source %s ; ps ux | grep dssserver | grep -v grep | awk '{print $2}' | xargs kill -9 \"" \ % (mpprc_file,inst.hostname, mpprc_file) + self.logger.debug("Stop dssserver proc on node [%s],cmd: %s." % inst.hostname, kill_cmd) status, output = CmdUtil.retryGetstatusoutput(kill_cmd) if status != 0: raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % kill_cmd + diff --git a/script/impl/dorado_disaster_recovery/params_handler.py b/script/impl/dorado_disaster_recovery/params_handler.py index a2ca5835..8eabed13 100644 --- a/script/impl/dorado_disaster_recovery/params_handler.py +++ b/script/impl/dorado_disaster_recovery/params_handler.py @@ -146,16 +146,16 @@ STREAMING_PARAMS_FOR_MODULE = { } HELP_MSG = """ -gs_sdr is a utility for streaming disaster recovery fully options. +gs_ddr is a utility for streaming disaster recovery fully options. Usage: - gs_sdr -? | --help - gs_sdr -V | --version - gs_sdr -t start -m [primary|disaster_standby] -X XMLFILE [-U DR_USERNAME] [-W DR_PASSWORD] [--time-out=SECS] [-l LOGFILE] - gs_sdr -t stop -X XMLFILE|--json JSONFILE [-l LOGFILE] - gs_sdr -t switchover -m [primary|disaster_standby] [--time-out=SECS] [-l LOGFILE] - gs_sdr -t failover [-l LOGFILE] - gs_sdr -t query [-l LOGFILE] + gs_ddr -? | --help + gs_ddr -V | --version + gs_ddr -t start -m [primary|disaster_standby] -X XMLFILE [--time-out=SECS] [-l LOGFILE] + gs_ddr -t stop -X XMLFILE|--json JSONFILE [-l LOGFILE] + gs_ddr -t switchover -m [primary|disaster_standby] [--time-out=SECS] [-l LOGFILE] + gs_ddr -t failover [-l LOGFILE] + gs_ddr -t query [-l LOGFILE] General options: -?, --help Show help information for this utility, and exit the command line mode. @@ -190,7 +190,7 @@ class ParamsHandler(object): """ parser = optparse.OptionParser(conflict_handler='resolve') parser.disable_interspersed_args() - parser.epilog = "Example: gs_sdr -t " \ + parser.epilog = "Example: gs_ddr -t " \ "start -m primary -X clusterConfig.xml " \ "--time-out=1200." 
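        # A few more illustrative invocations, sketched from the Usage text in
        # HELP_MSG above (the --dorado-config value below is only a placeholder
        # path for the dorado xlog share disk, not a value from this patch):
        #   gs_ddr -t start -m disaster_standby -X clusterConfig.xml --dorado-config=<xlog_share_disk_path>
        #   gs_ddr -t switchover -m disaster_standby --time-out=1200
        #   gs_ddr -t failover
        #   gs_ddr -t query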
parser.add_option('-V', "--version", dest='version_info', action='store_true', -- Gitee From 7da36d14738327c6542f6d6ceae58fa4d520d2a8 Mon Sep 17 00:00:00 2001 From: Hao Date: Wed, 16 Aug 2023 15:10:19 +0800 Subject: [PATCH 09/23] updata sql --- script/impl/dorado_disaster_recovery/ddr_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/impl/dorado_disaster_recovery/ddr_base.py b/script/impl/dorado_disaster_recovery/ddr_base.py index 470f0129..91a0be74 100644 --- a/script/impl/dorado_disaster_recovery/ddr_base.py +++ b/script/impl/dorado_disaster_recovery/ddr_base.py @@ -1483,7 +1483,7 @@ class DoradoDisasterRecoveryBase(object): return self.primary_dn_ids = p_inst_list sql_check = "select 1 from pg_catalog.pg_stat_get_wal_senders() where " \ - "sync_state='Async' and peer_role='Standby' and peer_state='Normal';" + "sync_state='Async' and peer_role='StandbyCluster_Standby' and peer_state='Normal';" param_list = [(dn_inst, sql_check) for db_node in self.cluster_info.dbNodes for dn_inst in db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] -- Gitee From 3e302b8c48530f11abc0da5fb14bbd3dd2dc6bea Mon Sep 17 00:00:00 2001 From: Hao Date: Wed, 16 Aug 2023 21:43:46 +0800 Subject: [PATCH 10/23] update and bugfix --- .../impl/dorado_disaster_recovery/ddr_base.py | 92 +++++++++---------- .../dorado_diaster_recovery_start.py | 41 ++++----- 2 files changed, 59 insertions(+), 74 deletions(-) diff --git a/script/impl/dorado_disaster_recovery/ddr_base.py b/script/impl/dorado_disaster_recovery/ddr_base.py index 91a0be74..5f46a22b 100644 --- a/script/impl/dorado_disaster_recovery/ddr_base.py +++ b/script/impl/dorado_disaster_recovery/ddr_base.py @@ -102,7 +102,7 @@ class DoradoDisasterRecoveryBase(object): DoradoDisasterRecoveryConstants.STREAMING_CONFIG_XML) self.ssh_tool = SshTool(self.cluster_node_names, self.log_file) self.mpp_file = EnvUtil.getMpprcFile() - self.dss_home_dir = "" + self.dss_home_dir = self.cluster_info.dss_home self._init_step_file_path() def init_cluster_conf(self): @@ -1305,76 +1305,68 @@ class DoradoDisasterRecoveryBase(object): % "full build from remote cluster" + error_detail) self.logger.debug("Successfully build cascade standby dn:%s" % inst.instanceId) - def __start_dss_and_build(self, params): + def start_dss_instance(self, only_mode=None): """ - Start dss server process + Start dssserver process """ - inst, mpprc_file = params - + self.logger.log("Start start dssserver in main standby node.") + if only_mode and self.params.mode != only_mode: + self.logger.debug("Start dssserver step is not for mode:%s." % self.params.mode) + return + primary_dn = [dn_inst for db_node in self.cluster_info.dbNodes for dn_inst in + db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] + main_standby_inst = primary_dn[0] + cmd = "source %s; pssh -H %s \"source %s ; export DSS_MAINTAIN=TRUE; " \ - " dssserver -D %s & \"" % (mpprc_file,inst.hostname, mpprc_file, self.dss_home_dir) + " dssserver -D $DSS_HOME & \"" % (self.mpp_file, main_standby_inst.hostname, self.mpp_file) + self.logger.debug("Start dssserver on node [%s],cmd: %s." % (main_standby_inst.hostname, cmd)) status, output = CmdUtil.retryGetstatusoutput(cmd) - self.logger.debug("Start dssserver on node [%s],cmd: %s." 
% inst.hostname, cmd) if status != 0: raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + "Options:%s, Error: \n%s " - % ("start dssserver on node :%s" % inst.hostname, str(output))) - self.logger.debug("Successfully Start dssserver on node [%s] " % inst.hostname) + % ("Start dssserver on node :%s" % main_standby_inst.hostname, str(output))) + self.logger.log("Successfully Start dssserver on node [%s] " % main_standby_inst.hostname) + def build_main_standby_datanode(self, only_mode=None): + """ + Build Main standby datanode + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Build Main standby step is not for mode:%s." % self.params.mode) + return + primary_dn = [dn_inst for db_node in self.cluster_info.dbNodes for dn_inst in + db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] + main_standby_inst = primary_dn[0] + build_cmd = "source %s; pssh -H %s \"source %s ; gs_ctl build -D %s -b cross_cluster_full -g 0 -q\"" \ - % (mpprc_file,inst.hostname, mpprc_file, inst.datadir) - self.logger.debug("Build main standby datanode on node [%s],cmd: %s." % inst.hostname, build_cmd) + % (self.mpp_file,main_standby_inst.hostname, self.mpp_file, main_standby_inst.datadir) + self.logger.debug("Build Main standby datanode on node [%s],cmd: %s." % (main_standby_inst.hostname, build_cmd)) status, output = CmdUtil.retryGetstatusoutput(build_cmd) if status != 0: raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % build_cmd + "Options:%s, Error: \n%s " - % ("build main_standby on node :%s" % inst.hostname, str(output))) - self.logger.debug("Successfully build main_standby in disaster standby cluster on node [%s] " % inst.hostname) - - kill_cmd = "source %s; pssh -H %s \"source %s ; ps ux | grep dssserver | grep -v grep | awk '{print $2}' | xargs kill -9 \"" \ - % (mpprc_file,inst.hostname, mpprc_file) - self.logger.debug("Stop dssserver proc on node [%s],cmd: %s." % inst.hostname, kill_cmd) - status, output = CmdUtil.retryGetstatusoutput(kill_cmd) - if status != 0: - raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % kill_cmd + - "Options:%s, Error: \n%s " - % ("stop dssserver before start cluster on node :%s" % inst.hostname, str(output))) - self.logger.debug("Successfully stop dssserver before start cluster on node [%s] " % inst.hostname) - return output + % ("build main_standby on node :%s" % main_standby_inst.hostname, str(output))) + self.logger.debug("Successfully build main_standby in disaster standby cluster on node [%s] " % main_standby_inst.hostname) - def start_dss_instance(self, only_mode=None): + def kill_dss_instance(self, only_mode=None): """ - Start dss server process + Kill dssserver process """ - if self.params.mode == "primary" or self.params.mode != only_mode: - self.logger.debug("start dssserver step is not for mode:%s." % self.params.mode) + if only_mode and self.params.mode != only_mode: + self.logger.debug("Kill dssserver process step is not for mode:%s." % self.params.mode) return primary_dn = [dn_inst for db_node in self.cluster_info.dbNodes for dn_inst in db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] - - params_list = [] - for inst in primary_dn: - params_list.append((inst, self.mpp_file)) - - if not params_list: - raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] - % "obtain param list for start dssserver in disaster_standby") - parallelTool.parallelExecute(self.__start_dss_and_build, params_list) - self.logger.log("Successfully start dssserver and build main_standby inst : %s." 
% primary_dn) - return - - + main_standby_inst = primary_dn[0] - def kill_dss_instance(self, only_mode=None): - """ - Start dss server process - """ - cmd = "source %s; ps ux | grep dssserver | grep -v grep | awk '{print $2}' | xargs kill -9" % self.mpp_file - status, output = CmdUtil.retryGetstatusoutput(cmd) + kill_cmd = "source %s; pssh -H %s \"source %s ; ps ux | grep dssserver | grep -v grep | awk '{print $2}' | xargs kill -9 \"" \ + % (self.mpp_file,main_standby_inst.hostname, self.mpp_file) + self.logger.debug("Kill dssserver on node [%s],cmd: %s." % (main_standby_inst.hostname, kill_cmd)) + status = CmdUtil.retryGetstatusoutput(kill_cmd) if status != 0: - self.logger.error(ErrorCode.GAUSS_516["GAUSS_51600"] + - "status(%d), output(%s)" % (status, output)) - return output + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] + % "kill dssserver before start cluster on node:" + main_standby_inst.hostname) + self.logger.debug("Successfully stop dssserver before start cluster on node [%s] " % main_standby_inst.hostname) def build_dn_instance(self, only_mode=None): """ diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py index 090575b1..a80c0aa6 100644 --- a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py @@ -69,12 +69,7 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): """ self.logger.debug("Start common config step of ddr start.") self.distribute_cluster_conf() - #调用local/ConfigHba.py和streaming_xml进行设置,考虑使用gs_guc set适配 - self.update_pg_hba() - self.config_cross_cluster_repl_info() - self.set_xlog_file_path(self.params.doradoConfig) - self.set_application_name() - self.set_cluster_run_mode() + def _third_step_for_ddr_start(self, step): """ @@ -84,10 +79,12 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): return self.logger.debug("Start third step of ddr start.") - #self.prepare_gs_secure_files(only_mode='primary') - #self.build_and_distribute_key_files(only_mode='disaster_standby') - #self.get_default_wal_keep_segments(only_mode='primary') - self.write_dorado_step("3_set_wal_segments_step") + self.update_pg_hba() + self.config_cross_cluster_repl_info() + self.set_xlog_file_path(self.params.doradoConfig) + self.set_application_name() + self.set_cluster_run_mode() + self.write_dorado_step("3_set_datanode_guc_step") def _fourth_step_for_ddr_start(self, step): @@ -97,48 +94,44 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): if step >= 4: return self.logger.debug("Start fourth step of ddr start.") - self.set_wal_keep_segments( - "reload", DoradoDisasterRecoveryConstants.MAX_WAL_KEEP_SEGMENTS, only_mode='primary') - self.write_dorado_step("4_set_wal_segments_step") + self.stop_cluster() + self.write_dorado_step("4_stop_cluster_step") def _fifth_step_for_ddr_start(self, step): """ - Fifth step for streaming start + Fifth step for ddr start """ if step >= 5: return self.logger.debug("Start fifth step of ddr start.") # self.set_data_in_dcc(self.backup_open_key, "0", only_mode='primary') # self.set_data_in_dcc(self.backup_open_key, "1", only_mode='disaster_standby') - # self.set_most_available(mode="reload", raise_error=False) - #self.stop_cluster_by_node(only_mode='disaster_standby') - self.stop_cluster() self.start_cluster(only_mode="primary") - self.write_dorado_step("5_set_wal_segments_step") + 
self.write_dorado_step("5_start_primary_cluster_step") def _sixth_step_for_ddr_start(self, step): """ - Sixth step for streaming start + Sixth step for ddr start """ if step >= 6: return - self.logger.debug("Start sixth step of streaming start.") + self.logger.debug("Start sixth step of ddr start.") self.set_cmserver_guc("backup_open", "1", "set", only_mode='disaster_standby') self.set_cmagent_guc("agent_backup_open", "1", "set", only_mode='disaster_standby') - self.write_dorado_step("6_set_guc_step") + self.write_dorado_step("6_set_cm_guc_step") def _seventh_step_for_ddr_start(self, step): """ Seventh step for streaming start """ - if step >= 7: + if step >= 7 or self.params.mode == "primary": return self.logger.debug("Start seventh step of ddr start.") self.update_dorado_info("cluster", "restore", only_mode='disaster_standby') try: self.start_dss_instance(only_mode='disaster_standby') - # self.build_dn_instance(only_mode='disaster_standby') - # self.kill_dss_instance(only_mode='disaster_standby') + self.build_main_standby_datanode(only_mode='disaster_standby') + self.kill_dss_instance(only_mode='disaster_standby') except Exception as error: self.update_dorado_info("cluster", "restore_fail", only_mode='disaster_standby') raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "build dns" + "Error:%s" % error) -- Gitee From 5ec3c50ce03c7480a41d0e10aacaaeecd8119cf9 Mon Sep 17 00:00:00 2001 From: Hao Date: Thu, 17 Aug 2023 17:26:01 +0800 Subject: [PATCH 11/23] upadte --- .../impl/dorado_disaster_recovery/ddr_base.py | 271 +++--------------- 1 file changed, 37 insertions(+), 234 deletions(-) diff --git a/script/impl/dorado_disaster_recovery/ddr_base.py b/script/impl/dorado_disaster_recovery/ddr_base.py index 5f46a22b..a43dfdc4 100644 --- a/script/impl/dorado_disaster_recovery/ddr_base.py +++ b/script/impl/dorado_disaster_recovery/ddr_base.py @@ -39,7 +39,7 @@ from gspylib.common.DbClusterStatus import DbClusterStatus from gspylib.threads.SshTool import SshTool from gspylib.threads.parallelTool import parallelTool from gspylib.os.gsfile import g_file -from base_utils.os.cmd_util import CmdUtil +from base_utils.os.cmd_util import CmdUtil, FastPopen from base_utils.os.env_util import EnvUtil from base_utils.os.net_util import NetUtil from base_utils.os.file_util import FileUtil @@ -1117,194 +1117,6 @@ class DoradoDisasterRecoveryBase(object): % (guc_parameter, guc_value, output) self.logger.debug(msg) - def __check_datanode_data_ip_connection(self, inst): - """ - Check remote data ip can connect or not - """ - any_connected = False - node_infos = [node_info for shard in self.params.remoteClusterConf.get("shards", []) - for node_info in shard] - local_data_ip = self.__get_local_data_ip(inst.hostname) - for node_info in node_infos: - data_ip = node_info.get("dataIp") - shard_num = node_info.get("shardNum", '1') - if str(shard_num) != str(inst.mirrorId): - continue - _, ret = DefaultValue.fast_ping_on_node(inst.hostname, local_data_ip, - data_ip, self.logger) - if ret: - any_connected = True - break - if not any_connected: - self.logger.error("Failed check data ip connection for inst:%s." % inst.instanceId) - raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "check data ip connection") - self.logger.debug("Successfully check main standby data ip connection.") - - def __pghba_backup_handler(self, node_name, dir_name, inst_id, mode="backup"): - """ - Backup or restore pg_hba file. 
- """ - file_path = os.path.join(dir_name, "pg_hba.conf") - old_file_path = os.path.join(dir_name, "pg_hba.conf.old") - dest_file = os.path.join(self.dorado_file_dir, "%s_pg_hba.conf" % inst_id) - if self.local_host == node_name: - if mode == "backup" and not os.path.isfile(dest_file): - if os.path.isfile(file_path): - self.logger.debug("Backup file from[%s] to[%s]." % ( - file_path, dest_file)) - FileUtil.cpFile(file_path, dest_file) - else: - self.logger.debug("Backup file from[%s] to[%s]." % ( - old_file_path, dest_file)) - FileUtil.cpFile(old_file_path, dest_file) - if mode == "restore": - self.logger.debug("Restore file from[%s] to[%s]." % ( - dest_file, file_path)) - FileUtil.cpFile(dest_file, file_path) - FileUtil.removeFile(dest_file) - else: - if mode == "backup": - cmd = "source %s; pssh -s -H %s \"if [ ! -f '%s' ];then if [ -f '%s' ];" \ - "then cp '%s' '%s';else cp '%s' '%s';fi;fi\"" \ - % (self.mpp_file, node_name, dest_file, file_path, file_path, - dest_file, old_file_path, dest_file) - self.logger.debug("Backup file on node[%s] with cmd [%s]." % ( - node_name, cmd)) - else: - cmd = "source %s; pssh -s -H %s \"cp %s %s && rm -f %s\"" % ( - self.mpp_file, node_name, dest_file, file_path, dest_file) - self.logger.debug("Restore file on node[%s] from[%s] to[%s]." % ( - node_name, file_path, dest_file)) - status, output = CmdUtil.retryGetstatusoutput(cmd) - if status != 0: - raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + - " Error: \n%s " % output) - - def __pg_ident_backup_handler(self, node_name, dir_name, inst_id, mode="backup"): - """ - Backup or restore pg_ident file. - """ - file_path = os.path.join(dir_name, "pg_ident.conf") - dest_file = os.path.join(self.dorado_file_dir, "%s_pg_ident.conf" % inst_id) - if self.local_host == node_name: - if mode == "backup" and not os.path.isfile(dest_file): - if os.path.isfile(file_path): - self.logger.debug("Backup file from[%s] to[%s]." % ( - file_path, dest_file)) - FileUtil.cpFile(file_path, dest_file) - - if mode == "restore" and os.path.isfile(dest_file): - self.logger.debug("Restore file from[%s] to[%s]." % ( - dest_file, file_path)) - FileUtil.cpFile(dest_file, file_path) - FileUtil.removeFile(dest_file) - else: - if mode == "backup": - cmd = "source %s; pssh -s -H %s \"if [ ! -f '%s' ];then if [ -f '%s' ];" \ - "then cp '%s' '%s';fi;fi\"" \ - % (self.mpp_file, node_name, dest_file, file_path, file_path, dest_file) - self.logger.debug("Backup file on node[%s] with cmd [%s]." % ( - node_name, cmd)) - else: - cmd = "source %s; pssh -s -H %s \"if [ -f '%s' ];then cp '%s' '%s' && " \ - "rm -f '%s';fi\"" % (self.mpp_file, node_name, dest_file, dest_file, - file_path, dest_file) - self.logger.debug("Restore file on node[%s] from[%s] to[%s]." 
% ( - node_name, file_path, dest_file)) - status, output = CmdUtil.retryGetstatusoutput(cmd) - if status != 0: - raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + - " Error: \n%s " % output) - - def __start_main_standby_dn(self, start_params): - """ - Start single main standby dn - """ - local_ip, inst, bin_path, distribute_arg, build_timeout = start_params - self.logger.debug("Starting start dn:%s" % inst.instanceId) - if local_ip == inst.hostname: - cmd_start = "source %s; %s/gs_ctl start -D %s -M hadr_main_standby%s" % ( - self.mpp_file, bin_path, inst.datadir, distribute_arg) - else: - cmd_start = "source %s; pssh -s -t %s -H %s \"source %s; %s/gs_ctl start -D %s " \ - "-M hadr_main_standby%s\"" \ - % (self.mpp_file, DoradoDisasterRecoveryConstants.MAX_BUILD_TIMEOUT + 10, inst.hostname, - self.mpp_file, bin_path, inst.datadir, distribute_arg) - self.logger.debug("Start dn with cmd:%s." % cmd_start) - status, output = CmdUtil.retry_util_timeout(cmd_start, build_timeout) - if status != 0: - raise Exception( - ErrorCode.GAUSS_514[ - "GAUSS_51400"] % cmd_start + " Error: \n%s " % output) - self.logger.debug("Successfully start dn:%s" % inst.instanceId) - - def __build_main_standby_dn(self, params): - """ - Build single main standby dn - """ - inst, build_timeout, local_ip, bin_path, distribute_arg, rds_backup, backup_pwd = params - self.logger.debug("Start build main standby dn:%s" % inst.instanceId) - self.__check_datanode_data_ip_connection(inst) - self.__pghba_backup_handler(inst.hostname, inst.datadir, inst.instanceId, mode="backup") - self.__pg_ident_backup_handler(inst.hostname, inst.datadir, inst.instanceId, mode="backup") - # -t 1209600 means default value 14 days - if local_ip == inst.hostname: - cmd = "source %s; %s/gs_ctl build -D %s -b cross_cluster_full -g 0 -q -t %s" \ - % (self.mpp_file, bin_path, inst.datadir, - DoradoDisasterRecoveryConstants.MAX_BUILD_TIMEOUT) - else: - cmd = "echo \"source %s; %s/gs_ctl build -D %s -b cross_cluster_full -g 0 -q " \ - " -t %s\" | pssh -s -t %s -H %s" \ - % (self.mpp_file, bin_path, inst.datadir, - DoradoDisasterRecoveryConstants.MAX_BUILD_TIMEOUT, - DoradoDisasterRecoveryConstants.MAX_BUILD_TIMEOUT + 10, inst.hostname) - cmd_log = cmd.replace(backup_pwd, '***') - self.logger.debug("Building with cmd:%s." % cmd_log) - status, output = CmdUtil.retry_util_timeout(cmd, build_timeout) - if status != 0: - error_detail = "Error: Failed to do build because of pssh timeout." \ - if "was killed or timeout" in output else \ - "Error: Failed to do build because of retry timeout in %s s." \ - % build_timeout - self.logger.debug("Failed to do gs_ctl build. 
" + error_detail) - raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] - % "full build from remote cluster" + error_detail) - self.logger.debug("Successfully build main standby dn:%s" % inst.instanceId) - self.__pghba_backup_handler(inst.hostname, inst.datadir, inst.instanceId, mode="restore") - self.__pg_ident_backup_handler(inst.hostname, inst.datadir, inst.instanceId, mode="restore") - start_params = (local_ip, inst, bin_path, distribute_arg, build_timeout) - self.__start_main_standby_dn(start_params) - - def __build_cascade_standby_dn(self, params): - """ - Build single main standby dn - """ - inst, build_timeout, local_ip, bin_path, distribute_arg = params - self.logger.debug("Start build cascade standby dn:%s" % inst.instanceId) - # -t 1209600 means default value 14 days - if local_ip == inst.hostname: - cmd = "source %s; %s/gs_ctl build -D %s -M cascade_standby " \ - "-b standby_full -r 7200%s -t %s" \ - % (self.mpp_file, bin_path, inst.datadir, distribute_arg, - DoradoDisasterRecoveryConstants.MAX_BUILD_TIMEOUT) - else: - cmd = "echo \"source %s; %s/gs_ctl build -D %s -M cascade_standby -b standby_full " \ - "-r 7200%s -t %s\" | pssh -s -t %s -H %s" \ - % (self.mpp_file, bin_path, inst.datadir, distribute_arg, - DoradoDisasterRecoveryConstants.MAX_BUILD_TIMEOUT, - DoradoDisasterRecoveryConstants.MAX_BUILD_TIMEOUT + 10, inst.hostname) - self.logger.debug("Building with cmd:%s." % cmd) - status, output = CmdUtil.retry_util_timeout(cmd, build_timeout) - if status != 0: - error_detail = "Error: Failed to do build because of pssh timeout." \ - if "was killed or timeout" in output else \ - "Error: Failed to do build because of retry timeout in %s s." \ - % build_timeout - self.logger.debug("Failed to do gs_ctl build. " + error_detail) - raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] - % "full build from remote cluster" + error_detail) - self.logger.debug("Successfully build cascade standby dn:%s" % inst.instanceId) - def start_dss_instance(self, only_mode=None): """ Start dssserver process @@ -1317,14 +1129,23 @@ class DoradoDisasterRecoveryBase(object): db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] main_standby_inst = primary_dn[0] - cmd = "source %s; pssh -H %s \"source %s ; export DSS_MAINTAIN=TRUE; " \ - " dssserver -D $DSS_HOME & \"" % (self.mpp_file, main_standby_inst.hostname, self.mpp_file) + if self.local_host == main_standby_inst.hostname: + cmd = 'sh -c "source {}; export DSS_MAINTAIN=TRUE && nohup dssserver -D $DSS_HOME >/dev/null 2>&1 & "'.format( + self.mpp_file) + # cmd = 'sh -c %s; export DSS_MAINTAIN=TRUE && ' \ + # "nohup dssserver -D $DSS_HOME >/dev/null 2>&1 &" % (self.mpp_file) + else: + cmd = "source %s; pssh -s -t 5 -H %s \"source %s; export DSS_MAINTAIN=TRUE && " \ + "nohup dssserver -D $DSS_HOME >/dev/null 2>&1 & \"" \ + % (self.mpp_file, main_standby_inst.hostname) + self.logger.debug("Start dssserver on node [%s],cmd: %s." 
% (main_standby_inst.hostname, cmd)) - status, output = CmdUtil.retryGetstatusoutput(cmd) - if status != 0: - raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + - "Options:%s, Error: \n%s " - % ("Start dssserver on node :%s" % main_standby_inst.hostname, str(output))) + proc = FastPopen(cmd) + out, err = proc.communicate() + if proc.returncode != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] + + 'Start dssserver on node [{}] Error: {}'.format(main_standby_inst.hostname, str(err + out).strip())) + self.logger.log("Successfully Start dssserver on node [%s] " % main_standby_inst.hostname) def build_main_standby_datanode(self, only_mode=None): @@ -1337,11 +1158,17 @@ class DoradoDisasterRecoveryBase(object): primary_dn = [dn_inst for db_node in self.cluster_info.dbNodes for dn_inst in db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] main_standby_inst = primary_dn[0] - - build_cmd = "source %s; pssh -H %s \"source %s ; gs_ctl build -D %s -b cross_cluster_full -g 0 -q\"" \ - % (self.mpp_file,main_standby_inst.hostname, self.mpp_file, main_standby_inst.datadir) + + if self.local_host == main_standby_inst.hostname: + build_cmd = "source %s; gs_ctl build -D %s -b cross_cluster_full -g 0 -q -t %s" \ + % (self.mpp_file, main_standby_inst.datadir, DoradoDisasterRecoveryConstants.MAX_BUILD_TIMEOUT) + else: + build_cmd = "source %s; pssh -s -t %s -H %s \"source %s;" \ + " gs_ctl build -D %s -b cross_cluster_full -g 0 -q -t %s \"" \ + % (self.mpp_file, DoradoDisasterRecoveryConstants.MAX_BUILD_TIMEOUT + 10, main_standby_inst.hostname, + self.mpp_file, main_standby_inst.datadir, DoradoDisasterRecoveryConstants.MAX_BUILD_TIMEOUT) self.logger.debug("Build Main standby datanode on node [%s],cmd: %s." % (main_standby_inst.hostname, build_cmd)) - status, output = CmdUtil.retryGetstatusoutput(build_cmd) + status, output = CmdUtil.retry_util_timeout(build_cmd, self.params.waitingTimeout) if status != 0: raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % build_cmd + "Options:%s, Error: \n%s " @@ -1359,43 +1186,19 @@ class DoradoDisasterRecoveryBase(object): db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] main_standby_inst = primary_dn[0] - kill_cmd = "source %s; pssh -H %s \"source %s ; ps ux | grep dssserver | grep -v grep | awk '{print $2}' | xargs kill -9 \"" \ - % (self.mpp_file,main_standby_inst.hostname, self.mpp_file) + if self.local_host == main_standby_inst.hostname: + kill_cmd = "source %s; pkill -9 -f dssserver" % (self.mpp_file) + else: + kill_cmd = "source %s; pssh -s -t 3 -H %s \"pkill -9 -f dssserver\"" \ + % (self.mpp_file, main_standby_inst.hostname) self.logger.debug("Kill dssserver on node [%s],cmd: %s." % (main_standby_inst.hostname, kill_cmd)) - status = CmdUtil.retryGetstatusoutput(kill_cmd) - if status != 0: + sts, out = CmdUtil.getstatusoutput_by_fast_popen(kill_cmd) + if sts not in [0, 1]: raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] - % "kill dssserver before start cluster on node:" + main_standby_inst.hostname) - self.logger.debug("Successfully stop dssserver before start cluster on node [%s] " % main_standby_inst.hostname) + % "kill dssserver before start cluster on node:" + main_standby_inst.hostname + + ", output:"+str(out).strip()) + self.logger.log("Successfully kill dssserver before start cluster on node [%s] " % main_standby_inst.hostname) - def build_dn_instance(self, only_mode=None): - """ - Build dn instance - """ - if only_mode and self.params.mode != only_mode: - self.logger.debug("Build dn step is not for mode:%s." 
% self.params.mode) - return - self.logger.debug("Start building process.") - distribute_arg = "" if self.cluster_info.isSingleInstCluster() else " -Z datanode" - main_params = [] - cascade_params = [] - datanode_instance = [inst for node in self.cluster_info.dbNodes - for inst in node.datanodes] - for inst in datanode_instance: - if inst.instanceId in self.main_standby_ids + self.primary_dn_ids: - main_params.append((inst, self.params.waitingTimeout, self.local_host, - self.bin_path, distribute_arg, self.params.hadrUserName, - self.params.hadrUserPassword)) - else: - cascade_params.append((inst, self.params.waitingTimeout, self.local_host, - self.bin_path, distribute_arg)) - if main_params: - parallelTool.parallelExecute(self.__build_main_standby_dn, main_params) - self.logger.debug("Finished build main standby dns.") - #if cascade_params: - # parallelTool.parallelExecute(self.__build_cascade_standby_dn, cascade_params) - # self.logger.debug("Finished build cascade standby dns.") - del self.params.hadrUserPassword def query_cluster(self): """ -- Gitee From 99e2a10147dd2cda2e5cca51be8bb196d3e9d32c Mon Sep 17 00:00:00 2001 From: Hao Date: Thu, 17 Aug 2023 21:58:23 +0800 Subject: [PATCH 12/23] bugfix ddr stop --- .../impl/dorado_disaster_recovery/ddr_base.py | 94 ++++--------------- .../dorado_diaster_recovery_start.py | 2 +- .../dorado_disaster_recovery_stop.py | 65 +++++++------ 3 files changed, 49 insertions(+), 112 deletions(-) diff --git a/script/impl/dorado_disaster_recovery/ddr_base.py b/script/impl/dorado_disaster_recovery/ddr_base.py index a43dfdc4..863bb2ed 100644 --- a/script/impl/dorado_disaster_recovery/ddr_base.py +++ b/script/impl/dorado_disaster_recovery/ddr_base.py @@ -1672,92 +1672,30 @@ class DoradoDisasterRecoveryBase(object): else: self.logger.log("Check cluster type succeed.") - def __remove_streaming_repl_info(self, params): + def __remove_cross_cluster_replinfo(self, params): """ - Remove streaming repl info from single dn instances. + Remove cross_cluster_replinfo from single dn instances. 
""" dn_inst, guc_mode, dn_num = params - self.logger.debug("Start remove replconninfo for instance:%s" % dn_inst.instanceId) + self.logger.debug("Start remove cross_cluster_replinfo for instance:%s" % dn_inst.instanceId) + for idx in range(1, dn_num + 1): - if dn_inst.hostname == self.local_host: - cmd = "source %s; gs_guc check -Z datanode -D %s " \ - "-c 'replconninfo%s'" % (self.mpp_file, dn_inst.datadir, idx) - else: - cmd = "source %s; pssh -H %s 'source %s; gs_guc check " \ - "-Z datanode -D %s -c \"replconninfo%s\"'" \ - % (self.mpp_file, dn_inst.hostname, self.mpp_file, dn_inst.datadir, idx) - self.logger.debug("Check original repl infos with cmd:%s" % cmd) + cmd = "source %s ; gs_guc %s -N %s -D %s -c " \ + "\"cross_cluster_replconninfo%s\"" \ + % (self.mpp_file, guc_mode, dn_inst.hostname, dn_inst.datadir, idx) + self.logger.debug("Remove dn cross_cluster_replconninfo with cmd:%s" % cmd) status, output = CmdUtil.retryGetstatusoutput(cmd) if status != 0: raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + " Error: \n%s " % output) - if output.count("=NULL") > 2: - continue - elif "iscrossregion=false" in output.lower(): - ret = re.search( - r"replconninfo%s='localhost=(\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3})" - r" localport=(\d{4,5}) localheartbeatport=(\d{4,5}) " - r"localservice=(\d{4,5}) " - r"remotehost=(\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}) " - r"remoteport=(\d{4,5}) remoteheartbeatport=(\d{4,5}) " - r"remoteservice=(\d{4,5})" % idx, output) - if not ret: - raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "search repl infos") - if dn_inst.hostname != NetUtil.GetHostIpOrName(): - set_cmd = "source %s; pssh -H %s \"source %s ; gs_guc %s " \ - "-Z datanode -D %s -c " \ - "\\\"replconninfo%s = 'localhost=%s localport=%s " \ - "localheartbeatport=%s localservice=%s remotehost=%s " \ - "remoteport=%s remoteheartbeatport=%s " \ - "remoteservice=%s'\\\"\"" - set_cmd = set_cmd % (self.mpp_file, dn_inst.hostname, - self.mpp_file, guc_mode, - dn_inst.datadir, idx, ret.group(1), - ret.group(2), ret.group(3), ret.group(4), - ret.group(5), ret.group(6), ret.group(7), - ret.group(8)) - else: - set_cmd = "source %s ; gs_guc %s -Z datanode -D %s -c " \ - "\"replconninfo%s = 'localhost=%s localport=%s " \ - "localheartbeatport=%s localservice=%s remotehost=%s " \ - "remoteport=%s remoteheartbeatport=%s " \ - "remoteservice=%s'\"" - set_cmd = set_cmd % (self.mpp_file, guc_mode, - dn_inst.datadir, idx, ret.group(1), - ret.group(2), ret.group(3), ret.group(4), - ret.group(5), ret.group(6), ret.group(7), - ret.group(8)) - self.logger.debug("Set original repl infos with cmd:%s" % set_cmd) - status, output = CmdUtil.retryGetstatusoutput(set_cmd) - if status != 0: - raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % set_cmd + - " Error: \n%s " % output) - self.logger.debug("Successfully remove original repl infos with cmd:%s." 
- % set_cmd) - elif "iscrossregion=true" in output.lower(): - if dn_inst.hostname != self.local_host: - set_cmd = "source %s; pssh -H %s \"source %s ; gs_guc %s " \ - "-Z datanode -D %s -c \\\"replconninfo%s\\\"\"" - set_cmd = set_cmd % (self.mpp_file, dn_inst.hostname, - self.mpp_file, guc_mode, - dn_inst.datadir, idx) - else: - set_cmd = "source %s ; gs_guc %s -Z datanode -D %s -c " \ - "\"replconninfo%s\"" - set_cmd = set_cmd % (self.mpp_file, guc_mode, - dn_inst.datadir, idx) - self.logger.debug("Remove stream repl infos with cmd:%s" % set_cmd) - status, output = CmdUtil.retryGetstatusoutput(set_cmd) - if status != 0: - raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % set_cmd + - " Error: \n%s " % output) - self.logger.debug("Successfully remove stream repl infos with cmd:%s." + self.logger.debug("Successfully remove cross_cluster_replconninfo with cmd:%s." % set_cmd) + self.logger.debug("Successfully removed replconninfo for instance:%s" % dn_inst.instanceId) - def remove_all_stream_repl_infos(self, guc_mode="set"): + def remove_cross_cluster_replinfos(self, guc_mode="set"): """ - Remove retreaming disaster repl infos from all instances + Remove cross_cluster_replinfos from all instances """ params = [] dn_instances = [inst for node in self.cluster_info.dbNodes @@ -1773,10 +1711,10 @@ class DoradoDisasterRecoveryBase(object): params.append((inst, guc_mode, dn_num)) if params: self.logger.log("Starting remove all node dn instances repl infos.") - parallelTool.parallelExecute(self.__remove_streaming_repl_info, params) + parallelTool.parallelExecute(self.__remove_cross_cluster_replinfo, params) self.logger.log("Successfully remove all node dn instances repl infos.") - def remove_streaming_cluster_file(self): + def remove_ddr_cluster_file(self): """ function: remove the parameter file for config pg_hba :return: NA @@ -1792,7 +1730,7 @@ class DoradoDisasterRecoveryBase(object): "Failed to remove cluster file with error:%s" % error) self.logger.log("Finished remove cluster file.") - def remove_streaming_pg_hba(self, ignore_error=False): + def remove_pg_hba(self, ignore_error=False): """ Remove remote ips from pg hba of streaming disaster """ @@ -1804,7 +1742,7 @@ class DoradoDisasterRecoveryBase(object): data_ip = node_info.get("dataIp") remove_ips.append(data_ip) remove_ips = list(set(remove_ips)) - host_names = self.get_all_connection_node_name("remove_streaming_pg_hba") + host_names = self.get_all_connection_node_name("remove_pg_hba") self.logger.debug("Remove ips:%s from pg_hba on nodes:%s" % ( str(remove_ips), str(host_names))) cmd = "%s -U '%s' -l '%s'" % (OMCommand.getLocalScript("Local_Config_Hba"), diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py index a80c0aa6..ea50f564 100644 --- a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py @@ -208,5 +208,5 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): self._seventh_step_for_ddr_start(step) self._eighth_step_for_ddr_start(step) self._ninth_step_for_ddr_start(step) - self.logger.log("Successfully do streaming disaster recovery start.") + self.logger.log("Successfully do dorado disaster recovery start.") \ No newline at end of file diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_stop.py 
b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_stop.py index be1c289e..881aba57 100644 --- a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_stop.py +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_stop.py @@ -16,8 +16,8 @@ # MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. # See the Mulan PSL v2 for more details. # ---------------------------------------------------------------------------- -# Description : streaming_disaster_recovery_stop.py is a utility for stopping -# streaming disaster recovery on primary cluster. +# Description : dorado_disaster_recovery_stop.py is a utility for stopping +# dorado disaster recovery on primary cluster. from impl.dorado_disaster_recovery.ddr_base import DoradoDisasterRecoveryBase @@ -26,80 +26,79 @@ class DisasterRecoveryStopHandler(DoradoDisasterRecoveryBase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - def _first_step_for_streaming_stop(self, step): + def _first_step_for_ddr_stop(self, step): """ - First step for streaming stop + First step for ddr stop """ if step >= 2: return - self.logger.debug("Start first step of streaming stop.") + self.logger.debug("Start first step of dorado disaster recovery stop.") self.init_cluster_status() self.check_action_and_mode() - def _second_step_for_streaming_stop(self, step): + def _second_step_for_ddr_stop(self, step): """ - Second step for streaming stop + Second step for ddr stop """ if step >= 2: return - self.logger.debug("Start second step of streaming start.") + self.logger.debug("Start second step of dorado disaster recovery stop.") self.check_cluster_status(status_allowed=['Normal']) self.check_cluster_type(allowed_type='primary') self.check_is_under_upgrade() self.write_dorado_step("2_check_cluster_step") - def _third_step_for_streaming_stop(self, step): + def _third_step_for_ddr_stop(self, step): """ - Third step for streaming stop + Third step for ddr stop """ if step >= 3: return - self.logger.debug("Start third step of streaming stop.") - self.remove_all_stream_repl_infos(guc_mode="reload") - self.remove_streaming_cluster_file() + self.logger.debug("Start third step of dorado disaster recovery stop.") + self.remove_cross_cluster_replinfos(guc_mode="reload") + self.remove_ddr_cluster_file() self.write_dorado_step("3_remove_config_step") - def _fourth_step_for_streaming_stop(self, step): + def _fourth_step_for_ddr_stop(self, step): """ - Fourth step for streaming stop + Fourth step for ddr stop """ if step >= 4: return - self.logger.debug("Start fourth step of streaming stop.") - self.remove_streaming_pg_hba() + self.logger.debug("Start fourth step of dorado disaster recovery stop.") + self.remove_pg_hba() self.restore_guc_params() self.write_dorado_step("4_remove_pg_hba_step") - def _fifth_step_for_streaming_stop(self, step): + def _fifth_step_for_ddr_stop(self, step): """ - Fifth step for streaming stop + Fifth step for ddr stop """ if step >= 5: return - self.logger.debug("Start fifth step of streaming start.") - self.streaming_clean_replication_slot() + self.logger.debug("Start fifth step of dorado disaster recovery start.") + #self.streaming_clean_replication_slot() self.write_dorado_step("5_update_config_step") - def _sixth_step_for_streaming_stop(self, step): + def _sixth_step_for_ddr_stop(self, step): """ - Sixth step for streaming stop + Sixth step for ddr stop """ if step >= 6: return - self.logger.debug("Start sixth step of streaming stop.") + self.logger.debug("Start sixth 
step of dorado disaster recovery stop.") self.check_cluster_status(['Normal']) - self.clean_global_config() self.update_dorado_info("cluster", "normal") self.clean_dorado_dir() def run(self): - self.logger.log("Start remove streaming disaster relationship.") + self.logger.log("Start remove dorado disaster relationship.") step = self.query_dorado_step() - self._first_step_for_streaming_stop(step) + self._first_step_for_ddr_stop(step) self.parse_cluster_status() - self._second_step_for_streaming_stop(step) - self._third_step_for_streaming_stop(step) - self._fourth_step_for_streaming_stop(step) - self._fifth_step_for_streaming_stop(step) - self._sixth_step_for_streaming_stop(step) - self.logger.log("Successfully do streaming disaster recovery stop.") + self._second_step_for_ddr_stop(step) + self._third_step_for_ddr_stop(step) + self._fourth_step_for_ddr_stop(step) + self._fifth_step_for_ddr_stop(step) + self._sixth_step_for_ddr_stop(step) + self.logger.log("Successfully do dorado disaster recovery stop.") -- Gitee From fe32d53eedbac71e0d50296995c19241ffba8c02 Mon Sep 17 00:00:00 2001 From: Hao Date: Fri, 18 Aug 2023 11:57:39 +0800 Subject: [PATCH 13/23] bugfix start --- script/gs_ddr | 2 +- script/impl/dorado_disaster_recovery/ddr_base.py | 4 ++-- ...recovery_start.py => dorado_disaster_recovery_start.py} | 7 ++++--- 3 files changed, 7 insertions(+), 6 deletions(-) rename script/impl/dorado_disaster_recovery/ddr_modules/{dorado_diaster_recovery_start.py => dorado_disaster_recovery_start.py} (97%) diff --git a/script/gs_ddr b/script/gs_ddr index 3e699349..3fb5047f 100644 --- a/script/gs_ddr +++ b/script/gs_ddr @@ -31,7 +31,7 @@ from base_utils.os.user_util import UserUtil from domain_utils.cluster_file.cluster_log import ClusterLog from impl.dorado_disaster_recovery.params_handler import ParamsHandler from impl.dorado_disaster_recovery.ddr_modules.\ - dorado_diaster_recovery_start import DisasterRecoveryStartHandler + dorado_disaster_recovery_start import DisasterRecoveryStartHandler from impl.dorado_disaster_recovery.ddr_modules.\ dorado_disaster_recovery_stop import DisasterRecoveryStopHandler from impl.dorado_disaster_recovery.ddr_modules.\ diff --git a/script/impl/dorado_disaster_recovery/ddr_base.py b/script/impl/dorado_disaster_recovery/ddr_base.py index 863bb2ed..48619e68 100644 --- a/script/impl/dorado_disaster_recovery/ddr_base.py +++ b/script/impl/dorado_disaster_recovery/ddr_base.py @@ -613,7 +613,7 @@ class DoradoDisasterRecoveryBase(object): def check_dn_instance_params(self): """set_dn_instance_params""" - check_dick = {"enable_dcf": "off"} + check_dick = {"ha_module_debug ": "off"} dn_insts = [dn_inst for db_node in self.cluster_info.dbNodes for dn_inst in db_node.datanodes] primary_dn_insts = [inst for inst in dn_insts if inst.instanceId in self.primary_dn_ids] @@ -1689,7 +1689,7 @@ class DoradoDisasterRecoveryBase(object): raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + " Error: \n%s " % output) self.logger.debug("Successfully remove cross_cluster_replconninfo with cmd:%s." 
- % set_cmd) + % cmd) self.logger.debug("Successfully removed replconninfo for instance:%s" % dn_inst.instanceId) diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_start.py similarity index 97% rename from script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py rename to script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_start.py index ea50f564..0bfb7ecd 100644 --- a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_diaster_recovery_start.py +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_start.py @@ -16,7 +16,7 @@ # MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. # See the Mulan PSL v2 for more details. # ---------------------------------------------------------------------------- -# Description : streaming_disaster_recovery_start.py is utility for creating +# Description : dorado_disaster_recovery_start.py is utility for creating # relationship between primary cluster and standby cluster. import os @@ -60,7 +60,7 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): "check cm_ctl is available for current cluster") self.check_is_under_upgrade() #检查dn的GUC参数 - #self.check_dn_instance_params() + self.check_dn_instance_params() self.write_dorado_step("2_check_cluster_step") def common_step_for_ddr_start(self): @@ -200,7 +200,8 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): self._second_step_for_ddr_start(step) #更新pg_hba和replinfo self.common_step_for_ddr_start() - + self._third_step_for_ddr_start(step) + self._fourth_step_for_ddr_start(step) self._fifth_step_for_ddr_start(step) #设置CM backup_open参数,灾备backup_open=1, 主集群backup_open=0 self._sixth_step_for_ddr_start(step) -- Gitee From 873afa75ee4a70241b83ec00832051b7768d8772 Mon Sep 17 00:00:00 2001 From: chuanglichuangwai Date: Fri, 18 Aug 2023 14:48:12 +0800 Subject: [PATCH 14/23] =?UTF-8?q?switchover=20=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../impl/dorado_disaster_recovery/ddr_base.py | 82 +++++++------------ .../dorado_disaster_recovery_switchover.py | 42 ++++------ .../params_handler.py | 2 +- 3 files changed, 46 insertions(+), 80 deletions(-) diff --git a/script/impl/dorado_disaster_recovery/ddr_base.py b/script/impl/dorado_disaster_recovery/ddr_base.py index a43dfdc4..b554baa2 100644 --- a/script/impl/dorado_disaster_recovery/ddr_base.py +++ b/script/impl/dorado_disaster_recovery/ddr_base.py @@ -21,6 +21,7 @@ import json import os import re +import subprocess import sys import time from datetime import datetime @@ -98,7 +99,7 @@ class DoradoDisasterRecoveryBase(object): self.is_single_inst = True if self.cluster_info.isSingleInstCluster() else None self.cluster_node_names = self.cluster_info.getClusterNodeNames() self.dorado_file_dir = os.path.join(self.pg_host, DoradoDisasterRecoveryConstants.DDR_FILES_DIR) - self.streaming_xml = os.path.join(self.dorado_file_dir, + self.dorado_xml = os.path.join(self.dorado_file_dir, DoradoDisasterRecoveryConstants.STREAMING_CONFIG_XML) self.ssh_tool = SshTool(self.cluster_node_names, self.log_file) self.mpp_file = EnvUtil.getMpprcFile() @@ -403,9 +404,9 @@ class DoradoDisasterRecoveryBase(object): parallelTool.parallelExecute(self.stream_clean_gs_secure, params) self.logger.debug("Finished clean gs secure dir.") - def remove_streaming_dir(self, dir_path): + def remove_dorado_dir(self, 
dir_path): """ - Remove streaming files dir + Remove dorado files dir """ cmd = "if [ -d %s ]; then rm %s -rf;fi" % (dir_path, self.dorado_file_dir) self.ssh_tool.executeCommand(cmd) @@ -1080,6 +1081,23 @@ class DoradoDisasterRecoveryBase(object): self.logger.debug( "Successfully set all datanode guc param in postgres conf for cross_cluster_replconninfo.") + def set_datanode_guc(self, guc_parameter, guc_value, guc_type, only_mode=None): + """ + set datanode guc param + :return: NA + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Set datanode guc [%s] to [%s] not for mode:%s." + % (guc_parameter, guc_value, self.params.mode)) + return + cmd = "gs_guc %s -Z datanode -N all -I all -c \"%s=%s\" " % \ + (guc_type, guc_parameter, guc_value) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + msg = ErrorCode.GAUSS_516['GAUSS_51632'] \ + % "set datanode guc [%s] to [%s], output:%s" \ + % (guc_parameter, guc_value, output) + self.logger.debug(msg) def set_cmserver_guc(self, guc_parameter, guc_value, guc_type, only_mode=None): """ @@ -1415,7 +1433,7 @@ class DoradoDisasterRecoveryBase(object): """ Clean flag file """ - flag_file = os.path.join(self.step_file_path, "remote_replication_pairs_done") + flag_file = os.path.join(self.dorado_file_dir, "remote_replication_pairs_done") if os.path.exists(flag_file): self.logger.debug("Successfully removed flag file %s." % flag_file) os.remove(flag_file) @@ -1973,7 +1991,7 @@ class DoradoDisasterRecoveryBase(object): (len(localRole) != 1 or localRole[0] != "Standby"): check_ok = -1 else: - raise Exception(ErrorCode.GAUSS_521["GAUSS_52102"] % state) + raise Exception(ErrorCode.GAUSS_521["F"] % state) else: check_ok = status @@ -1993,10 +2011,10 @@ class DoradoDisasterRecoveryBase(object): if len(host_names) != len(self.cluster_node_names): raise Exception(ErrorCode.GAUSS_506["GAUSS_50623"] % host_names) check_params = [] - all_instances = [dn_inst for db_node in self.cluster_info.dbNodes + all_instances = [(db_node.name, dn_inst) for db_node in self.status_info.dbNodes for dn_inst in db_node.datanodes] - for dn_inst in all_instances: - check_params.append([dn_inst.state, dn_inst.hostname, dn_inst.datadir]) + for host_name, dn_inst in all_instances: + check_params.append([dn_inst.status, host_name, dn_inst.datadir]) if len(check_params) <= 0: raise Exception(ErrorCode.GAUSS_516["GAUSS_51620"] % "cluster") while True: @@ -2016,7 +2034,7 @@ class DoradoDisasterRecoveryBase(object): if check_status == 0: break if check_status != 0: - if dorado_switchover == "dorado_switchover": + if dorado_switchover == "disaster_switchover": raise Exception( ErrorCode.GAUSS_516["GAUSS_51659"] % "gs_ctl query") self.logger.logExit( @@ -2056,48 +2074,6 @@ class DoradoDisasterRecoveryBase(object): self._failover_config_step(dorado_disaster_step, action_flag) self._failover_start_step(dorado_disaster_step, action_flag) - def check_dorado_datanode_query_info(self, timeout=DefaultValue.TIMEOUT_CLUSTER_START, - dorado_switchover=None): - """ - check gs_ctl query info - """ - self.logger.debug("Waiting for gs_ctl query status being satisfied.") - end_time = None if timeout <= 0 else datetime.now() + timedelta(seconds=timeout) - - host_names = self.get_all_connection_node_name() - if len(host_names) != len(self.cluster_node_names): - raise Exception(ErrorCode.GAUSS_506["GAUSS_50623"] % host_names) - check_params = [] - all_instances = [dn_inst for db_node in self.cluster_info.dbNodes - for dn_inst in db_node.datanodes] - for dn_inst in 
all_instances: - check_params.append([dn_inst.state, dn_inst.hostname, dn_inst.datadir]) - if len(check_params) <= 0: - raise Exception(ErrorCode.GAUSS_516["GAUSS_51620"] % "cluster") - while True: - check_status = 0 - time.sleep(10) - if end_time is not None and datetime.now() >= end_time: - check_status = 1 - self.logger.debug("Timeout. The gs_ctl query command cannot obtain the expected status.") - break - results = parallelTool.parallelExecute( - self.check_datanode_query_info, check_params) - for ret in results: - if ret[0] != 0: - self.logger.debug("Failed to check node[%s] info using \"gs_ctl query\" command " - "with status[%s], output[%s]" % (ret[-1], ret[0], ret[1])) - check_status = 1 - if check_status == 0: - break - if check_status != 0: - if dorado_switchover == "dorado_switchover": - raise Exception( - ErrorCode.GAUSS_516["GAUSS_51659"] % "gs_ctl query") - self.logger.logExit( - ErrorCode.GAUSS_516["GAUSS_51659"] % "gs_ctl query") - self.logger.debug("Successfully wait for gs_ctl query status become Normal.", "constant") - def _failover_start_step(self, dorado_disaster_step, action_flag): """ Failover step 5 & 6 @@ -2116,12 +2092,12 @@ class DoradoDisasterRecoveryBase(object): self.check_cluster_status(cluster_normal_status, check_current=True) if action_flag != DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER: self.check_dorado_datanode_query_info(timeout=30, - dorado_switchover="dorado_failover") + dorado_switchover="disaster_failover") self.update_dorado_info(DoradoDisasterRecoveryConstants.ACTION_FAILOVER, "100%") self.update_dorado_info("cluster", "normal") else: self.check_dorado_datanode_query_info(timeout=30, - dorado_switchover="dorado_switchover") + dorado_switchover="disaster_failover") self.update_dorado_info(DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER, "100%") self.update_dorado_info("cluster", "archive") diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_switchover.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_switchover.py index 3a2c077f..2f878de5 100644 --- a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_switchover.py +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_switchover.py @@ -42,7 +42,7 @@ class DisasterRecoverySwitchoverHandler(DoradoDisasterRecoveryBase): self.remote_replication_pairs_log_message = \ "Please configure \"Remote Replication Pairs\" correctly on "\ "And check and grant appropriate permissions to the corresponding device files.\n"\ - "to inform the tool and execute the tool again." + "Create file \"%s\" to mark the completion of the above operations and execute the tool again." 
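The switchover flow below pauses after stopping the cluster and waits for the operator to configure the storage-side Remote Replication Pairs, using a flag file named remote_replication_pairs_done under the tool's working directory as the handshake. A simplified, self-contained sketch of that handshake follows; the flag-file name matches the patch, while the working directory, message text and log callable are placeholders.

    # Sketch of the remote_replication_pairs_done handshake used by the switchover
    # handler. Only the flag-file name comes from the patch; the rest is illustrative.
    import os
    import sys

    def wait_for_replication_pairs(work_dir, prompt_message, log=print):
        """Exit and prompt the operator until the completion flag file exists."""
        flag_file = os.path.join(work_dir, "remote_replication_pairs_done")
        if not os.path.exists(flag_file):
            # The operator configures Remote Replication Pairs on the storage array,
            # creates the flag file, then re-runs gs_ddr to resume the switchover.
            log(prompt_message % flag_file)
            sys.exit(0)
        return flag_file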
def run(self): """ @@ -50,7 +50,7 @@ class DisasterRecoverySwitchoverHandler(DoradoDisasterRecoveryBase): """ self.logger.log("Start dorado disaster switchover.") self.check_action_and_mode() - self.check_switchover_workable() + # self.check_switchover_workable() self.check_dn_instance_params() self.check_is_under_upgrade() try: @@ -67,18 +67,6 @@ class DisasterRecoverySwitchoverHandler(DoradoDisasterRecoveryBase): self.remove_cluster_maintance_file() self.logger.log("Successfully do dorado disaster recovery switchover.") - def check_xlog_file_path(self): - """ - get and check xlog_file_path - """ - linkDev = self.dorado_info - if os.path.islink(linkDev): - linkDev = os.readlink(self.dorado_info) - if not os.access(linkDev, os.R_OK | os.W_OK): - self.logger.debug(ErrorCode.GAUSS_501("GAUSS_50113") % self.user) - return False - return True - def dorado_switchover_single_inst(self): """ dorado disaster recovery switchover for single_inst cluster @@ -97,20 +85,21 @@ class DisasterRecoverySwitchoverHandler(DoradoDisasterRecoveryBase): self.add_cluster_maintance_file_for_switchover() try: if dorado_disaster_step < 1: - self.update_streaming_info(DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER, "10%") + self.update_dorado_info(DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER, "10%") + #self.check_switchover_workable() self.stop_cluster() self.write_dorado_step("1_dorado_disaster_stop_cluster_for_switchover") - flag_file = os.path.join(self.step_file_path, "remote_replication_pairs_done") + flag_file = os.path.join(self.dorado_file_dir, "remote_replication_pairs_done") if os.path.exists(flag_file): - self.logger.debug("Delete file %s." % flag_file) + self.logger.log("Delete file %s." % flag_file) os.remove(flag_file) - self.logger.debug(self.remote_replication_pairs_log_message % flag_file) + self.logger.log(self.remote_replication_pairs_log_message % flag_file) sys.exit(0) if dorado_disaster_step < 2: self.update_dorado_info(DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER, "30%") - flag_file = os.path.join(self.step_file_path, "remote_replication_pairs_done") - if not os.path.exists(flag_file) or not self.check_xlog_file_path(): - self.logger.debug(self.remote_replication_pairs_log_message % flag_file) + flag_file = os.path.join(self.dorado_file_dir, "remote_replication_pairs_done") + if not os.path.exists(flag_file): + self.logger.log(self.remote_replication_pairs_log_message % flag_file) sys.exit(0) self.write_dorado_step("2_set_remote_replication_pairs_for_switchover") if dorado_disaster_step < 3: @@ -126,9 +115,9 @@ class DisasterRecoverySwitchoverHandler(DoradoDisasterRecoveryBase): self.write_dorado_step("4_start_cluster_done") if dorado_disaster_step < 5: self.wait_for_normal(timeout=self.params.waitingTimeout, - dorado_switchover="dorado_switchover") + dorado_switchover="disaster_switchover") self.check_dorado_datanode_query_info(timeout=self.params.waitingTimeout, - dorado_switchover="dorado_switchover") + dorado_switchover="disaster_switchover") self.update_dorado_info("cluster", "recovery") except Exception as error: self.logger.error("Failed to do dorado disaster cluster switchover, Error:" @@ -137,7 +126,8 @@ class DisasterRecoverySwitchoverHandler(DoradoDisasterRecoveryBase): self.logger.debug("Roll back switchover step:%s" % rollback_step) self.remove_cluster_maintance_file_for_switchover() self.remove_cluster_maintance_file() - self.dorado_switchover_roll_back(update_query=True) + if rollback_step >= 2: + self.dorado_switchover_roll_back(update_query=True) 
self.clean_step_file() self.clean_flag_file() raise Exception(error) @@ -362,7 +352,7 @@ class DisasterRecoverySwitchoverHandler(DoradoDisasterRecoveryBase): cluster_status.clusterStatus, cluster_status.clusterStatusDetail)) if check_status != 0: - if dorado_switchover == "dorado_switchover": + if dorado_switchover == "disaster_switchover": raise Exception( ErrorCode.GAUSS_528["GAUSS_52800"] % (cluster_status.clusterStatus, cluster_status.clusterStatusDetail)) @@ -414,7 +404,7 @@ class DisasterRecoverySwitchoverHandler(DoradoDisasterRecoveryBase): "cluster mode") if update_query: self.update_dorado_info("cluster", "archive") - self.logger.debug("Please restore the original \"Remote Replication Pairs\" correctly on " + self.logger.log("Please restore the original \"Remote Replication Pairs\" correctly on " "the storage management interface.\n" "And check and grant appropriate permissions to the corresponding device files.\n" "After completing these steps, start the cluster manually !") diff --git a/script/impl/dorado_disaster_recovery/params_handler.py b/script/impl/dorado_disaster_recovery/params_handler.py index 8eabed13..6feadd46 100644 --- a/script/impl/dorado_disaster_recovery/params_handler.py +++ b/script/impl/dorado_disaster_recovery/params_handler.py @@ -338,7 +338,7 @@ class ParamsHandler(object): try: self.__parse_args() self.logger.log(DoradoDisasterRecoveryConstants.LOG_REMARK) - self.logger.log('Streaming disaster recovery ' + self.params.task + ' ' + self.trace_id) + self.logger.log('Dorado disaster recovery ' + self.params.task + ' ' + self.trace_id) self.logger.log(DoradoDisasterRecoveryConstants.LOG_REMARK) self.__init_default_params() #self.__reload_hadr_user_info() -- Gitee From cbc11076125af1d06d47a24a7728e36c4c4e842a Mon Sep 17 00:00:00 2001 From: chuanglichuangwai Date: Fri, 18 Aug 2023 16:05:06 +0800 Subject: [PATCH 15/23] =?UTF-8?q?switchover=20=E7=81=BE=E5=A4=87=E5=8D=87?= =?UTF-8?q?=E4=B8=BB=E6=B5=8B=E8=AF=95=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../impl/dorado_disaster_recovery/ddr_base.py | 127 +++++++++--------- 1 file changed, 64 insertions(+), 63 deletions(-) diff --git a/script/impl/dorado_disaster_recovery/ddr_base.py b/script/impl/dorado_disaster_recovery/ddr_base.py index 66cd94b8..e68d49b0 100644 --- a/script/impl/dorado_disaster_recovery/ddr_base.py +++ b/script/impl/dorado_disaster_recovery/ddr_base.py @@ -100,7 +100,7 @@ class DoradoDisasterRecoveryBase(object): self.cluster_node_names = self.cluster_info.getClusterNodeNames() self.dorado_file_dir = os.path.join(self.pg_host, DoradoDisasterRecoveryConstants.DDR_FILES_DIR) self.dorado_xml = os.path.join(self.dorado_file_dir, - DoradoDisasterRecoveryConstants.STREAMING_CONFIG_XML) + DoradoDisasterRecoveryConstants.STREAMING_CONFIG_XML) self.ssh_tool = SshTool(self.cluster_node_names, self.log_file) self.mpp_file = EnvUtil.getMpprcFile() self.dss_home_dir = self.cluster_info.dss_home @@ -572,7 +572,7 @@ class DoradoDisasterRecoveryBase(object): cluster_status = self.cluster_status if check_current: self.logger.debug("Starting check CLuster status") - check_cmd = "source %s && cm_ctl query | grep cluster_state | awk '{print $NF}'"\ + check_cmd = "source %s && cm_ctl query | grep cluster_state | awk '{print $NF}'" \ % self.mpp_file status, output = CmdUtil.retryGetstatusoutput(check_cmd) if status != 0: @@ -793,7 +793,6 @@ class DoradoDisasterRecoveryBase(object): % ("set wal_keep_segments for inst:%s" % inst.instanceId, 
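The hunks that follow only adjust whitespace around the GUC helpers. For context, a sketch of how set_xlog_file_path assembles its gs_guc call, matching the command shape shown earlier in this series; the environment file and xlog path values below are placeholders.

    # Sketch of the gs_guc invocation used by set_xlog_file_path; command shape
    # follows the patch, the argument values are placeholders.
    def build_xlog_file_path_cmd(mpp_file, xlog_file_path):
        """Build the gs_guc command that sets xlog_file_path on every datanode."""
        return "source %s && gs_guc set -Z datanode -N all -I all " \
               "-c \"xlog_file_path='%s'\"" % (mpp_file, xlog_file_path)

    print(build_xlog_file_path_cmd("~/.bashrc", "/dev/example_xlog_disk"))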
str(output))) self.logger.debug("Successfully [%s] shardNum [%s] node [%s] wal_keep_segments " "value [%s]." % (opt_type, inst.mirrorId, inst.hostname, value)) - def __set_dn_xlog_file_path(self, params_list): """ @@ -849,7 +848,7 @@ class DoradoDisasterRecoveryBase(object): self.logger.log("Starting set xlog_file_path param") cmd = "source %s && gs_guc set -Z datanode -N all -I all " \ "-c \"xlog_file_path='%s'\"" \ - % (self.mpp_file, xlog_file_path) + % (self.mpp_file, xlog_file_path) status, output = CmdUtil.retryGetstatusoutput(cmd) if status != 0: raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + "Error:%s" % output) @@ -875,7 +874,7 @@ class DoradoDisasterRecoveryBase(object): % ("set xlog_lock_file_path for inst:%s" % inst.instanceId, str(output))) self.logger.debug("Successfully [%s] shardNum [%s] node [%s] xlog_lock_file_path " "value [%s]." % (opt_type, inst.mirrorId, inst.hostname, value)) - + def __set_app_name_each_inst(self, params_list): """ Set xlog_lock_file_path value in each dn @@ -893,22 +892,22 @@ class DoradoDisasterRecoveryBase(object): % ("set application_name for inst:%s" % inst.instanceId, str(output))) self.logger.debug("Successfully [%s] shardNum [%s] node [%s] application_name " "value [%s]." % (opt_type, inst.mirrorId, inst.hostname, value)) - + def set_xlog_lock_file(self, opt_type="set"): """ guc set xlog_lock_file_path value in primary dn """ self.logger.log("Starting %s xlog_lock_file_path param" % (opt_type)) - params_list=[] + params_list = [] for dbnode in self.cluster_info.dbNodes: for inst in dbnode.datanodes: lock_file = os.path.join(inst.datadir, "xlog_lock_file") params_list.append((inst, opt_type, lock_file, self.mpp_file)) - + if not params_list: raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "obtain param list for set xlog_lock_file_path") - + parallelTool.parallelExecute(self.__set_xlog_lock_file_each_inst, params_list) self.logger.log("Successfully %s xlog_lock_file_path param." % (opt_type)) @@ -916,22 +915,22 @@ class DoradoDisasterRecoveryBase(object): """ guc set application_name value """ - self.logger.log("Starting set application_name param" ) + self.logger.log("Starting set application_name param") app_name_prefix = "dn_master" if self.params.mode == "primary" \ else "dn_standby" - params_list=[] + params_list = [] for dbnode in self.cluster_info.dbNodes: for inst in dbnode.datanodes: app_name = "%s_%s" % (app_name_prefix, inst.instanceId) params_list.append((inst, "set", app_name, self.mpp_file)) - + if not params_list: raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "obtain param list for set application_name") - + parallelTool.parallelExecute(self.__set_app_name_each_inst, params_list) - self.logger.log("Successfully set application_name param." 
) - + self.logger.log("Successfully set application_name param.") + def set_cluster_run_mode(self): """ guc set xlog_file_path value in primary dn @@ -995,8 +994,8 @@ class DoradoDisasterRecoveryBase(object): for remote_ip in remote_ips: cmd = "source %s ; gs_guc set -Z datanode -N all -I all -h " \ - "\"host all all %s/32 trust\"" \ - % (self.mpp_file, remote_ip) + "\"host all all %s/32 trust\"" \ + % (self.mpp_file, remote_ip) self.logger.debug("Update pg_hba.conf with cmd: %s" % cmd) status, output = CmdUtil.retryGetstatusoutput(cmd) if status != 0: @@ -1021,7 +1020,7 @@ class DoradoDisasterRecoveryBase(object): return data_ip raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] % "obtain shards from local cluster info") - + def __get_remote_ips(self): """ Get remote dn data ip @@ -1035,9 +1034,9 @@ class DoradoDisasterRecoveryBase(object): ip = node["ip"] data_ip = node["dataIp"] remote_ips.append(data_ip) - + return remote_ips - + def __config_one_dn_instance(self, params): """ Config cross_cluster_replconninfo for one dn instance @@ -1053,8 +1052,8 @@ class DoradoDisasterRecoveryBase(object): set_cmd = "source %s ; gs_guc set -N %s -D %s -c " \ "\"cross_cluster_replconninfo%s = 'localhost=%s localport=%s " \ "remotehost=%s remoteport=%s '\"" \ - % (self.mpp_file, inst.hostname, inst.datadir, idx, - local_dn_ip, local_port, remote_ip, remote_port) + % (self.mpp_file, inst.hostname, inst.datadir, idx, + local_dn_ip, local_port, remote_ip, remote_port) self.logger.debug("Set dn cross cluster replinfos with cmd:%s" % set_cmd) idx += 1 status, output = CmdUtil.retryGetstatusoutput(set_cmd) @@ -1063,14 +1062,13 @@ class DoradoDisasterRecoveryBase(object): " Error: \n%s " % output) self.logger.debug("Successfully rectify original repl infos for instance:%s." % inst.instanceId) - def config_cross_cluster_repl_info(self): """ update postgresql.conf for cross_cluster_replconninfo """ self.logger.debug("set all datanode guc param in postgres conf for cross_cluster_replconninfo.") - + opt_mode = "set" config_repl_params = [] datanode_instance = [inst for node in self.cluster_info.dbNodes for inst in node.datanodes] @@ -1078,9 +1076,10 @@ class DoradoDisasterRecoveryBase(object): for inst in datanode_instance: config_repl_params.append((inst, opt_mode)) rets = parallelTool.parallelExecute(self.__config_one_dn_instance, config_repl_params) - + self.logger.debug( "Successfully set all datanode guc param in postgres conf for cross_cluster_replconninfo.") + def set_datanode_guc(self, guc_parameter, guc_value, guc_type, only_mode=None): """ set datanode guc param @@ -1144,7 +1143,7 @@ class DoradoDisasterRecoveryBase(object): self.logger.debug("Start dssserver step is not for mode:%s." % self.params.mode) return primary_dn = [dn_inst for db_node in self.cluster_info.dbNodes for dn_inst in - db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] + db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] main_standby_inst = primary_dn[0] if self.local_host == main_standby_inst.hostname: @@ -1156,14 +1155,15 @@ class DoradoDisasterRecoveryBase(object): cmd = "source %s; pssh -s -t 5 -H %s \"source %s; export DSS_MAINTAIN=TRUE && " \ "nohup dssserver -D $DSS_HOME >/dev/null 2>&1 & \"" \ % (self.mpp_file, main_standby_inst.hostname) - + self.logger.debug("Start dssserver on node [%s],cmd: %s." 
% (main_standby_inst.hostname, cmd)) proc = FastPopen(cmd) out, err = proc.communicate() if proc.returncode != 0: raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] + - 'Start dssserver on node [{}] Error: {}'.format(main_standby_inst.hostname, str(err + out).strip())) - + 'Start dssserver on node [{}] Error: {}'.format(main_standby_inst.hostname, + str(err + out).strip())) + self.logger.log("Successfully Start dssserver on node [%s] " % main_standby_inst.hostname) def build_main_standby_datanode(self, only_mode=None): @@ -1174,24 +1174,26 @@ class DoradoDisasterRecoveryBase(object): self.logger.debug("Build Main standby step is not for mode:%s." % self.params.mode) return primary_dn = [dn_inst for db_node in self.cluster_info.dbNodes for dn_inst in - db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] + db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] main_standby_inst = primary_dn[0] if self.local_host == main_standby_inst.hostname: build_cmd = "source %s; gs_ctl build -D %s -b cross_cluster_full -g 0 -q -t %s" \ - % (self.mpp_file, main_standby_inst.datadir, DoradoDisasterRecoveryConstants.MAX_BUILD_TIMEOUT) + % (self.mpp_file, main_standby_inst.datadir, DoradoDisasterRecoveryConstants.MAX_BUILD_TIMEOUT) else: build_cmd = "source %s; pssh -s -t %s -H %s \"source %s;" \ - " gs_ctl build -D %s -b cross_cluster_full -g 0 -q -t %s \"" \ - % (self.mpp_file, DoradoDisasterRecoveryConstants.MAX_BUILD_TIMEOUT + 10, main_standby_inst.hostname, - self.mpp_file, main_standby_inst.datadir, DoradoDisasterRecoveryConstants.MAX_BUILD_TIMEOUT) - self.logger.debug("Build Main standby datanode on node [%s],cmd: %s." % (main_standby_inst.hostname, build_cmd)) + " gs_ctl build -D %s -b cross_cluster_full -g 0 -q -t %s \"" \ + % (self.mpp_file, DoradoDisasterRecoveryConstants.MAX_BUILD_TIMEOUT + 10, + main_standby_inst.hostname, + self.mpp_file, main_standby_inst.datadir, DoradoDisasterRecoveryConstants.MAX_BUILD_TIMEOUT) + self.logger.debug("Build Main standby datanode on node [%s],cmd: %s." % (main_standby_inst.hostname, build_cmd)) status, output = CmdUtil.retry_util_timeout(build_cmd, self.params.waitingTimeout) if status != 0: raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % build_cmd + "Options:%s, Error: \n%s " % ("build main_standby on node :%s" % main_standby_inst.hostname, str(output))) - self.logger.debug("Successfully build main_standby in disaster standby cluster on node [%s] " % main_standby_inst.hostname) + self.logger.debug( + "Successfully build main_standby in disaster standby cluster on node [%s] " % main_standby_inst.hostname) def kill_dss_instance(self, only_mode=None): """ @@ -1201,23 +1203,22 @@ class DoradoDisasterRecoveryBase(object): self.logger.debug("Kill dssserver process step is not for mode:%s." % self.params.mode) return primary_dn = [dn_inst for db_node in self.cluster_info.dbNodes for dn_inst in - db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] + db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] main_standby_inst = primary_dn[0] if self.local_host == main_standby_inst.hostname: kill_cmd = "source %s; pkill -9 -f dssserver" % (self.mpp_file) else: kill_cmd = "source %s; pssh -s -t 3 -H %s \"pkill -9 -f dssserver\"" \ - % (self.mpp_file, main_standby_inst.hostname) - self.logger.debug("Kill dssserver on node [%s],cmd: %s." % (main_standby_inst.hostname, kill_cmd)) + % (self.mpp_file, main_standby_inst.hostname) + self.logger.debug("Kill dssserver on node [%s],cmd: %s." 
% (main_standby_inst.hostname, kill_cmd)) sts, out = CmdUtil.getstatusoutput_by_fast_popen(kill_cmd) if sts not in [0, 1]: raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "kill dssserver before start cluster on node:" + main_standby_inst.hostname + - ", output:"+str(out).strip()) + ", output:" + str(out).strip()) self.logger.log("Successfully kill dssserver before start cluster on node [%s] " % main_standby_inst.hostname) - def query_cluster(self): """ query cluster @@ -1248,7 +1249,7 @@ class DoradoDisasterRecoveryBase(object): status, output = CmdUtil.retryGetstatusoutput(cmd, retry_time=0) if status != 0: error_str = ErrorCode.GAUSS_516["GAUSS_51607"] % "the cluster" + \ - " Error:\n%s." % output + " Error:\n%s." % output self.logger.debug(error_str) self.logger.log("Warning: the cluster is not normal, please check cluster status!") else: @@ -1296,10 +1297,10 @@ class DoradoDisasterRecoveryBase(object): return self.primary_dn_ids = p_inst_list sql_check = "select 1 from pg_catalog.pg_stat_get_wal_senders() where " \ - "sync_state='Async' and peer_role='StandbyCluster_Standby' and peer_state='Normal';" + "sync_state='Async' and peer_role='StandbyCluster_Standby' and peer_state='Normal';" param_list = [(dn_inst, sql_check) for db_node in self.cluster_info.dbNodes for dn_inst in db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] - + if not param_list: raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "obtain param list for check main standby connection on primary dn") @@ -1699,16 +1700,16 @@ class DoradoDisasterRecoveryBase(object): for idx in range(1, dn_num + 1): cmd = "source %s ; gs_guc %s -N %s -D %s -c " \ - "\"cross_cluster_replconninfo%s\"" \ - % (self.mpp_file, guc_mode, dn_inst.hostname, dn_inst.datadir, idx) + "\"cross_cluster_replconninfo%s\"" \ + % (self.mpp_file, guc_mode, dn_inst.hostname, dn_inst.datadir, idx) self.logger.debug("Remove dn cross_cluster_replconninfo with cmd:%s" % cmd) status, output = CmdUtil.retryGetstatusoutput(cmd) if status != 0: raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + " Error: \n%s " % output) self.logger.debug("Successfully remove cross_cluster_replconninfo with cmd:%s." - % cmd) - + % cmd) + self.logger.debug("Successfully removed replconninfo for instance:%s" % dn_inst.instanceId) def remove_cross_cluster_replinfos(self, guc_mode="set"): @@ -1888,7 +1889,6 @@ class DoradoDisasterRecoveryBase(object): "error:%s." % (value, str(error))) self.logger.debug("Successfully create cluster_maintance file.") - def check_datanode_query_info(self, params): """ check datanode info by "gs_ctl query" command. 
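# Illustrative sketch (editor's addition, not taken from the patch):
# check_datanode_query_info() works by parsing the text output of "gs_ctl query"
# with regular expressions (db_state / local_role / peer_role / peer_state /
# channel) and then validating the expected role combination. The standalone
# sketch below shows that parsing against a hypothetical sample output; the
# sample text and the health rule are simplified stand-ins, not values from
# the patch.
import re

SAMPLE_OUTPUT = """\
 local_role                     : Primary
 db_state                       : Normal
 peer_role                      : StandbyCluster_Standby
 peer_state                     : Normal
 channel                        : 10.10.10.1:5432-->10.10.10.2:5432
 local_role                     : Primary
"""

def parse_query_output(output):
    """Collect the interesting fields from a gs_ctl query style text dump."""
    return {
        "db_state": re.findall(r"db_state.*: (.*?)\n", output),
        "local_role": re.findall(r"local_role.*: (.*?)\n", output),
        "peer_role": re.findall(r"peer_role.*: (.*?)\n", output),
        "peer_state": re.findall(r"peer_state.*: (.*?)\n", output),
        "channel": re.findall(r"channel.*: (.*?)\n", output),
    }

def primary_side_is_healthy(fields):
    """Loose version of the Primary branch: one Normal db_state, two Primary
    local_role entries, a Normal standby-cluster peer and an outgoing channel."""
    return (fields["db_state"] == ["Normal"]
            and fields["local_role"] == ["Primary", "Primary"]
            and fields["peer_role"] == ["StandbyCluster_Standby"]
            and fields["peer_state"] == ["Normal"]
            and len(fields["channel"]) == 1 and "-->" in fields["channel"][0])

print(primary_side_is_healthy(parse_query_output(SAMPLE_OUTPUT)))   # True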
@@ -1936,7 +1936,7 @@ class DoradoDisasterRecoveryBase(object): return check_ok, output, dest_ip def check_dorado_datanode_query_info(self, timeout=DefaultValue.TIMEOUT_CLUSTER_START, - dorado_switchover=None): + dorado_switchover=None): """ check gs_ctl query info """ @@ -1966,7 +1966,7 @@ class DoradoDisasterRecoveryBase(object): self.check_datanode_query_info, check_params) for ret in results: if ret[0] != 0: - self.logger.debug("Failed to check node[%s] info using \"gs_ctl query\" command " + self.logger.log("Failed to check node[%s] info using \"gs_ctl query\" command " "with status[%s], output[%s]" % (ret[-1], ret[0], ret[1])) check_status = 1 if check_status == 0: @@ -1996,17 +1996,17 @@ class DoradoDisasterRecoveryBase(object): self.parse_cluster_status() self.stop_cluster() self.write_dorado_step("0_dorado_disaster_stop_cluster_for_failover") - flag_file = os.path.join(self.step_file_path, "remote_replication_pairs_done") + flag_file = os.path.join(self.dorado_file_dir, "remote_replication_pairs_done") if os.path.exists(flag_file): - self.logger.debug("Delete file %s." % flag_file) + self.logger.log("Delete file %s." % flag_file) os.remove(flag_file) - self.logger.debug(self.remote_replication_pairs_log_message % flag_file) + self.logger.log(self.remote_replication_pairs_log_message % flag_file) sys.exit(0) if dorado_disaster_step < 1: # 标志文件存在,检查远程复制的lun设备权限,更新进度,代表 "远程复制Pair"任务完成 - flag_file = os.path.join(self.step_file_path, "remote_replication_pairs_done") - if not os.path.exists(flag_file) or not self.check_xlog_file_path(): - self.logger.debug(self.remote_replication_pairs_log_message % flag_file) + flag_file = os.path.join(self.dorado_file_dir, "remote_replication_pairs_done") + if not os.path.exists(flag_file): + self.logger.log(self.remote_replication_pairs_log_message % flag_file) sys.exit(0) self.write_dorado_step("1_set_remote_replication_pairs_for_failover") self._failover_config_step(dorado_disaster_step, action_flag) @@ -2029,13 +2029,14 @@ class DoradoDisasterRecoveryBase(object): cluster_normal_status = [DefaultValue.CLUSTER_STATUS_NORMAL] self.check_cluster_status(cluster_normal_status, check_current=True) if action_flag != DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER: - self.check_dorado_datanode_query_info(timeout=30, - dorado_switchover="disaster_failover") + # 没有流复制信息检查 + # self.check_dorado_datanode_query_info(timeout=30, + # dorado_switchover="disaster_failover") self.update_dorado_info(DoradoDisasterRecoveryConstants.ACTION_FAILOVER, "100%") self.update_dorado_info("cluster", "normal") else: - self.check_dorado_datanode_query_info(timeout=30, - dorado_switchover="disaster_failover") + self.check_dorado_datanode_query_info(timeout=self.params.waitingTimeout, + dorado_switchover="disaster_switchover") self.update_dorado_info(DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER, "100%") self.update_dorado_info("cluster", "archive") @@ -2135,8 +2136,8 @@ class DoradoDisasterRecoveryBase(object): % (self.mpp_file, bin_path, opt_type, data_dir, max_term) else: cmd_config = "source %s; pssh -s -t 900 -H %s \"source %s; %s/gs_ctl notify%s -D %s " \ - "-M primary -T %s -t 600\"" % (self.mpp_file, self.mpp_file, hostname, - bin_path, opt_type, data_dir, max_term) + "-M primary -T %s -t 600\"" % (self.mpp_file, self.mpp_file, hostname, + bin_path, opt_type, data_dir, max_term) self.logger.debug("Config primary dn with cmd:%s" % cmd_config) status, output = CmdUtil.retryGetstatusoutput(cmd_config) if status != 0: -- Gitee From 
9a3683c443b2f37e3effe901a2b8cfc06d8df356 Mon Sep 17 00:00:00 2001 From: chuanglichuangwai Date: Fri, 18 Aug 2023 18:52:18 +0800 Subject: [PATCH 16/23] =?UTF-8?q?switchover=20=E6=B5=8B=E8=AF=95=E4=BF=AE?= =?UTF-8?q?=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- script/impl/dorado_disaster_recovery/ddr_base.py | 10 +++++----- .../dorado_disaster_recovery_switchover.py | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/script/impl/dorado_disaster_recovery/ddr_base.py b/script/impl/dorado_disaster_recovery/ddr_base.py index e68d49b0..21811a23 100644 --- a/script/impl/dorado_disaster_recovery/ddr_base.py +++ b/script/impl/dorado_disaster_recovery/ddr_base.py @@ -1906,7 +1906,7 @@ class DoradoDisasterRecoveryBase(object): dbState = re.findall(r"db_state.*: (.*?)\n", output) localRole = re.findall(r"local_role.*: (.*?)\n", output) peerRole = re.findall(r"peer_role.*: (.*?)\n", output) - preeState = re.findall(r"pree_state.*: (.*?)\n", output) + peerState = re.findall(r"peer_state.*: (.*?)\n", output) channel = re.findall(r"channel.*: (.*?)\n", output) if status == 0: check_ok = 0 @@ -1914,14 +1914,14 @@ class DoradoDisasterRecoveryBase(object): if (len(dbState) != 1 or dbState[0] != "Normal") or \ (len(localRole) != 2 or localRole[0] != "Primary" or localRole[1] != "Primary") or \ (len(peerRole) != 1 or peerRole[0] != "StandbyCluster_Standby") or \ - (len(preeState) != 1 or preeState[0] != "Normal") or \ + (len(peerState) != 1 or peerState[0] != "Normal") or \ (len(channel) != 1 or "-->" not in channel[0]): check_ok = -1 elif state == "Main Standby": if (len(dbState) != 1 or dbState[0] != "Normal") or \ (len(localRole) != 2 or localRole[0] != "Main Standby" or localRole[1] != "Standby") or \ (len(peerRole) != 1 or peerRole[0] != "Primary") or \ - (len(preeState) != 1 or preeState[0] != "Normal") or \ + (len(peerState) != 1 or peerState[0] != "Normal") or \ (len(channel) != 1 or "<--" not in channel[0]): check_ok = -1 elif state == "Standby": @@ -1969,8 +1969,8 @@ class DoradoDisasterRecoveryBase(object): self.logger.log("Failed to check node[%s] info using \"gs_ctl query\" command " "with status[%s], output[%s]" % (ret[-1], ret[0], ret[1])) check_status = 1 - if check_status == 0: - break + if check_status == 0: + break if check_status != 0: if dorado_switchover == "disaster_switchover": raise Exception( diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_switchover.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_switchover.py index 2f878de5..56f7e41b 100644 --- a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_switchover.py +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_switchover.py @@ -124,12 +124,12 @@ class DisasterRecoverySwitchoverHandler(DoradoDisasterRecoveryBase): " \n%s" % str(error)) rollback_step = self.query_dorado_step() self.logger.debug("Roll back switchover step:%s" % rollback_step) - self.remove_cluster_maintance_file_for_switchover() - self.remove_cluster_maintance_file() - if rollback_step >= 2: - self.dorado_switchover_roll_back(update_query=True) - self.clean_step_file() - self.clean_flag_file() + #self.remove_cluster_maintance_file_for_switchover() + #self.remove_cluster_maintance_file() + #if rollback_step >= 2: + # self.dorado_switchover_roll_back(update_query=True) + #self.clean_step_file() + #self.clean_flag_file() raise Exception(error) 
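# Illustrative sketch (editor's addition, not taken from the patch): the hunks
# above touch the loop that keeps re-checking "gs_ctl query" results on every
# datanode until all of them look healthy or the waiting timeout expires. A
# minimal retry-until-timeout helper in the same spirit is sketched below; the
# command in the usage comment is a placeholder, not a value from the patch.
import subprocess
import time
from datetime import datetime, timedelta

def poll_until_ok(cmd, timeout_seconds, interval=5):
    """Run cmd until it exits 0 or the timeout expires; return the last result."""
    end_time = datetime.now() + timedelta(seconds=timeout_seconds)
    status, output = 1, ""
    while datetime.now() < end_time:
        proc = subprocess.run(cmd, shell=True, capture_output=True, text=True)
        status, output = proc.returncode, proc.stdout + proc.stderr
        if status == 0:
            break                       # healthy: stop polling
        time.sleep(interval)            # not healthy yet: wait and retry
    return status, output

# usage (placeholder command):
# poll_until_ok("cm_ctl query | grep -q 'cluster_state.*Normal'", 300)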
self.remove_hadr_switchover_process_file() -- Gitee From 962e4f15402e2b7f88bde827033b0c9d6e2fe0c4 Mon Sep 17 00:00:00 2001 From: chuanglichuangwai Date: Fri, 18 Aug 2023 21:33:18 +0800 Subject: [PATCH 17/23] =?UTF-8?q?failover=20=E6=B5=8B=E8=AF=95=E4=BF=AE?= =?UTF-8?q?=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../impl/dorado_disaster_recovery/ddr_base.py | 37 +++++++++++-------- .../dorado_disaster_recovery_switchover.py | 35 ++++++------------ 2 files changed, 32 insertions(+), 40 deletions(-) diff --git a/script/impl/dorado_disaster_recovery/ddr_base.py b/script/impl/dorado_disaster_recovery/ddr_base.py index 21811a23..b4c288fc 100644 --- a/script/impl/dorado_disaster_recovery/ddr_base.py +++ b/script/impl/dorado_disaster_recovery/ddr_base.py @@ -86,6 +86,10 @@ class DoradoDisasterRecoveryBase(object): self.connected_nodes = [] self.__init_globals() self.backup_open_key = DoradoDisasterRecoveryConstants.BACKUP_OPEN % user + self.remote_replication_pairs_input_message = \ + "Correctly configure \"Remote Replication Pairs\", " \ + "and ensure that the corresponding device files have appropriate permissions.\n" \ + "Ready to move on (yes/no)? " def __init_globals(self): self.cluster_info = dbClusterInfo() @@ -1979,6 +1983,22 @@ class DoradoDisasterRecoveryBase(object): ErrorCode.GAUSS_516["GAUSS_51659"] % "gs_ctl query") self.logger.debug("Successfully wait for gs_ctl query status become Normal.", "constant") + def check_input(self, msg_print): + flag = input(msg_print) + count_f = 2 + while count_f: + if ( + flag.upper() != "YES" + and flag.upper() != "NO" + and flag.upper() != "Y" and flag.upper() != "N"): + count_f -= 1 + flag = input("Please type 'yes' or 'no': ") + continue + break + if flag.upper() != "YES" and flag.upper() != "Y": + self.logger.exitWithError( + ErrorCode.GAUSS_358["GAUSS_35805"] % flag.upper()) + def dorado_failover_single_inst(self, dorado_disaster_step, action_flag=None): """ dorado disaster recovery failover for single_inst cluster @@ -1996,18 +2016,8 @@ class DoradoDisasterRecoveryBase(object): self.parse_cluster_status() self.stop_cluster() self.write_dorado_step("0_dorado_disaster_stop_cluster_for_failover") - flag_file = os.path.join(self.dorado_file_dir, "remote_replication_pairs_done") - if os.path.exists(flag_file): - self.logger.log("Delete file %s." 
% flag_file) - os.remove(flag_file) - self.logger.log(self.remote_replication_pairs_log_message % flag_file) - sys.exit(0) if dorado_disaster_step < 1: - # 标志文件存在,检查远程复制的lun设备权限,更新进度,代表 "远程复制Pair"任务完成 - flag_file = os.path.join(self.dorado_file_dir, "remote_replication_pairs_done") - if not os.path.exists(flag_file): - self.logger.log(self.remote_replication_pairs_log_message % flag_file) - sys.exit(0) + self.check_input(self.remote_replication_pairs_input_message) self.write_dorado_step("1_set_remote_replication_pairs_for_failover") self._failover_config_step(dorado_disaster_step, action_flag) self._failover_start_step(dorado_disaster_step, action_flag) @@ -2021,17 +2031,12 @@ class DoradoDisasterRecoveryBase(object): self.update_dorado_info(DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER, "80%") else: self.update_dorado_info(DoradoDisasterRecoveryConstants.ACTION_FAILOVER, "80%") - self.remove_cluster_maintance_file_for_switchover() - self.remove_cluster_maintance_file() self.start_cluster() self.write_dorado_step("3_start_cluster_done") if dorado_disaster_step < 4: cluster_normal_status = [DefaultValue.CLUSTER_STATUS_NORMAL] self.check_cluster_status(cluster_normal_status, check_current=True) if action_flag != DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER: - # 没有流复制信息检查 - # self.check_dorado_datanode_query_info(timeout=30, - # dorado_switchover="disaster_failover") self.update_dorado_info(DoradoDisasterRecoveryConstants.ACTION_FAILOVER, "100%") self.update_dorado_info("cluster", "normal") else: diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_switchover.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_switchover.py index 56f7e41b..ee08e2b0 100644 --- a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_switchover.py +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_switchover.py @@ -39,10 +39,6 @@ from impl.dorado_disaster_recovery.ddr_constants import DoradoDisasterRecoveryCo class DisasterRecoverySwitchoverHandler(DoradoDisasterRecoveryBase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.remote_replication_pairs_log_message = \ - "Please configure \"Remote Replication Pairs\" correctly on "\ - "And check and grant appropriate permissions to the corresponding device files.\n"\ - "Create file \"%s\" to mark the completion of the above operations and execute the tool again." def run(self): """ @@ -76,9 +72,9 @@ class DisasterRecoverySwitchoverHandler(DoradoDisasterRecoveryBase): self.create_cluster_maintance_file("dorado switchover") self.update_dorado_info("cluster", DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER) dorado_disaster_step = self.query_dorado_step() + if dorado_disaster_step < 1: + self.check_switchover_workable() if self.params.mode == "primary": - # 这里可以等待 “Remote Copy Pairs” 同步状态完成 - # self.dorado_failover_single_inst(dorado_disaster_step, DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER) else: @@ -86,21 +82,12 @@ class DisasterRecoverySwitchoverHandler(DoradoDisasterRecoveryBase): try: if dorado_disaster_step < 1: self.update_dorado_info(DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER, "10%") - #self.check_switchover_workable() self.stop_cluster() self.write_dorado_step("1_dorado_disaster_stop_cluster_for_switchover") - flag_file = os.path.join(self.dorado_file_dir, "remote_replication_pairs_done") - if os.path.exists(flag_file): - self.logger.log("Delete file %s." 
% flag_file) - os.remove(flag_file) - self.logger.log(self.remote_replication_pairs_log_message % flag_file) - sys.exit(0) if dorado_disaster_step < 2: self.update_dorado_info(DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER, "30%") flag_file = os.path.join(self.dorado_file_dir, "remote_replication_pairs_done") - if not os.path.exists(flag_file): - self.logger.log(self.remote_replication_pairs_log_message % flag_file) - sys.exit(0) + self.check_input(self.remote_replication_pairs_input_message) self.write_dorado_step("2_set_remote_replication_pairs_for_switchover") if dorado_disaster_step < 3: self.set_datanode_guc("cluster_run_mode", "cluster_standby", "set") @@ -124,12 +111,12 @@ class DisasterRecoverySwitchoverHandler(DoradoDisasterRecoveryBase): " \n%s" % str(error)) rollback_step = self.query_dorado_step() self.logger.debug("Roll back switchover step:%s" % rollback_step) - #self.remove_cluster_maintance_file_for_switchover() - #self.remove_cluster_maintance_file() - #if rollback_step >= 2: + # self.remove_cluster_maintance_file_for_switchover() + # self.remove_cluster_maintance_file() + # if rollback_step >= 2: # self.dorado_switchover_roll_back(update_query=True) - #self.clean_step_file() - #self.clean_flag_file() + # self.clean_step_file() + # self.clean_flag_file() raise Exception(error) self.remove_hadr_switchover_process_file() @@ -405,9 +392,9 @@ class DisasterRecoverySwitchoverHandler(DoradoDisasterRecoveryBase): if update_query: self.update_dorado_info("cluster", "archive") self.logger.log("Please restore the original \"Remote Replication Pairs\" correctly on " - "the storage management interface.\n" - "And check and grant appropriate permissions to the corresponding device files.\n" - "After completing these steps, start the cluster manually !") + "the storage management interface.\n" + "And check and grant appropriate permissions to the corresponding device files.\n" + "After completing these steps, start the cluster manually !") self.logger.log("Successfully Roll back dorado disaster cluster switchover.") def check_streaming_disaster_switchover_barrier(self): -- Gitee From ff8886de531aeb00ceb44c8939ade6127804b284 Mon Sep 17 00:00:00 2001 From: chuanglichuangwai Date: Fri, 18 Aug 2023 21:41:04 +0800 Subject: [PATCH 18/23] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=AF=B9=E6=B5=81?= =?UTF-8?q?=E5=A4=8D=E5=88=B6=E4=BF=A1=E6=81=AF=E7=9A=84=E5=A2=9E=E5=BC=BA?= =?UTF-8?q?=E5=88=A4=E6=96=AD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- script/impl/dorado_disaster_recovery/ddr_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/script/impl/dorado_disaster_recovery/ddr_base.py b/script/impl/dorado_disaster_recovery/ddr_base.py index b4c288fc..f841a42d 100644 --- a/script/impl/dorado_disaster_recovery/ddr_base.py +++ b/script/impl/dorado_disaster_recovery/ddr_base.py @@ -1919,14 +1919,14 @@ class DoradoDisasterRecoveryBase(object): (len(localRole) != 2 or localRole[0] != "Primary" or localRole[1] != "Primary") or \ (len(peerRole) != 1 or peerRole[0] != "StandbyCluster_Standby") or \ (len(peerState) != 1 or peerState[0] != "Normal") or \ - (len(channel) != 1 or "-->" not in channel[0]): + (len(channel) != 1 or "-->" not in channel[0] or len(channel[0]) <= 30): check_ok = -1 elif state == "Main Standby": if (len(dbState) != 1 or dbState[0] != "Normal") or \ (len(localRole) != 2 or localRole[0] != "Main Standby" or localRole[1] != "Standby") or \ (len(peerRole) != 1 or peerRole[0] != "Primary") or \ (len(peerState) != 1 or 
peerState[0] != "Normal") or \ - (len(channel) != 1 or "<--" not in channel[0]): + (len(channel) != 1 or "<--" not in channel[0] or len(channel[0]) <= 30): check_ok = -1 elif state == "Standby": if (len(dbState) != 1 or dbState[0] != "Normal") or \ -- Gitee From 094bc580d80994fe568b8450b319e7bfb8880a03 Mon Sep 17 00:00:00 2001 From: Hao Date: Mon, 21 Aug 2023 01:39:44 +0800 Subject: [PATCH 19/23] update gs_ddr --- script/gs_ddr | 4 +- .../impl/dorado_disaster_recovery/ddr_base.py | 579 +----------------- .../dorado_disaster_recovery/ddr_constants.py | 10 +- .../dorado_disaster_recovery_failover.py | 6 +- .../dorado_disaster_recovery_query.py | 72 +-- .../dorado_disaster_recovery_start.py | 20 - .../dorado_disaster_recovery_switchover.py | 8 +- .../params_handler.py | 22 +- 8 files changed, 61 insertions(+), 660 deletions(-) diff --git a/script/gs_ddr b/script/gs_ddr index 3fb5047f..6b6689db 100644 --- a/script/gs_ddr +++ b/script/gs_ddr @@ -39,14 +39,14 @@ from impl.dorado_disaster_recovery.ddr_modules.\ from impl.dorado_disaster_recovery.ddr_modules.\ dorado_disaster_recovery_switchover import DisasterRecoverySwitchoverHandler from impl.dorado_disaster_recovery.ddr_modules.\ - dorado_disaster_recovery_query import StreamingQueryHandler + dorado_disaster_recovery_query import DoradoQueryHandler HANDLER_MAPPING = { "start": DisasterRecoveryStartHandler, "stop": DisasterRecoveryStopHandler, "switchover": DisasterRecoverySwitchoverHandler, "failover": DisasterRecoveryFailoverHandler, - #"query": StreamingQueryHandler + "query": DoradoQueryHandler } diff --git a/script/impl/dorado_disaster_recovery/ddr_base.py b/script/impl/dorado_disaster_recovery/ddr_base.py index f841a42d..5e0064a2 100644 --- a/script/impl/dorado_disaster_recovery/ddr_base.py +++ b/script/impl/dorado_disaster_recovery/ddr_base.py @@ -16,7 +16,7 @@ # MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. # See the Mulan PSL v2 for more details. # ---------------------------------------------------------------------------- -# Description : streaming_base.py is a base module for streaming disaster recovery. +# Description : ddr_base.py is a base module for dorado disaster recovery. ############################################################################# import json import os @@ -104,7 +104,7 @@ class DoradoDisasterRecoveryBase(object): self.cluster_node_names = self.cluster_info.getClusterNodeNames() self.dorado_file_dir = os.path.join(self.pg_host, DoradoDisasterRecoveryConstants.DDR_FILES_DIR) self.dorado_xml = os.path.join(self.dorado_file_dir, - DoradoDisasterRecoveryConstants.STREAMING_CONFIG_XML) + DoradoDisasterRecoveryConstants.DDR_CONFIG_XML) self.ssh_tool = SshTool(self.cluster_node_names, self.log_file) self.mpp_file = EnvUtil.getMpprcFile() self.dss_home_dir = self.cluster_info.dss_home @@ -169,7 +169,7 @@ class DoradoDisasterRecoveryBase(object): def handle_lock_file(self, trace_id, action): """ - Create lock file for other streaming process. + Create lock file for other dorado process. 
""" if self.params.task not in DoradoDisasterRecoveryConstants.TASK_EXIST_CHECK: return @@ -187,7 +187,7 @@ class DoradoDisasterRecoveryBase(object): def check_parallel_process_is_running(self): """ - Check streaming process is running + Check dorado process is running """ hostnames = ' -H '.join(self.cluster_node_names) file_path = os.path.join(self.pg_host, DoradoDisasterRecoveryConstants.PROCESS_LOCK_FILE) @@ -221,87 +221,6 @@ class DoradoDisasterRecoveryBase(object): self.ssh_tool.executeCommand(cmd) self.logger.debug("Successfully create dir [%s] on all nodes." % dir_path) - def check_hadr_pwd(self, only_mode=None): - """ - Check hadr pwd is correct or not - """ - if only_mode and self.params.mode != only_mode: - self.logger.debug("Checking hadr user is not for mode:%s." % self.params.mode) - return - self.logger.debug("Start checking disaster user password.") - sql = "select 1;" - primary_dns = [dn_inst for db_node in self.cluster_info.dbNodes for dn_inst in - db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] - if not primary_dns: - raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] - % "obtain primary dn when check disaster user") - status, output = ClusterCommand.remoteSQLCommand( - sql, self.user, primary_dns[0].hostname, primary_dns[0].port, False, - user_name=self.params.hadrUserName, user_pwd=self.params.hadrUserPassword) - if status != 0: - if "Invalid username/password" in output: - self.logger.debug("Logging denied, please check your password.") - self.logger.logExit(ErrorCode.GAUSS_516['GAUSS_51632'] - % "check disaster user password") - self.logger.debug("Successfully check disaster user password.") - - def check_hadr_user(self, only_mode=None): - """ - Check hadr user is exist - """ - if only_mode and self.params.mode != only_mode: - self.logger.debug("Checking hadr user is not for mode:%s." 
% self.params.mode) - return - self.logger.log("Start checking disaster recovery user.") - sql = "select usename, userepl from pg_user;" - primary_dns = [dn_inst for db_node in self.cluster_info.dbNodes for dn_inst in - db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] - if not primary_dns: - raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] - % "obtain primary dn when check disaster user") - status, output = ClusterCommand.remoteSQLCommand( - sql, self.user, primary_dns[0].hostname, primary_dns[0].port, True) - if status != 0: - raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] - % "execute sql for checking disaster user.") - user_dict = {user_info.split('|')[0].strip(): user_info.split('|')[-1].strip() - for user_info in output.strip().split('\n')} - for user_name, repl in user_dict.items(): - if user_name == self.params.hadrUserName and repl == 't': - self.logger.log("Successfully check disaster recovery user.") - return - msg = ErrorCode.GAUSS_516['GAUSS_51632'] % 'checking disaster user, please confirm ' \ - 'disaster user is exist and with ' \ - 'replication role' - self.logger.logExit(msg + "Users:%s" % user_dict) - - def __copy_hadr_user_key(self, secure_dir_path, update=False): - """ - Copy hadr.key.cipher and hadr.key.rand - """ - self.logger.log("Start copy hadr user key files.") - hadr_cipher_path = os.path.join(self.bin_path, "hadr.key.cipher") - hadr_rand_path = os.path.join(self.bin_path, "hadr.key.rand") - secure_cipher_path = os.path.join(secure_dir_path, "hadr.key.cipher") - secure_rand_path = os.path.join(secure_dir_path, "hadr.key.rand") - if not update: - if (not os.path.isfile(hadr_cipher_path)) or (not os.path.isfile(hadr_rand_path)): - self.logger.debug("Not found hadr user key, no need to copy.") - return - FileUtil.cpFile(hadr_cipher_path, secure_cipher_path, cmd_type="shell") - FileUtil.cpFile(hadr_rand_path, secure_rand_path, cmd_type="shell") - self.logger.debug("Successfully copy hadr key files into temp secure dir.") - else: - if (not os.path.isfile(secure_cipher_path)) or (not os.path.isfile(secure_rand_path)): - self.logger.debug("Not found hadr user key, no need to update.") - return - host_names = self.get_all_connection_node_name("update_hadr_key") - self.ssh_tool.scpFiles(secure_cipher_path, self.bin_path, hostList=host_names) - self.ssh_tool.scpFiles(secure_rand_path, self.bin_path, hostList=host_names) - FileUtil.removeFile(secure_cipher_path) - FileUtil.removeFile(secure_rand_path) - self.logger.debug("Finished copy hadr key files to nodes:%s." % host_names) - def remove_secure_dir(self, dir_path, host_name): """ Remove gs_secure_files dir in PGDATA @@ -314,100 +233,6 @@ class DoradoDisasterRecoveryBase(object): if status != 0: raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + " Error: \n%s " % output) - def __stream_copy_file_to_all_dn(self, temp_secure_dir_path): - """ - copy key file dir to all dn dir - """ - dn_infos = DefaultValue.get_dn_info(self.cluster_info) - self.logger.debug("Got dns:%s" % dn_infos) - copy_succeed = 0 - host_names = self.get_all_connection_node_name("copy gs_secure_files to dns") - for dn_info in dn_infos: - if dn_info["host_name"] not in host_names: - continue - self.logger.debug("Copy disaster recovery secure files to inst[%s][%s][%s]." 
% - (dn_info['id'], dn_info['data_dir'], dn_info['host_name'])) - try: - self.remove_secure_dir(dn_info['data_dir'], dn_info['host_name']) - self.ssh_tool.scpFiles( - temp_secure_dir_path, dn_info['data_dir'], [dn_info['host_name']]) - copy_succeed += 1 - except Exception as error: - self.logger.debug("Failed copy secure files to inst[%s][%s][%s],error:%s." % - (dn_info['id'], dn_info['data_dir'], dn_info['host_name'], - str(error))) - if copy_succeed == 0: - raise Exception( - ErrorCode.GAUSS_516["GAUSS_51632"] % "copy secure dir to all dn data dir") - self.logger.log("Successfully copy secure files.") - - def __prepare_cluster_user_record(self, temp_secure_dir_path): - """ - Save cluster user record - """ - cluster_user_record = os.path.join(temp_secure_dir_path, - DoradoDisasterRecoveryConstants.CLUSTER_USER_RECORD) - DefaultValue.write_content_on_file(cluster_user_record, self.user) - self.logger.debug("Record current cluster user:%s." % self.user) - - def prepare_gs_secure_files(self, only_mode=None): - """ - Prepare gs_secure_files on primary cluster - """ - if only_mode and self.params.mode != only_mode: - self.logger.debug("Prepare gs_secure_files is not for mode:%s." % self.params.mode) - return - self.logger.log("Start prepare secure files.") - secure_dir_name = DoradoDisasterRecoveryConstants.GS_SECURE_FILES - temp_secure_dir_path = os.path.realpath( - os.path.join(self.dorado_file_dir, secure_dir_name)) - if os.path.isdir(temp_secure_dir_path): - self.logger.debug("Secure file dir exist, cleaning...") - FileUtil.removeDirectory(temp_secure_dir_path) - FileUtil.createDirectory(temp_secure_dir_path, True, DefaultValue.KEY_DIRECTORY_MODE) - if os.path.isdir(temp_secure_dir_path): - self.logger.debug("Successfully create secure file dir.") - version_file_path = os.path.realpath(os.path.join(self.gp_home, "version.cfg")) - FileUtil.cpFile(version_file_path, temp_secure_dir_path) - self.__prepare_cluster_user_record(temp_secure_dir_path) - self.__copy_hadr_user_key(temp_secure_dir_path, update=False) - self.__stream_copy_file_to_all_dn(temp_secure_dir_path) - FileUtil.removeDirectory(temp_secure_dir_path) - - def stream_clean_gs_secure(self, params): - """ - clean gs secure dir - """ - inst, file_path = params - self.logger.debug("Starting clean instance %s gs secure dir." % inst.instanceId) - cmd = "source %s && pssh -s -H %s 'if [ -d %s ]; then rm -rf %s; fi'" \ - % (self.mpp_file, inst.hostname, file_path, file_path) - status, output = CmdUtil.retryGetstatusoutput(cmd) - if status != 0: - self.logger.debug("Clean gs secure dir for instance [%s] result:%s." % - (inst.instanceId, output)) - self.logger.debug("Successfully clean instance %s gs secure dir." % inst.instanceId) - - def clean_gs_secure_dir(self, only_mode=None): - """ - Clean gs secure dir if exist - """ - if only_mode and self.params.mode != only_mode: - self.logger.debug("Clean gs_secure_files is not for mode:%s." 
% self.params.mode) - return - self.logger.debug("Start clean gs secure dir.") - params = [] - for node in self.cluster_info.dbNodes: - for inst in node.datanodes: - if inst.hostname not in self.connected_nodes: - continue - file_path = os.path.realpath(os.path.join( - inst.datadir, DoradoDisasterRecoveryConstants.GS_SECURE_FILES)) - params.append((inst, file_path)) - if params: - parallelTool.parallelExecute(self.stream_clean_gs_secure, params) - self.logger.debug("Finished clean gs secure dir.") - def remove_dorado_dir(self, dir_path): """ Remove dorado files dir @@ -434,7 +259,7 @@ class DoradoDisasterRecoveryBase(object): def write_dorado_step(self, step): """ - write streaming step + write dorado step :return: NA """ self.logger.debug("Dorado disaster recovery action:[%s] record current step:[%s]" @@ -571,7 +396,7 @@ class DoradoDisasterRecoveryBase(object): def check_cluster_status(self, status_allowed, only_check=False, check_current=False, is_log=True): """ - Stream disaster cluster switch to check cluster status + Dorado disaster cluster switch to check cluster status """ cluster_status = self.cluster_status if check_current: @@ -668,7 +493,7 @@ class DoradoDisasterRecoveryBase(object): def restore_guc_params(self): """ - Restore guc params in .streaming_guc_backup + Restore guc params in .dorado_guc_backup """ self.logger.debug("Start restore guc params.") guc_backup_file = os.path.join(self.dorado_file_dir, DoradoDisasterRecoveryConstants.GUC_BACKUP_FILE) @@ -694,18 +519,6 @@ class DoradoDisasterRecoveryBase(object): inst_type="dn", raise_error=False) restored_keys.append(guc_key) - def set_most_available(self, mode='set', inst_type='dn', raise_error=True): - dn_insts = [dn_inst for db_node in self.cluster_info.dbNodes - for dn_inst in db_node.datanodes if int(dn_inst.mirrorId) == 1] - if len(dn_insts) > 2: - self.logger.debug("No need set most available for current cluster.") - return - self.__set_guc_param("most_available_sync", "on", mode=mode, - inst_type=inst_type, raise_error=raise_error) - - self.__set_guc_param("synchronous_commit", "on", mode=mode, - inst_type=inst_type, raise_error=raise_error) - def __set_guc_param(self, key, value, mode='set', inst_type='dn', raise_error=True): """ Set guc param @@ -798,24 +611,6 @@ class DoradoDisasterRecoveryBase(object): self.logger.debug("Successfully [%s] shardNum [%s] node [%s] wal_keep_segments " "value [%s]." % (opt_type, inst.mirrorId, inst.hostname, value)) - def __set_dn_xlog_file_path(self, params_list): - """ - Set xlog_file_path value in primary dn - """ - (inst, opt_type, value, mpprc_file) = params_list - self.logger.debug("Start [%s] shardNum [%s] node [%s] wal_keep_segments value [%s]." - % (opt_type, inst.mirrorId, inst.hostname, value)) - cmd = "source %s; gs_guc %s " \ - "-N %s -D %s -c \"xlog_file_path = '%s'\" " % \ - (mpprc_file, opt_type, inst.node, inst.datadir, value) - status, output = CmdUtil.retryGetstatusoutput(cmd) - if status != 0: - raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + - "Options:%s, Error: \n%s " - % ("set xlog_file_path for inst:%s" % inst.instanceId, str(output))) - self.logger.debug("Successfully [%s] shardNum [%s] node [%s] wal_keep_segments " - "value [%s]." 
% (opt_type, inst.mirrorId, inst.hostname, value)) - def set_wal_keep_segments(self, opt_type, value, restore_flag=False, only_mode=None): """ guc set wal_keep_segments value in primary dn @@ -950,7 +745,7 @@ class DoradoDisasterRecoveryBase(object): """ self.logger.debug("Start stop node:%s" % node_id) cmd = ClusterCommand.getStopCmd(int(node_id), "i", 1800) - self.logger.debug("Streaming disaster calling cm_ctl to stop cluster, cmd=[%s]" % cmd) + self.logger.debug("dorado disaster calling cm_ctl to stop cluster, cmd=[%s]" % cmd) status, output = CmdUtil.retryGetstatusoutput(cmd) if status != 0: self.logger.debug("Failed stop node:%s, error:%s" % (node_id, output)) @@ -972,7 +767,7 @@ class DoradoDisasterRecoveryBase(object): % (static_config + " or " + cm_ctl_file)) node_id_list = list(set([instance.nodeId for instance in self.normal_instances])) parallelTool.parallelExecute(self.__stop_one_node, node_id_list) - self.logger.log("Successfully stopped the cluster by node for streaming cluster.") + self.logger.log("Successfully stopped the cluster by node for dorado cluster.") def get_all_connection_node_name(self, action_flag="", no_update=True): """ @@ -1332,76 +1127,6 @@ class DoradoDisasterRecoveryBase(object): time.sleep(5) self.logger.log("Main standby already connected.") - def hadr_key_generator(self, key_name): - """ - Generate key_name.key.cipher & key_name.key.rand - """ - self.logger.log("Start generate hadr key files.") - if not os.path.exists(self.bin_path): - msg = ErrorCode.GAUSS_516["GAUSS_51632"] % "obtain bin path." - self.logger.debug(msg) - raise Exception(msg) - if not os.path.exists(self.gp_home): - msg = ErrorCode.GAUSS_516["GAUSS_51632"] % "obtain env GPHOME" - self.logger.debug(msg) - raise Exception(msg) - key_cipher = os.path.join(self.bin_path, "%s.key.cipher" % key_name) - key_rand = os.path.join(self.bin_path, "%s.key.rand" % key_name) - cmd = "export LD_LIBRARY_PATH=%s/script/gspylib/clib && source %s " \ - "&& gs_guc generate -S default -o %s -D '%s' && %s && %s" \ - % (self.gp_home, self.mpp_file, key_name, self.bin_path, - CmdUtil.getChmodCmd(str(ConstantsBase.KEY_FILE_MODE), key_cipher), - CmdUtil.getChmodCmd(str(ConstantsBase.KEY_FILE_MODE), key_rand)) - if (not os.path.isfile(key_cipher)) or (not os.path.isfile(key_rand)): - status, output = CmdUtil.retryGetstatusoutput(cmd) - if status != 0 or (not os.path.isfile(key_cipher)) \ - or (not os.path.isfile(key_rand)): - msg = ErrorCode.GAUSS_516["GAUSS_51632"] \ - % "generate hadr key files" + "Error:%s" % output - self.logger.error(msg) - raise Exception(msg) - else: - self.logger.log("Streaming key files already exist.") - - self.ssh_tool.scpFiles(key_cipher, self.bin_path) - self.ssh_tool.scpFiles(key_rand, self.bin_path) - self.logger.log("Finished generate and distribute hadr key files.") - - def encrypt_hadr_user_info(self, key_name, hadr_user, hadr_pwd): - """ - Encrypt hadr user info. 
- """ - self.logger.log("Start encrypt hadr user info.") - cmd = "source %s && gs_encrypt -f %s \"%s|%s\"" \ - % (self.mpp_file, key_name, hadr_user, hadr_pwd) - status, output = CmdUtil.retryGetstatusoutput(cmd) - if status != 0 or not output: - msg = ErrorCode.GAUSS_516["GAUSS_51632"] % "encrypt hadr user info" - self.logger.error(msg) - raise Exception(msg) - self.logger.log("Successfully encrypt hadr user info.") - return output - - def keep_hadr_user_info(self, info_str, retry=5): - """ - Keep hadr user info into GLOBAL CONFIGURATION - """ - self.logger.log("Start save hadr user info into database.") - sql = "ALTER GLOBAL CONFIGURATION with(hadr_user_info ='%s');" % info_str - primary_dns = [dn_inst for db_node in self.cluster_info.dbNodes for dn_inst in - db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] - primary_dns = primary_dns * retry - output = "None" - for dn_inst in primary_dns: - status, output = ClusterCommand.remoteSQLCommand( - sql, self.user, dn_inst.hostname, dn_inst.port, True) - if status == 0: - self.logger.log("Successfully save hadr user info into database.") - return - msg = ErrorCode.GAUSS_516['GAUSS_51632'] % "save hadr user info into database" - self.logger.error(msg + "Error:%s" % SensitiveMask.mask_pwd(output)) - raise Exception(msg) - def restore_wal_keep_segments(self, only_mode=None): """ restore wal_keep_segments default value @@ -1455,7 +1180,7 @@ class DoradoDisasterRecoveryBase(object): """ Check action and mode if step file exist. if any dorado options not finished(step file exist), - not allowed doing any other streaming options except query. + not allowed doing any other dorado options except query. """ self.logger.debug("Checking action and mode.") exist_step_file_names = [] @@ -1471,7 +1196,7 @@ class DoradoDisasterRecoveryBase(object): % "check action and mode, the step files %s already exist, " "please ensure the action %s is finished before " "doing current options" % (exist_step_file_names, exist_action)) - self.logger.debug("Successfully checked action and mode.") + self.logger.debug("clean_global_configSuccessfully checked action and mode.") def clean_dorado_dir(self): """ @@ -1487,24 +1212,6 @@ class DoradoDisasterRecoveryBase(object): "Failed to remove dorado dir with error:%s" % error) self.logger.log("Finished remove dorado dir.") - def clean_global_config(self): - """ - Clean global config - """ - self.logger.log("Clean hadr user info.") - sql = "DROP GLOBAL CONFIGURATION hadr_user_info;" - primary_dns = [dn_inst for db_node in self.cluster_info.dbNodes for dn_inst in - db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] - output = "None" - for dn_inst in primary_dns: - status, output = ClusterCommand.remoteSQLCommand( - sql, self.user, dn_inst.hostname, dn_inst.port, True) - if status == 0: - self.logger.log("Successfully clean hadr user info from database.") - return - msg = ErrorCode.GAUSS_516['GAUSS_51632'] % "clean hadr user info from database" - self.logger.debug(msg + "Error:%s" % SensitiveMask.mask_pwd(output)) - def get_build_info(self): """ Assemble build infos @@ -1538,116 +1245,6 @@ class DoradoDisasterRecoveryBase(object): self.logger.debug("Successfully get remote dn info:%s." 
% remote_ip_port) return dn_inst_info, remote_ip_port - def build_file_from_remote(self): - """ - Build files from remote cluster - """ - local_dn_info, remote_ip_port = self.get_build_info() - cmd_local = 'source %s; %s/gs_ctl build -D %s -M standby -b copy_secure_files -Z datanode' \ - ' -U %s -P "%s" -C "localhost=%s localport=%s remotehost=%s remoteport=%s"' - cmd_remote = "echo \"source %s; %s/gs_ctl build -D %s -M standby -b copy_secure_files -Z " \ - "datanode -U %s -P '%s' -C 'localhost=%s localport=%s " \ - "remotehost=%s remoteport=%s'\"" \ - " | pssh -s -H %s" - - end_time = datetime.now() + timedelta(seconds=self.params.waitingTimeout) - self.logger.debug("Retry Building with timeout:%ss." % self.params.waitingTimeout) - succeed = False - while datetime.now() < end_time: - for local_primary in local_dn_info: - for remote_ip, remote_port in remote_ip_port: - if local_primary["host_name"] == NetUtil.GetHostIpOrName(): - cmd = cmd_local % (self.mpp_file, "%s/bin" % self.gauss_home, - local_primary["data_dir"], - self.params.hadrUserName, self.params.hadrUserPassword, - local_primary["listen_ip"], local_primary["port"], - remote_ip, remote_port) - else: - cmd = cmd_remote % (self.mpp_file, "%s/bin" % self.gauss_home, - local_primary["data_dir"], - self.params.hadrUserName, self.params.hadrUserPassword, - local_primary["listen_ip"], local_primary["port"], - remote_ip, remote_port, local_primary["host_name"]) - result = DefaultValue.fast_ping_on_node(local_primary["host_name"], - local_primary["listen_ip"], - remote_ip, self.logger) - if not result[-1]: - self.logger.debug("Ignore build from %s, ping result:%s" - % (remote_ip, result[-1])) - continue - if self.cluster_info.isSingleInstCluster(): - cmd = cmd.replace(" -Z datanode", "") - self.logger.debug("Building with cmd:%s." - % cmd.replace(self.params.hadrUserPassword, "***")) - status, output = CmdUtil.getstatusoutput_by_fast_popen(cmd) - if status == 0: - succeed = True - self.logger.debug("Successfully Building with cmd:%s." - % cmd.replace(self.params.hadrUserPassword, "***")) - return succeed - else: - self.logger.debug("Building result:%s." 
% SensitiveMask.mask_pwd(output)) - time.sleep(1) - return succeed - - def __copy_secure_dir_from_dn_dir(self): - """ - Find and copy key file dir from all dn dir - """ - local_temp_secure_path = os.path.join( - self.dorado_file_dir, DoradoDisasterRecoveryConstants.GS_SECURE_FILES) - if os.path.isdir(local_temp_secure_path): - FileUtil.removeDirectory(local_temp_secure_path) - rand_path = os.path.join(local_temp_secure_path, DoradoDisasterRecoveryConstants.HADR_KEY_RAND) - cipher_path = os.path.join(local_temp_secure_path, DoradoDisasterRecoveryConstants.HADR_KEY_CIPHER) - cmd_tep = "echo \"if [ -d '%s' ];then source %s && pscp --trace-id %s -H %s '%s' '%s' " \ - "&& rm -rf '%s';fi\" | pssh -s -H %s" - succeed = False - for db_node in self.cluster_info.dbNodes: - for dn_inst in db_node.datanodes: - if int(dn_inst.mirrorId) == 1: - key_file_path = os.path.realpath(os.path.join( - dn_inst.datadir, DoradoDisasterRecoveryConstants.GS_SECURE_FILES)) - cmd_copy_dir = cmd_tep % (key_file_path, self.mpp_file, self.trace_id, - self.local_host, key_file_path, - self.dorado_file_dir, - key_file_path, dn_inst.hostname) - status, output = CmdUtil.getstatusoutput_by_fast_popen(cmd_copy_dir) - self.logger.debug("Copy cmd:%s" % cmd_copy_dir) - if status != 0: - self.logger.debug("Try copy secure dir from:[%s][%s], error:%s" % ( - dn_inst.hostname, key_file_path, output)) - if os.path.isdir(local_temp_secure_path) and os.path.isfile(rand_path) \ - and os.path.isfile(cipher_path): - succeed = True - if not succeed: - raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "copy secure file dir") - self.logger.debug("Successfully copy secure dir, file list:%s." % - os.listdir(local_temp_secure_path)) - - def build_and_distribute_key_files(self, only_mode=None): - """ - Distribute key files - """ - if only_mode and self.params.mode != only_mode: - self.logger.debug("Wal keep segment opts not for mode:%s." % self.params.mode) - return - self.logger.log("Start build key files from remote cluster.") - # build file - if not self.build_file_from_remote(): - raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] % "build files from cluster") - # copy file from data dir to streaming dir - self.__copy_secure_dir_from_dn_dir() - # check version consistency - self.__check_version_file() - # check cluster user consistency - self.__check_cluster_user() - # distribute key files to all node - secure_dir_path = os.path.join(self.dorado_file_dir, DoradoDisasterRecoveryConstants.GS_SECURE_FILES) - self.__copy_hadr_user_key(secure_dir_path, update=True) - FileUtil.removeDirectory(secure_dir_path) - self.logger.log("Successfully build and distribute key files to all nodes.") - def __check_version_file(self): """ function: Check whether the version numbers of the host @@ -1755,7 +1352,7 @@ class DoradoDisasterRecoveryBase(object): def remove_pg_hba(self, ignore_error=False): """ - Remove remote ips from pg hba of streaming disaster + Remove remote ips from pg hba of dorado disaster """ self.logger.log("Start remove pg_hba config.") remove_ips = [] @@ -1783,65 +1380,6 @@ class DoradoDisasterRecoveryBase(object): raise error self.logger.log("Finished remove pg_hba config.") - def streaming_drop_replication_slot(self, dn_inst, drop_slots): - """ - Delete dn_xxx_hadr on all dn nodes if dn_xxx_hadr exists when the disaster tolerance - relationship is lifted - """ - if not drop_slots: - self.logger.debug("WARNING:Not found dn_xxx_hadr on %s node, No need to " - "delete." 
% dn_inst.instanceId) - else: - for slot in drop_slots: - self.logger.debug("starting drop inst %s %s" % (dn_inst.instanceId, slot.strip())) - sql = "select * from pg_catalog.pg_drop_replication_slot('%s');" % slot.strip() - status_dr, output_dr = ClusterCommand.remoteSQLCommand( - sql, self.user, dn_inst.hostname, dn_inst.port, maintenance_mode=True) - self.logger.debug("get %s need drop replication_slots, status=%d, " - "output: %s." % (dn_inst.hostname, status_dr, - SensitiveMask.mask_pwd(output_dr))) - if status_dr != 0: - self.logger.debug("Failed to remove inst %s %s with error: %s" % ( - dn_inst.instanceId, slot.strip(), output_dr)) - self.logger.debug( - "Successfully drop node %s %s" % (dn_inst.instanceId, slot.strip())) - - def concurrent_drop_slot(self, dn_inst): - """ - concurrent drop all dn replication slots - """ - sql_check = "select * from pg_catalog.pg_get_replication_slots();" - self.logger.debug("Starting concurrent drop node %s instance [%s] replication slots" % - (dn_inst.hostname, dn_inst.instanceId)) - status, output = ClusterCommand.remoteSQLCommand( - sql_check, self.user, dn_inst.hostname, dn_inst.port, maintenance_mode=True) - self.logger.debug("get %s all replication slots, status=%d, output: %s." % - (dn_inst.instanceId, status, SensitiveMask.mask_pwd(output))) - if status == 0 and output.strip(): - drop_slots = [] - if str(dn_inst.instanceId).startswith("6"): - drop_slots = re.findall(r"dn_\d+_hadr", output.strip()) - if str(dn_inst.instanceId).startswith("5"): - drop_slots = re.findall(r"cn_\d+_\d+\.\d+\.\d+\.\d+_\d+", output.strip()) - self.logger.debug("Waiting to delete instance [%s] replication slots is: %s" % - (dn_inst.instanceId, drop_slots)) - self.streaming_drop_replication_slot(dn_inst, drop_slots) - else: - self.logger.debug("Obtain all replication slot results:%s." 
% output) - - def streaming_clean_replication_slot(self): - """ - Delete dn_xxx_hadr on all dn nodes if dn_xxx_hadr exists when the disaster tolerance - relationship is lifted - """ - self.logger.log("Starting drop all node replication slots") - params = [dn_inst for db_node in self.cluster_info.dbNodes - for dn_inst in db_node.datanodes if dn_inst.instanceId in self.normal_dn_ids] - self.logger.debug("need drop all node replication slots: %s" % - [inst.instanceId for inst in params]) - parallelTool.parallelExecute(self.concurrent_drop_slot, params) - self.logger.log("Finished drop all node replication slots") - def update_dorado_info(self, key, value, only_mode=None): """ Update info for dorado status @@ -2045,40 +1583,6 @@ class DoradoDisasterRecoveryBase(object): self.update_dorado_info(DoradoDisasterRecoveryConstants.ACTION_SWITCHOVER, "100%") self.update_dorado_info("cluster", "archive") - def streaming_clean_archive_slot(self): - """ - drop lot_type is physical and slot_name not contain (gs_roach_full,gs_roach_inc, - cn_xxx,dn_xxx, dn_xxx_hadr) on all cn node and all primary dn node if the - slot_name exists when the disaster cluster become primary cluster - """ - self.logger.debug("Starting drop archive slots") - params = [dn_inst for db_node in self.cluster_info.dbNodes - for dn_inst in db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] - self.logger.debug("need drop all node archive slots: %s" % - [inst.instanceId for inst in params]) - parallelTool.parallelExecute(self.parallel_drop_archive_slot, params) - self.logger.debug("Successfully drop all node archive slots") - - def parallel_drop_archive_slot(self, dn_inst): - """ - concurrent drop all primary dn and all cn archive slots - """ - sql_check = "select slot_name from pg_catalog.pg_get_replication_slots() " \ - "where slot_type='physical' and slot_name not in " \ - "('gs_roach_full', 'gs_roach_inc') and slot_name not like 'cn_%' and " \ - "slot_name not like 'dn_%';" - self.logger.debug("Starting concurrent drop node %s instance [%s] archive slots" % - (dn_inst.hostname, dn_inst.instanceId)) - (status, output) = ClusterCommand.remoteSQLCommand( - sql_check, self.user, dn_inst.hostname, dn_inst.port) - self.logger.debug("get %s all archive slots, status=%d, output: %s." % - (dn_inst.instanceId, status, output)) - if status == 0 and output.strip(): - archive_slots = output.strip().split('\n') - self.logger.debug("Waiting to delete instance [%s] archive slots is: %s" % - (dn_inst.instanceId, archive_slots)) - self.streaming_drop_replication_slot(dn_inst, archive_slots) - def get_specified_dn_infos(self, update=False, dn_status="Primary"): """ @@ -2111,7 +1615,7 @@ class DoradoDisasterRecoveryBase(object): def start_primary_dn(self, params): """ - Start main standby as primary dn in streaming failover. + Start main standby as primary dn in dorado failover. 
""" dn_info, max_term = params opt_type = " -Z datanode" if not self.cluster_info.isSingleInstCluster() else "" @@ -2159,12 +1663,12 @@ class DoradoDisasterRecoveryBase(object): :param guc_type: init type :return: NA """ - self.logger.debug("Starting set cm server for streaming disaster.") + self.logger.debug("Starting set cm server for dorado disaster.") cmd = "source %s && gs_guc %s -Z cmserver -D 'cm_instance_data_path' -c \"%s=%s\" " \ % (self.mpp_file, guc_type, guc_parameter, guc_value) - self.logger.debug("streaming disaster calling set cms, cmd=[%s]" % cmd) + self.logger.debug("dorado disaster calling set cms, cmd=[%s]" % cmd) self.ssh_tool.executeCommand(cmd, hostList=self.normal_cm_ips) - self.logger.debug("Successfully set cm server for streaming disaster.") + self.logger.debug("Successfully set cm server for dorado disaster.") def stream_disaster_set_cmagent_guc(self, guc_parameter, guc_value, guc_type): """ @@ -2174,12 +1678,12 @@ class DoradoDisasterRecoveryBase(object): :param guc_type: init type :return: NA """ - self.logger.debug("Starting set cm agent for streaming disaster.") + self.logger.debug("Starting set cm agent for dorado disaster.") cmd = "source %s && gs_guc %s -Z cmagent -D 'cm_instance_data_path' -c \"%s=%s\" " \ % (self.mpp_file, guc_type, guc_parameter, guc_value) - self.logger.debug("streaming disaster calling set cma, cmd=[%s]" % cmd) + self.logger.debug("dorado disaster calling set cma, cmd=[%s]" % cmd) self.ssh_tool.executeCommand(cmd, hostList=self.normal_node_list) - self.logger.debug("Successfully set cm agent for streaming disaster.") + self.logger.debug("Successfully set cm agent for dorado disaster.") def _failover_config_step(self, dorado_disaster_step, action_flag): """ @@ -2328,51 +1832,6 @@ class DoradoDisasterRecoveryBase(object): return True return False - def set_stream_cluster_run_mode_guc(self, guc_mode, fail_over=False): - """ - function: set cluster run mode guc - :return: - """ - cluster_run_mode = "cluster_primary" if self.params.mode == "primary" \ - else "cluster_standby" - if fail_over: - cluster_run_mode = "cluster_primary" - guc_cmd = "source %s && gs_guc %s -Z datanode -N all -I all -c " \ - "\"stream_cluster_run_mode = '%s'\"" % \ - (self.mpp_file, guc_mode, cluster_run_mode) - host_names = self.cluster_info.getClusterNodeNames() - ignore_node = [node for node in host_names if node not in self.normal_node_list] - if ignore_node: - self.logger.debug( - "WARNING: cluster_run_mode for datanode ignore nodes:%s" % ignore_node) - nodes = ",".join(ignore_node) - guc_cmd = guc_cmd + " --ignore-node %s" % nodes - self.logger.debug("Set dn stream_cluster_run_mode with cmd:%s" % guc_cmd) - (status, output) = CmdUtil.retryGetstatusoutput(guc_cmd) - if status != 0: - self.logger.debug("Warning: Failed %s dn stream_cluster_run_mode=%s, output: %s" % - (guc_mode, cluster_run_mode, str(output))) - else: - self.logger.debug("Successfully %s streaming cluster run mode for " - "datanode param %s" % (guc_mode, cluster_run_mode)) - - guc_cmd_cn = "source %s && gs_guc %s -Z coordinator -N all -I all -c " \ - "\"stream_cluster_run_mode = '%s'\"" % \ - (self.mpp_file, guc_mode, cluster_run_mode) - if ignore_node: - self.logger.debug( - "WARNING: cluster_run_mode for coordinator ignore nodes:%s" % ignore_node) - nodes = ",".join(ignore_node) - guc_cmd_cn = guc_cmd_cn + " --ignore-node %s" % nodes - self.logger.debug("Set cn stream_cluster_run_mode with cmd:%s" % guc_cmd_cn) - (status, output) = CmdUtil.retryGetstatusoutput(guc_cmd_cn) - if status != 0: 
- self.logger.debug("Warning: Failed %s cn stream_cluster_run_mode=%s, output: %s" % - (guc_mode, cluster_run_mode, str(output))) - else: - self.logger.debug("Successfully %s streaming cluster run mode for " - "coordinator param %s" % (guc_mode, cluster_run_mode)) - def set_data_in_dcc(self, key, value, only_mode=None): """ Set data in dcc diff --git a/script/impl/dorado_disaster_recovery/ddr_constants.py b/script/impl/dorado_disaster_recovery/ddr_constants.py index 8469e324..8279df7d 100644 --- a/script/impl/dorado_disaster_recovery/ddr_constants.py +++ b/script/impl/dorado_disaster_recovery/ddr_constants.py @@ -23,7 +23,7 @@ class DoradoDisasterRecoveryConstants: - # streaming files + # dorado files DDR_LOG_FILE = "gs_ddr.log" DDR_FILES_DIR = 'ddr_cabin' DDR_CLUSTER_STATUS_TMP_FILE = "cluster_state_tmp" @@ -32,10 +32,10 @@ class DoradoDisasterRecoveryConstants: GS_SECURE_FILES = "gs_secure_files" HADR_KEY_CIPHER = "hadr.key.cipher" HADR_KEY_RAND = "hadr.key.rand" - STREAM_SWITCHOVER_STATE = ".switchover_cluster_state" + DDR_SWITCHOVER_STATE = ".switchover_cluster_state" MAX_TERM_RECORD = ".max_term_record" PROCESS_LOCK_FILE = 'ddr_lock_' - STREAMING_CONFIG_XML = "ddr_config.xml" + DDR_CONFIG_XML = "ddr_config.xml" GUC_BACKUP_FILE = ".ddr_guc_backup" CLUSTER_USER_RECORD = ".cluster_user_record" @@ -45,13 +45,13 @@ class DoradoDisasterRecoveryConstants: ACTION_ESTABLISH = "establish" - # streaming query temp file + # dorado query temp file DDR_CLUSTER_STAT = ".ddr_cluster_stat" DDR_FAILOVER_STAT = ".ddr_failover_stat" DDR_SWICHOVER_STAT = ".ddr_switchover_stat" DDR_ESTABLISH_STAT = ".ddr_establish_stat" - DDR_DISTRIBUTE_ACTION = "distribute_stream_failover" + DDR_DISTRIBUTE_ACTION = "distribute_dorado_failover" # GUC CHANGE MAP GUC_CHANGE_MAP = {} diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_failover.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_failover.py index 4f12b804..0249c85a 100644 --- a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_failover.py +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_failover.py @@ -16,14 +16,14 @@ # MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. # See the Mulan PSL v2 for more details. # ---------------------------------------------------------------------------- -# Description : streaming_disaster_recovery_failover.py is utility for +# Description : dorado_disaster_recovery_failover.py is utility for # standby cluster failover to primary cluster. 
from gspylib.common.Common import DefaultValue from gspylib.common.ErrorCode import ErrorCode from impl.dorado_disaster_recovery.ddr_base import DoradoDisasterRecoveryBase - +from impl.dorado_disaster_recovery.ddr_constants import DoradoDisasterRecoveryConstants class DisasterRecoveryFailoverHandler(DoradoDisasterRecoveryBase): def __init__(self, *args, **kwargs): @@ -35,7 +35,7 @@ class DisasterRecoveryFailoverHandler(DoradoDisasterRecoveryBase): step = self.check_dorado_failover_workable(check_type_step=3, check_status_step=0) self.check_is_under_upgrade() try: - self.dorado_failover_single_inst(step) + self.dorado_failover_single_inst(step, DoradoDisasterRecoveryConstants.ACTION_FAILOVER) self.update_dorado_info("cluster", "normal") self.clean_step_file() self.clean_flag_file() diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_query.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_query.py index 371582ae..c9ad12d1 100644 --- a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_query.py +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_query.py @@ -16,8 +16,8 @@ # MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. # See the Mulan PSL v2 for more details. # ---------------------------------------------------------------------------- -# Description : streaming_disaster_recovery_query.py is utility for -# query streaming disaster recovery condition. +# Description : dorado_disaster_recovery_query.py is utility for +# query dorado disaster recovery condition. import os @@ -27,11 +27,11 @@ from gspylib.common.Common import ClusterCommand from impl.dorado_disaster_recovery.ddr_base import DoradoDisasterRecoveryBase -class StreamingQueryHandler(DoradoDisasterRecoveryBase): +class DoradoQueryHandler(DoradoDisasterRecoveryBase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - def get_streaming_cluster_query_value(self, file_name): + def get_ddr_cluster_query_value(self, file_name): """ Query infos from files. """ @@ -99,70 +99,32 @@ class StreamingQueryHandler(DoradoDisasterRecoveryBase): return recovery_status return "recovery" - def get_max_rpo_rto(self): - """ - Get max rpo and rto. - """ - self.logger.log("Start check RPO & RTO.") - rpo_sql = "SELECT current_rpo FROM dbe_perf.global_streaming_hadr_rto_and_rpo_stat;" - rto_sql = "SELECT current_rto FROM dbe_perf.global_streaming_hadr_rto_and_rpo_stat;" - rto_rpo_sql = rpo_sql + rto_sql - if not self.primary_dn_ids: - self.logger.debug("Not found primary dn in cluster, cluster status:%s, " - "main standby:%s." 
% (self.cluster_status, self.main_standby_ids)) - return "", "" - log_info = "Execute sql [%s] on node [%s: %s] with result:%s" - dn_instances = [inst for node in self.cluster_info.dbNodes for inst in node.datanodes - if inst.instanceId in self.primary_dn_ids] - if dn_instances: - status, output = ClusterCommand.remoteSQLCommand( - rto_rpo_sql, self.user, dn_instances[0].hostname, dn_instances[0].port) - if status == 0 and output: - try: - rets = output.strip().split('\n') - length = len(rets) // 2 - rpo_list = [int(i) for i in rets[:length]] - rto_list = [int(j) for j in rets[length:]] - max_rpo, max_rto = str(max(rpo_list)), str(max(rto_list)) - except ValueError: - return "", "" - self.logger.debug("Successfully get max rpo:%s, rto:%s, output:%s" - % (max_rpo, max_rto, ','.join(output.split('\n')))) - return max_rpo, max_rto - else: - self.logger.debug(log_info % (rto_rpo_sql, dn_instances[0].hostname, - dn_instances[0].port, ','.join(output.split('\n')))) - return "", "" - def run(self): - self.logger.log("Start streaming disaster query.") + self.logger.log("Start dorado disaster query.") cluster_info = self.query_cluster_info() if cluster_info: self.parse_cluster_status(current_status=cluster_info) self.check_is_under_upgrade() - check_cluster_stat = self.get_streaming_cluster_query_value( + check_cluster_stat = self.get_ddr_cluster_query_value( DoradoDisasterRecoveryConstants.DDR_CLUSTER_STAT) archive_status = self.check_archive(check_cluster_stat, self.cluster_status) recovery_status = self.check_recovery(check_cluster_stat, self.cluster_status) - hadr_cluster_stat = archive_status or recovery_status or check_cluster_stat + ddr_cluster_stat = archive_status or recovery_status or check_cluster_stat - hadr_failover_stat = self.get_streaming_cluster_query_value( + ddr_failover_stat = self.get_ddr_cluster_query_value( DoradoDisasterRecoveryConstants.DDR_FAILOVER_STAT) - hadr_switchover_stat = self.get_streaming_cluster_query_value( + ddr_switchover_stat = self.get_ddr_cluster_query_value( DoradoDisasterRecoveryConstants.DDR_SWICHOVER_STAT) - if hadr_cluster_stat != "promote": - hadr_failover_stat = "" - if hadr_cluster_stat != "switchover": - hadr_switchover_stat = "" + if ddr_cluster_stat != "promote": + ddr_failover_stat = "" + if ddr_cluster_stat != "switchover": + ddr_switchover_stat = "" self.logger.debug("Start check max rpo and rto.") - max_rpo, max_rto = self.get_max_rpo_rto() self.logger.debug("Finished check max rpo and rto.") values = dict() - values["hadr_cluster_stat"] = hadr_cluster_stat - values["hadr_failover_stat"] = hadr_failover_stat - values["hadr_switchover_stat"] = hadr_switchover_stat - values["RPO"] = max_rpo - values["RTO"] = max_rto - self.logger.log("Successfully executed streaming disaster " + values["ddr_cluster_stat"] = ddr_cluster_stat + values["ddr_failover_stat"] = ddr_failover_stat + values["ddr_switchover_stat"] = ddr_switchover_stat + self.logger.log("Successfully executed dorado disaster " "recovery query, result:\n%s" % values) diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_start.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_start.py index 0bfb7ecd..cab635a7 100644 --- a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_start.py +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_start.py @@ -170,26 +170,6 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): #self.clean_gs_secure_dir() self.clean_step_file() - 
def _check_and_refresh_disaster_user_permission(self): - """check and refresh disaster user permission""" - if self.params.mode != "primary": - return - self.check_hadr_user(only_mode='primary') - self.check_hadr_pwd(only_mode='primary') - self.logger.debug("Encrypt hadr user info to database not " - "for mode:%s." % self.params.mode) - hadr_cipher_path = os.path.join(self.bin_path, "hadr.key.cipher") - hadr_rand_path = os.path.join(self.bin_path, "hadr.key.rand") - if not os.path.isfile(hadr_cipher_path) or not os.path.isfile(hadr_rand_path): - self.hadr_key_generator('hadr') - user_info = DefaultValue.obtain_hadr_user_encrypt_str(self.cluster_info, self.user, - self.logger, False, True) - if user_info: - self.clean_global_config() - pass_str = self.encrypt_hadr_user_info( - 'hadr', self.params.hadrUserName, self.params.hadrUserPassword) - self.keep_hadr_user_info(pass_str) - def run(self): self.logger.log("Start create dorado storage disaster relationship.") step = self.query_dorado_step() diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_switchover.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_switchover.py index ee08e2b0..fe981dc4 100644 --- a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_switchover.py +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_switchover.py @@ -16,7 +16,7 @@ # MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. # See the Mulan PSL v2 for more details. # ---------------------------------------------------------------------------- -# Description : streaming_disaster_recovery_switchover.py is a utility for +# Description : dorado_disaster_recovery_switchover.py is a utility for # changing role between primary cluster and standby cluster. import json import os @@ -231,7 +231,7 @@ class DisasterRecoverySwitchoverHandler(DoradoDisasterRecoveryBase): if not primary_dns: raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "obtain primary dns for switchover") - if self.streaming_dr_in_switchover(primary_dns): + if self.dorado_dr_in_switchover(primary_dns): if self.streaming_dr_service_truncation_check(primary_dns): self.logger.debug("Successfully set streaming master cluster in switchover.") @@ -270,9 +270,9 @@ class DisasterRecoverySwitchoverHandler(DoradoDisasterRecoveryBase): "truncation." % (dn_inst.hostname, dn_inst.mirrorId, dn_inst.instanceId)) return True - def streaming_dr_in_switchover(self, primary_dns_list): + def dorado_dr_in_switchover(self, primary_dns_list): """ - set steaming dr in switchover + set dorado dr in switchover """ results = parallelTool.parallelExecute(self.concurrent_set_dr_in_switchover, primary_dns_list) diff --git a/script/impl/dorado_disaster_recovery/params_handler.py b/script/impl/dorado_disaster_recovery/params_handler.py index 6feadd46..7a1b1940 100644 --- a/script/impl/dorado_disaster_recovery/params_handler.py +++ b/script/impl/dorado_disaster_recovery/params_handler.py @@ -16,7 +16,7 @@ # MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. # See the Mulan PSL v2 for more details. # ---------------------------------------------------------------------------- -# Description : params_handler.py is a utility for parsing and verifying streaming +# Description : params_handler.py is a utility for parsing and verifying dorado # disaster recovery params. 
############################################################################# @@ -33,7 +33,7 @@ from base_utils.security.security_checker import SecurityChecker, ValidationErro from domain_utils.cluster_file.version_info import VersionInfo -def check_streaming_start_mode(mode): +def check_ddr_start_mode(mode): """ Check start mode """ @@ -118,9 +118,9 @@ def check_remote_cluster_conf(value): SecurityChecker.check_ip_valid('dataIp of remoteClusterConf', data_ip) -STREAMING_PARAMS_FOR_MODULE = { +DORADO_PARAMS_FOR_MODULE = { "start": { - "mode": check_streaming_start_mode, + "mode": check_ddr_start_mode, "xml_path": check_xml_file, # "hadrUserName": check_hadr_user, # "hadrUserPassword": check_hadr_pwd, @@ -136,7 +136,7 @@ STREAMING_PARAMS_FOR_MODULE = { "remoteClusterConf": check_remote_cluster_conf }, "switchover": { - "mode": check_streaming_start_mode, + "mode": check_ddr_start_mode, "waitingTimeout": check_wait_timeout }, "failover": { @@ -146,7 +146,7 @@ STREAMING_PARAMS_FOR_MODULE = { } HELP_MSG = """ -gs_ddr is a utility for streaming disaster recovery fully options. +gs_ddr is a utility for dorado disaster recovery fully options. Usage: gs_ddr -? | --help @@ -209,7 +209,7 @@ class ParamsHandler(object): parser.add_option('-X', dest='xml_path', type='string', help='Cluster config xml path.') parser.add_option('--json', dest='json_path', type='string', - help='Config json file of streaming options') + help='Config json file of dorado options') parser.add_option('--time-out=', dest='timeout', default="1200", type='string', help='time out.') parser.add_option("-l", dest='logFile', type='string', @@ -259,16 +259,16 @@ class ParamsHandler(object): remote_cluster_conf.setdefault("port", cluster_info.remote_dn_base_port) remote_cluster_conf.setdefault("shards", cluster_info.remote_stream_ip_map) setattr(self.params, "remoteClusterConf", remote_cluster_conf) - self.logger.debug("Remote stream cluster conf: %s." % str(remote_cluster_conf)) + self.logger.debug("Remote cluster conf: %s." % str(remote_cluster_conf)) local_cluster_conf = dict() local_cluster_conf.setdefault("port", cluster_info.local_dn_base_port) local_cluster_conf.setdefault("shards", cluster_info.local_stream_ip_map) setattr(self.params, "localClusterConf", local_cluster_conf) - self.logger.debug("Local stream cluster conf: %s." % str(local_cluster_conf)) + self.logger.debug("Local cluster conf: %s." 
% str(local_cluster_conf)) if not remote_cluster_conf["shards"] or len(remote_cluster_conf["shards"])\ != len(local_cluster_conf["shards"]): - raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50026'] % "streaming DR") + raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50026'] % "dorado DR") def __init_default_params(self): """ @@ -342,7 +342,7 @@ class ParamsHandler(object): self.logger.log(DoradoDisasterRecoveryConstants.LOG_REMARK) self.__init_default_params() #self.__reload_hadr_user_info() - for param_name, validate in STREAMING_PARAMS_FOR_MODULE[self.params.task].items(): + for param_name, validate in DORADO_PARAMS_FOR_MODULE[self.params.task].items(): check_value = getattr(self.params, param_name) if self.params.task == "stop": if param_name == "xml_path" and not check_value: -- Gitee From 4683c32902e4bf880aaa5ef9afb5bef31b302096 Mon Sep 17 00:00:00 2001 From: Hao Date: Mon, 21 Aug 2023 09:51:43 +0800 Subject: [PATCH 20/23] gs_ddr query update --- script/impl/dorado_disaster_recovery/ddr_constants.py | 3 ++- .../ddr_modules/dorado_disaster_recovery_query.py | 2 +- script/impl/dorado_disaster_recovery/params_handler.py | 4 ---- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/script/impl/dorado_disaster_recovery/ddr_constants.py b/script/impl/dorado_disaster_recovery/ddr_constants.py index 8279df7d..b8f5e961 100644 --- a/script/impl/dorado_disaster_recovery/ddr_constants.py +++ b/script/impl/dorado_disaster_recovery/ddr_constants.py @@ -61,7 +61,8 @@ class DoradoDisasterRecoveryConstants: "start": ["localClusterConf", "remoteClusterConf"], "stop": ["localClusterConf", "remoteClusterConf"], "switchover": [], - "failover": [] + "failover": [], + "query": [] } # step file of each module diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_query.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_query.py index c9ad12d1..1049790e 100644 --- a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_query.py +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_query.py @@ -60,7 +60,7 @@ class DoradoQueryHandler(DoradoDisasterRecoveryBase): self.logger.debug("Ignore update archive for disaster_standby cluster.") return archive_status sql_check = "select 1 from pg_catalog.pg_stat_get_wal_senders() where sync_state" \ - "='Async' and peer_role='Standby' and peer_state='Normal';" + "='Async' and peer_role='StandbyCluster_Standby' and peer_state='Normal';" dn_instances = [inst for node in self.cluster_info.dbNodes for inst in node.datanodes if inst.instanceId in self.primary_dn_ids] self.logger.debug("Check archive with cmd:%s." % sql_check) diff --git a/script/impl/dorado_disaster_recovery/params_handler.py b/script/impl/dorado_disaster_recovery/params_handler.py index 7a1b1940..81c07821 100644 --- a/script/impl/dorado_disaster_recovery/params_handler.py +++ b/script/impl/dorado_disaster_recovery/params_handler.py @@ -202,10 +202,6 @@ class ParamsHandler(object): '"switchover", "failover", "query"') parser.add_option('-m', dest='mode', type='string', help='Cluster run mode. 
It could be ["primary", "disaster_standby"].') - # parser.add_option('-U', dest='hadrusername', type='string', - # help='hadr user name.') - # parser.add_option('-W', dest='hadruserpasswd', type='string', - # help='hadr user password.') parser.add_option('-X', dest='xml_path', type='string', help='Cluster config xml path.') parser.add_option('--json', dest='json_path', type='string', -- Gitee From 55ff5d4bd6c482a14cba4f7a735f3f81b1b77232 Mon Sep 17 00:00:00 2001 From: Hao Date: Tue, 22 Aug 2023 10:39:54 +0800 Subject: [PATCH 21/23] update comment --- .../dorado_disaster_recovery_start.py | 26 +++++-------------- .../dorado_disaster_recovery_stop.py | 2 +- .../params_handler.py | 5 ++-- 3 files changed, 10 insertions(+), 23 deletions(-) diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_start.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_start.py index cab635a7..17bfffe1 100644 --- a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_start.py +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_start.py @@ -34,14 +34,12 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): def _first_step_for_ddr_start(self, step): """ - First step for streaming start + First step for ddr start """ if step >= 2: return self.logger.debug("Start first step of DisasterRecovery start.") - #创建容灾过程使用的临时目录 self.create_disaster_recovery_dir(self.dorado_file_dir) - #检查执行的标志文件 self.check_action_and_mode() self.init_cluster_status() @@ -59,7 +57,6 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): self.logger.logExit(ErrorCode.GAUSS_516["GAUSS_51632"] % "check cm_ctl is available for current cluster") self.check_is_under_upgrade() - #检查dn的GUC参数 self.check_dn_instance_params() self.write_dorado_step("2_check_cluster_step") @@ -73,7 +70,7 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): def _third_step_for_ddr_start(self, step): """ - Third step for streaming start + Third step for ddr start """ if step >= 3: return @@ -89,7 +86,7 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): def _fourth_step_for_ddr_start(self, step): """ - Fourth step for streaming start + Fourth step for ddr start """ if step >= 4: return @@ -104,8 +101,6 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): if step >= 5: return self.logger.debug("Start fifth step of ddr start.") - # self.set_data_in_dcc(self.backup_open_key, "0", only_mode='primary') - # self.set_data_in_dcc(self.backup_open_key, "1", only_mode='disaster_standby') self.start_cluster(only_mode="primary") self.write_dorado_step("5_start_primary_cluster_step") @@ -122,7 +117,7 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): def _seventh_step_for_ddr_start(self, step): """ - Seventh step for streaming start + Seventh step for ddr start """ if step >= 7 or self.params.mode == "primary": return @@ -139,7 +134,7 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): def _eighth_step_for_ddr_start(self, step): """ - Eighth step for streaming start + Eighth step for ddr start """ if step >= 8: return @@ -161,31 +156,24 @@ class DisasterRecoveryStartHandler(DoradoDisasterRecoveryBase): def _ninth_step_for_ddr_start(self, step): """ - ninth step for streaming start + ninth step for ddr start """ if step >= 9: return - self.logger.debug("Start ninth step of streaming start.") - #self.restore_wal_keep_segments(only_mode='primary') - #self.clean_gs_secure_dir() + 
self.logger.debug("Start ninth step of ddr start.") self.clean_step_file() def run(self): self.logger.log("Start create dorado storage disaster relationship.") step = self.query_dorado_step() self._first_step_for_ddr_start(step) - #1.检查集群状态正常 self.parse_cluster_status() - #检查集群内dn状态和cm服务 self._second_step_for_ddr_start(step) - #更新pg_hba和replinfo self.common_step_for_ddr_start() self._third_step_for_ddr_start(step) self._fourth_step_for_ddr_start(step) self._fifth_step_for_ddr_start(step) - #设置CM backup_open参数,灾备backup_open=1, 主集群backup_open=0 self._sixth_step_for_ddr_start(step) - #start dss,build main standby self._seventh_step_for_ddr_start(step) self._eighth_step_for_ddr_start(step) self._ninth_step_for_ddr_start(step) diff --git a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_stop.py b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_stop.py index 881aba57..c2d8fe99 100644 --- a/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_stop.py +++ b/script/impl/dorado_disaster_recovery/ddr_modules/dorado_disaster_recovery_stop.py @@ -92,7 +92,7 @@ class DisasterRecoveryStopHandler(DoradoDisasterRecoveryBase): self.clean_dorado_dir() def run(self): - self.logger.log("Start remove dorado disaster relationship.") + self.logger.log("Start remove dorado disaster recovery relationship.") step = self.query_dorado_step() self._first_step_for_ddr_stop(step) self.parse_cluster_status() diff --git a/script/impl/dorado_disaster_recovery/params_handler.py b/script/impl/dorado_disaster_recovery/params_handler.py index 81c07821..134a92d6 100644 --- a/script/impl/dorado_disaster_recovery/params_handler.py +++ b/script/impl/dorado_disaster_recovery/params_handler.py @@ -122,8 +122,6 @@ DORADO_PARAMS_FOR_MODULE = { "start": { "mode": check_ddr_start_mode, "xml_path": check_xml_file, - # "hadrUserName": check_hadr_user, - # "hadrUserPassword": check_hadr_pwd, "doradoConfig": check_dorado_config, "waitingTimeout": check_wait_timeout, "localClusterConf": check_local_cluster_conf, @@ -151,7 +149,7 @@ gs_ddr is a utility for dorado disaster recovery fully options. Usage: gs_ddr -? | --help gs_ddr -V | --version - gs_ddr -t start -m [primary|disaster_standby] -X XMLFILE [--time-out=SECS] [-l LOGFILE] + gs_ddr -t start -m [primary|disaster_standby] -X XMLFILE --dorado-config=XLOG_SHARED_DISK [--time-out=SECS] [-l LOGFILE] gs_ddr -t stop -X XMLFILE|--json JSONFILE [-l LOGFILE] gs_ddr -t switchover -m [primary|disaster_standby] [--time-out=SECS] [-l LOGFILE] gs_ddr -t failover [-l LOGFILE] @@ -168,6 +166,7 @@ General options: -W Disaster recovery user password. -X Path of the XML configuration file. -l Path of log file. + --dorado-config Path of dorado xlog share disk. --time-out=SECS Maximum waiting time when Main standby connect to the primary dn, default value is 1200s. 
""" -- Gitee From 767968066aefb580b3b1e7b2c00e12d298140f99 Mon Sep 17 00:00:00 2001 From: Hao Date: Tue, 22 Aug 2023 16:48:28 +0800 Subject: [PATCH 22/23] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20ddr=5Fbase.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- script/gspylib/common/Common.py | 4 ++-- script/gspylib/common/DbClusterInfo.py | 12 ------------ script/impl/dorado_disaster_recovery/ddr_base.py | 3 +-- 3 files changed, 3 insertions(+), 16 deletions(-) diff --git a/script/gspylib/common/Common.py b/script/gspylib/common/Common.py index 4f91eb9b..2d1016fa 100644 --- a/script/gspylib/common/Common.py +++ b/script/gspylib/common/Common.py @@ -3027,7 +3027,7 @@ class DefaultValue(): if os.path.isfile(cm_agent_conf_temp_file): with open(cm_agent_conf_temp_file, "r") as cma_conf_file: content = cma_conf_file.read() - ret = re.findall(r'agent_backup_open *= *1', content) + ret = re.findall(r'agent_backup_open *= *1|agent_backup_open *= *2', content) g_file.removeFile(cm_agent_conf_temp_file) if ret: return True @@ -3037,7 +3037,7 @@ class DefaultValue(): raise Exception(ErrorCode.GAUSS_502['GAUSS_50201'] % cm_agent_conf_file) with open(cm_agent_conf_file, "r") as cma_conf_file: content = cma_conf_file.read() - ret = re.findall(r'agent_backup_open *= *1', content) + ret = re.findall(r'agent_backup_open *= *1|agent_backup_open *= *2', content) if ret: return True else: diff --git a/script/gspylib/common/DbClusterInfo.py b/script/gspylib/common/DbClusterInfo.py index 305a00c4..91564fca 100644 --- a/script/gspylib/common/DbClusterInfo.py +++ b/script/gspylib/common/DbClusterInfo.py @@ -1689,18 +1689,6 @@ class dbClusterInfo(): def get_staic_conf_path(self, user, ignore_err=False): return self.__getStaticConfigFilePath(user=user, ignore_err=ignore_err) - def get_mpprc_file(self, user): - """ - get mpprc file - """ - mpprcFile = EnvUtil.getEnvironmentParameterValue('MPPDB_ENV_SEPARATE_PATH', user) - if mpprcFile is not None and mpprcFile != "": - mpprcFile = mpprcFile.replace("\\", "\\\\").replace('"', '\\"\\"') - checkPathVaild(mpprcFile) - userProfile = mpprcFile - else: - userProfile = ClusterConstants.BASHRC - return userProfile def __getEnvironmentParameterValue(self, environmentParameterName, user): """ diff --git a/script/impl/dorado_disaster_recovery/ddr_base.py b/script/impl/dorado_disaster_recovery/ddr_base.py index 5e0064a2..9b9a5a57 100644 --- a/script/impl/dorado_disaster_recovery/ddr_base.py +++ b/script/impl/dorado_disaster_recovery/ddr_base.py @@ -1436,8 +1436,7 @@ class DoradoDisasterRecoveryBase(object): check datanode info by "gs_ctl query" command. """ state, dest_ip, datadir = params - # get mpprc file - mpprcFile = self.cluster_info.get_mpprc_file(self.user) + mpprcFile = self.mpp_file if dest_ip == self.local_host: cmd = "source %s && gs_ctl query -D %s" % (mpprcFile, datadir) else: -- Gitee From 9c0355699d89bbc95bd588ac5f7c1ae6628664e4 Mon Sep 17 00:00:00 2001 From: Hao Date: Tue, 22 Aug 2023 16:57:50 +0800 Subject: [PATCH 23/23] revert build.sh and get_PlatForm_str.sh --- build.sh | 6 ++---- build/get_PlatForm_str.sh | 10 +++++----- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/build.sh b/build.sh index e4f76dba..ccf89dda 100644 --- a/build.sh +++ b/build.sh @@ -54,7 +54,7 @@ done PLAT_FORM_STR=$(sh "${ROOT_DIR}/build/get_PlatForm_str.sh") if [ "${PLAT_FORM_STR}"x == "Failed"x ]; then - echo "We only support kylin(aarch64), EulerOS(aarch64), FusionOS, CentOS, UnionTech(X86) platform." 
+ echo "We only support openEuler(aarch64), EulerOS(aarch64), FusionOS, CentOS, UnionTech(X86) platform." exit 1; fi @@ -68,8 +68,6 @@ if [ X$(echo $PLAT_FORM_STR | grep "centos") != X"" ]; then dist_version="CentOS" elif [ X$(echo $PLAT_FORM_STR | grep "openeuler") != X"" ]; then dist_version="openEuler" -elif [ X$(echo $PLAT_FORM_STR | grep "kylin") != X"" ]; then - dist_version="kylin" elif [ X$(echo $PLAT_FORM_STR | grep "fusionos") != X"" ]; then dist_version="FusionOS" elif [ X$(echo $PLAT_FORM_STR | grep "euleros") != X"" ]; then @@ -81,7 +79,7 @@ elif [ X$(echo $PLAT_FORM_STR | grep "asianux") != X"" ]; then elif [ X$(echo $PLAT_FORM_STR | grep "UnionTech") != X"" ]; then dist_version="UnionTech" else - echo "We only support kylin(aarch64), EulerOS(aarch64), FusionOS, CentOS, Ubuntu(x86), UnionTech(x86) platform." + echo "We only support openEuler(aarch64), EulerOS(aarch64), FusionOS, CentOS, Ubuntu(x86), UnionTech(x86) platform." echo "Kernel is $kernel" exit 1 fi diff --git a/build/get_PlatForm_str.sh b/build/get_PlatForm_str.sh index 98e3233e..2bd8af9b 100644 --- a/build/get_PlatForm_str.sh +++ b/build/get_PlatForm_str.sh @@ -19,14 +19,14 @@ function get_os_str() { cpu_arc=$(uname -p) - if [ "$os_name"x = "centos"x ] && [ "$cpu_arc"x = "x86_64"x ]; then + if [ "$os_name"x = "centos"x ] && [ "$cpu_arc"x = "x86_64"x ]; then os_str=centos7.6_x86_64 elif [ "$os_name"x = "euleros"x ] && [ "$cpu_arc"x = "aarch64"x ]; then os_str=euleros2.0_sp8_aarch64 - elif [ "$os_name"x = "kylin"x ] && [ "$cpu_arc"x = "aarch64"x ]; then - os_str=kylin_aarch64 - elif [ "$os_name"x = "kylin"x ] && [ "$cpu_arc"x = "x86_64"x ]; then - os_str=kylin_x86_64 + elif [ "$os_name"x = "openEuler"x ] && [ "$cpu_arc"x = "aarch64"x ]; then + os_str=openeuler_aarch64 + elif [ "$os_name"x = "openEuler"x ] && [ "$cpu_arc"x = "x86_64"x ]; then + os_str=openeuler_x86_64 elif [ "$os_name"x = "fusionos"x ] && [ "$cpu_arc"x = "aarch64"x ]; then os_str=fusionos_aarch64 elif [ "$os_name"x = "fusionos"x ] && [ "$cpu_arc"x = "x86_64"x ]; then -- Gitee