From 0a01d84290a85a49e80197a5c3c827a52eaf5fe4 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Sat, 15 Aug 2020 12:38:03 -0400 Subject: [PATCH] Tie fence timers to keepalive_interval Also wait 2 full keepalive intervals after fencing before doing anything else, to give the Ceph cluster a chance to recover. --- node-daemon/pvcnoded/Daemon.py | 6 +++--- node-daemon/pvcnoded/fencing.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/node-daemon/pvcnoded/Daemon.py b/node-daemon/pvcnoded/Daemon.py index 932d0dc1..4ef5f7cb 100644 --- a/node-daemon/pvcnoded/Daemon.py +++ b/node-daemon/pvcnoded/Daemon.py @@ -155,9 +155,9 @@ def readConfig(pvcnoded_config_file, myhostname): 'log_keepalive_cluster_details': o_config['pvc']['system']['configuration']['logging']['log_keepalive_cluster_details'], 'log_keepalive_storage_details': o_config['pvc']['system']['configuration']['logging']['log_keepalive_storage_details'], 'console_log_lines': o_config['pvc']['system']['configuration']['logging']['console_log_lines'], - 'keepalive_interval': o_config['pvc']['system']['intervals']['keepalive_interval'], - 'fence_intervals': o_config['pvc']['system']['intervals']['fence_intervals'], - 'suicide_intervals': o_config['pvc']['system']['intervals']['suicide_intervals'], + 'keepalive_interval': int(o_config['pvc']['system']['intervals']['keepalive_interval']), + 'fence_intervals': int(o_config['pvc']['system']['intervals']['fence_intervals']), + 'suicide_intervals': int(o_config['pvc']['system']['intervals']['suicide_intervals']), 'successful_fence': o_config['pvc']['system']['fencing']['actions']['successful_fence'], 'failed_fence': o_config['pvc']['system']['fencing']['actions']['failed_fence'], 'migration_target_selector': o_config['pvc']['system']['migration']['target_selector'], diff --git a/node-daemon/pvcnoded/fencing.py b/node-daemon/pvcnoded/fencing.py index e58cc22d..991635ea 100644 --- a/node-daemon/pvcnoded/fencing.py +++ 
b/node-daemon/pvcnoded/fencing.py @@ -35,7 +35,7 @@ def fenceNode(node_name, zk_conn, config, logger): failcount = 0 while failcount < failcount_limit: # Wait 5 seconds - time.sleep(5) + time.sleep(config['keepalive_interval']) # Get the state node_daemon_state = zkhandler.readdata(zk_conn, '/nodes/{}/daemonstate'.format(node_name)) # Is it still 'dead' @@ -56,8 +56,8 @@ def fenceNode(node_name, zk_conn, config, logger): # Shoot it in the head fence_status = rebootViaIPMI(ipmi_hostname, ipmi_username, ipmi_password, logger) - # Hold to ensure the fence takes effect - time.sleep(3) + # Hold for two keepalive intervals so the fence takes effect and the system stabilizes + time.sleep(config['keepalive_interval'] * 2) # Force into secondary network state if needed if node_name in config['coordinators']: