2018-09-24 01:03:16 -04:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
# RouterInstance.py - Class implementing a PVC router and run by pvcrd
|
|
|
|
# Part of the Parallel Virtual Cluster (PVC) system
|
|
|
|
#
|
|
|
|
# Copyright (C) 2018 Joshua M. Boniface <joshua@boniface.me>
|
|
|
|
#
|
|
|
|
# This program is free software: you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
#
|
|
|
|
###############################################################################
|
|
|
|
|
|
|
|
import os
|
|
|
|
import sys
|
|
|
|
import psutil
|
|
|
|
import socket
|
|
|
|
import time
|
|
|
|
import threading
|
|
|
|
import subprocess
|
|
|
|
|
|
|
|
import daemon_lib.ansiiprint as ansiiprint
|
|
|
|
import daemon_lib.zkhandler as zkhandler
|
2018-09-24 01:13:05 -04:00
|
|
|
import daemon_lib.common as common
|
2018-09-24 01:03:16 -04:00
|
|
|
|
|
|
|
class RouterInstance():
    """Local view of one PVC router, kept in sync with Zookeeper.

    One instance exists per router name. The instance whose ``name`` equals
    ``this_router`` represents the router this daemon runs on; only that
    instance writes network state to Zookeeper or starts/stops the gateway
    addresses and DHCP servers of the managed networks.
    """

    # Initialization function
    def __init__(self, this_router, name, t_router, s_network, zk_conn, config):
        """Store the passed-in state and register the Zookeeper data watches.

        Args:
            this_router: name of the router the running daemon represents.
            name: name of the router tracked by this instance.
            t_router: iterable yielding router names; walked on every
                keepalive in ``update_zookeeper``.
            s_network: mapping whose values are network objects providing
                ``getvni()``, ``createGatewayAddress()``,
                ``removeGatewayAddress()``, ``startDHCPServer()`` and
                ``stopDHCPServer()``.
            zk_conn: Zookeeper connection object providing a ``DataWatch``
                decorator (presumably a kazoo client - confirm).
            config: mapping read for the keys ``ipmi_hostname``,
                ``keepalive_interval``, ``fence_intervals`` and
                ``nftables_rules_dir``.
        """
        # Passed-in variables on creation
        self.zk_conn = zk_conn
        self.config = config
        self.this_router = this_router
        self.name = name
        self.primary_router = None
        self.daemon_state = 'stop'
        self.network_state = 'secondary'
        self.t_router = t_router
        self.primary_router_list = []
        self.secondary_router_list = []
        self.inactive_router_list = []
        self.s_network = s_network
        self.network_list = []
        self.ipmi_hostname = self.config['ipmi_hostname']

        # Zookeeper handlers for changed states
        @zk_conn.DataWatch('/routers/{}/daemonstate'.format(self.name))
        def watch_router_daemonstate(data, stat, event=''):
            # Data without a decode() method (e.g. None) is treated as 'stop'
            try:
                data = data.decode('ascii')
            except AttributeError:
                data = 'stop'

            if data != self.daemon_state:
                self.daemon_state = data

        @zk_conn.DataWatch('/routers/{}/networkstate'.format(self.name))
        def watch_router_networkstate(data, stat, event=''):
            # Data without a decode() method (e.g. None) is treated as 'secondary'
            try:
                data = data.decode('ascii')
            except AttributeError:
                data = 'secondary'

            if data != self.network_state:
                self.network_state = data
                # Only the instance for the local router acts on a transition
                if self.name == self.this_router:
                    if self.network_state == 'primary':
                        self.become_primary()
                    else:
                        self.become_secondary()

        @zk_conn.DataWatch('/routers')
        def watch_primary_router(data, stat, event=''):
            # The '/routers' key holds the name of the primary router, or 'none'
            try:
                data = data.decode('ascii')
            except AttributeError:
                data = 'none'

            # toggle state management of this router
            if data != self.primary_router:
                if data == 'none':
                    if self.name == self.this_router:
                        if self.daemon_state == 'run' and self.network_state != 'primary':
                            # Contend for primary
                            ansiiprint.echo('Contending for primary', '', 'i')
                            zkhandler.writedata(self.zk_conn, {
                                '/routers': self.name
                            })
                elif data == self.this_router:
                    if self.name == self.this_router:
                        zkhandler.writedata(self.zk_conn, {
                            '/routers/{}/networkstate'.format(self.name): 'primary',
                        })
                        # NOTE(review): nesting of this assignment was
                        # reconstructed from flattened source - confirm it
                        # belongs inside the ownership check.
                        self.primary_router = data
                else:
                    if self.name == self.this_router:
                        zkhandler.writedata(self.zk_conn, {
                            '/routers/{}/networkstate'.format(self.name): 'secondary',
                        })
                        # NOTE(review): same reconstruction caveat as above.
                        self.primary_router = data

    # Get value functions
    def getname(self):
        """Return the name of the router tracked by this instance."""
        return self.name

    def getdaemonstate(self):
        """Return the last daemon state recorded for this router."""
        return self.daemon_state

    def getnetworkstate(self):
        """Return the last network state recorded for this router."""
        return self.network_state

    def getnetworklist(self):
        """Return the list of VNIs built by the last ``updatenetworklist``."""
        return self.network_list

    # Update value functions
    def updaterouterlist(self, t_router):
        """Replace the collection of router names walked at each keepalive."""
        self.t_router = t_router

    def updatenetworklist(self, s_network):
        """Replace the network mapping and rebuild the list of VNIs from it."""
        self.s_network = s_network
        self.network_list = [s_network[network].getvni() for network in s_network]

    def become_secondary(self):
        """Stop DHCP and remove the gateway address on every managed network."""
        ansiiprint.echo('Setting router {} to secondary state'.format(self.name), '', 'i')
        # str() each entry so a non-string VNI cannot make join() raise
        ansiiprint.echo('Network list: {}'.format(', '.join(map(str, self.network_list))), '', 'c')
        time.sleep(0.5)
        for network in self.s_network:
            self.s_network[network].stopDHCPServer()
            self.s_network[network].removeGatewayAddress()

    def become_primary(self):
        """Create the gateway address and start DHCP on every managed network."""
        ansiiprint.echo('Setting router {} to primary state.'.format(self.name), '', 'i')
        # str() each entry so a non-string VNI cannot make join() raise
        ansiiprint.echo('Network list: {}'.format(', '.join(map(str, self.network_list))), '', 'c')
        for network in self.s_network:
            self.s_network[network].createGatewayAddress()
            self.s_network[network].startDHCPServer()

    def _update_router_lists(self, router_name, daemon_state, network_state):
        """File ``router_name`` under the list matching its reported state.

        A router whose daemon state is not 'run' is inactive; a running
        router is primary or secondary according to its network state. A
        running router reporting any other network state is left where it
        is. When the router is newly added to a list it is removed from the
        other two.
        """
        if daemon_state != 'run':
            target = self.inactive_router_list
        elif network_state == 'primary':
            target = self.primary_router_list
        elif network_state == 'secondary':
            target = self.secondary_router_list
        else:
            return

        if router_name in target:
            return

        target.append(router_name)
        for candidate in (self.primary_router_list, self.secondary_router_list, self.inactive_router_list):
            if candidate is not target and router_name in candidate:
                candidate.remove(router_name)

    def update_zookeeper(self):
        """Run one keepalive cycle for this router.

        Marks the daemon as running, re-asserts the primary key if needed,
        publishes keepalive and load data, checks every known router for a
        stale keepalive (starting a fence thread when one is found),
        refreshes the primary/secondary/inactive lists, prints a status
        summary and reloads the firewall rules when the ``update`` flag file
        exists. Returns early, after logging, if the keepalive data cannot
        be written.
        """
        # Get past state and update if needed
        past_state = zkhandler.readdata(self.zk_conn, '/routers/{}/daemonstate'.format(self.name))
        self.daemon_state = 'run'
        if past_state != 'run':
            zkhandler.writedata(self.zk_conn, {'/routers/{}/daemonstate'.format(self.name): 'run'})

        # Ensure the master key is properly set at a keepalive
        if self.name == self.this_router and self.network_state == 'primary':
            if zkhandler.readdata(self.zk_conn, '/routers') == 'none':
                zkhandler.writedata(self.zk_conn, {'/routers': self.name})

        # Set our information in zookeeper
        cpuload = os.getloadavg()[0]
        keepalive_time = int(time.time())
        try:
            zkhandler.writedata(self.zk_conn, {
                '/routers/{}/keepalive'.format(self.name): str(keepalive_time),
                '/routers/{}/cpuload'.format(self.name): str(cpuload),
            })
        except Exception:
            # Best effort: skip this cycle, but let KeyboardInterrupt and
            # SystemExit propagate
            ansiiprint.echo('Failed to set keepalive data', '', 'e')
            return

        # Display router information to the terminal
        ansiiprint.echo('{}{} keepalive{}'.format(ansiiprint.purple(), self.name, ansiiprint.end()), '', 't')
        ansiiprint.echo('{0}Networks count:{1} {2} {0}Load average:{1} {3}'.format(ansiiprint.bold(), ansiiprint.end(), len(self.network_list), cpuload), '', 'c')

        # Update our local router lists
        for router_name in self.t_router:
            try:
                router_daemon_state = zkhandler.readdata(self.zk_conn, '/routers/{}/daemonstate'.format(router_name))
                router_network_state = zkhandler.readdata(self.zk_conn, '/routers/{}/networkstate'.format(router_name))
                router_keepalive = int(zkhandler.readdata(self.zk_conn, '/routers/{}/keepalive'.format(router_name)))
            except Exception:
                # Unreadable or malformed data: treat the router as unknown
                router_daemon_state = 'unknown'
                router_network_state = 'unknown'
                router_keepalive = 0

            # Handle deadtime and fencing if needed
            # (A router is considered dead when its keepalive is more than
            # keepalive_interval * fence_intervals seconds out-of-date while
            # its daemon state is still 'run')
            router_deadtime = int(time.time()) - (int(self.config['keepalive_interval']) * int(self.config['fence_intervals']))
            if router_keepalive < router_deadtime and router_daemon_state == 'run':
                ansiiprint.echo('Router {} seems dead - starting monitor for fencing'.format(router_name), '', 'w')
                zkhandler.writedata(self.zk_conn, {'/routers/{}/daemonstate'.format(router_name): 'dead'})
                fence_thread = threading.Thread(target=fenceRouter, args=(router_name, self.zk_conn, self.config), kwargs={})
                fence_thread.start()

            # Update the arrays (uses the states read above, before any
            # 'dead' mark written in this iteration)
            self._update_router_lists(router_name, router_daemon_state, router_network_state)

        # Display cluster information to the terminal
        ansiiprint.echo('{}Cluster status{}'.format(ansiiprint.purple(), ansiiprint.end()), '', 't')
        ansiiprint.echo('{}Primary router:{} {}'.format(ansiiprint.bold(), ansiiprint.end(), ' '.join(self.primary_router_list)), '', 'c')
        ansiiprint.echo('{}Secondary router:{} {}'.format(ansiiprint.bold(), ansiiprint.end(), ' '.join(self.secondary_router_list)), '', 'c')
        ansiiprint.echo('{}Inactive routers:{} {}'.format(ansiiprint.bold(), ansiiprint.end(), ' '.join(self.inactive_router_list)), '', 'c')

        # Reload firewall rules if needed
        update_flag = '{}/update'.format(self.config['nftables_rules_dir'])
        if os.path.isfile(update_flag):
            common.reload_firewall_rules(self.config['nftables_rules_dir'])
            os.remove(update_flag)
|
|
|
|
|
2018-09-24 01:03:16 -04:00
|
|
|
#
|
|
|
|
# Fence thread entry function
|
|
|
|
#
|
|
|
|
def fenceRouter(router_name, zk_conn, config):
    """Watch a router marked 'dead' and reboot it over IPMI if it stays dead.

    The router's daemon state is polled three times, five seconds apart. If
    any poll shows a state other than 'dead', the fence is cancelled and the
    function returns. Otherwise the router's IPMI details are read from
    Zookeeper, a reboot is requested, and after a three-second hold the
    router's network state is written as 'secondary'.

    Args:
        router_name: name of the router to monitor and possibly fence.
        zk_conn: Zookeeper connection passed through to ``zkhandler``.
        config: accepted but not read by this function.
    """
    # We allow exactly 3 saving throws for the host to come back online
    for failcount in range(1, 4):
        # Wait 5 seconds, then get the state
        time.sleep(5)
        current_state = zkhandler.readdata(zk_conn, '/routers/{}/daemonstate'.format(router_name))

        # It changed back to something else so it must be alive
        if current_state != 'dead':
            ansiiprint.echo('Router "{}" passed a saving throw; canceling fence'.format(router_name), '', 'o')
            return

        # It is still 'dead'
        ansiiprint.echo('Router "{}" failed {} saving throws'.format(router_name, failcount), '', 'w')

    ansiiprint.echo('Fencing router "{}" via IPMI reboot signal'.format(router_name), '', 'e')

    # Get IPMI information
    ipmi_hostname = zkhandler.readdata(zk_conn, '/routers/{}/ipmihostname'.format(router_name))
    ipmi_username = zkhandler.readdata(zk_conn, '/routers/{}/ipmiusername'.format(router_name))
    ipmi_password = zkhandler.readdata(zk_conn, '/routers/{}/ipmipassword'.format(router_name))

    # Shoot it in the head; the success flag returned is not consulted here
    rebootViaIPMI(ipmi_hostname, ipmi_username, ipmi_password)

    # Hold to ensure the fence takes effect
    time.sleep(3)

    # Set router in secondary state
    zkhandler.writedata(zk_conn, {'/routers/{}/networkstate'.format(router_name): 'secondary'})
|
|
|
|
|
|
|
|
#
|
|
|
|
# Perform an IPMI fence
|
|
|
|
#
|
|
|
|
def rebootViaIPMI(ipmi_hostname, ipmi_user, ipmi_password):
    """Request a chassis power reset through ``ipmitool`` and report the result.

    Args:
        ipmi_hostname: value passed to ipmitool's ``-H`` option.
        ipmi_user: value passed to ipmitool's ``-U`` option.
        ipmi_password: value passed to ipmitool's ``-P`` option.

    Returns:
        True when ``common.run_os_command`` returns a value equal to 0,
        False otherwise. The outcome is also echoed to the terminal.
    """
    # NOTE(review): the values are interpolated into the command unquoted;
    # confirm how common.run_os_command tokenises the string.
    ipmi_command = 'ipmitool -I lanplus -H {} -U {} -P {} chassis power reset'.format(
        ipmi_hostname, ipmi_user, ipmi_password
    )
    retcode = common.run_os_command(ipmi_command)

    if retcode != 0:
        ansiiprint.echo('Failed to reboot dead router', '', 'e')
        return False

    ansiiprint.echo('Successfully rebooted dead router', '', 'o')
    return True
|