Properly correct handling of primary during shutdown

This commit is contained in:
Joshua Boniface 2018-09-30 18:45:39 -04:00
parent 1483db3c2d
commit a86fd06184
2 changed files with 76 additions and 70 deletions

View File

@ -23,12 +23,13 @@
import kazoo.client import kazoo.client
import sys import sys
import os import os
import signal
import socket import socket
import psutil import psutil
import subprocess import subprocess
import time import time
import configparser import configparser
import signal
import atexit
import apscheduler.schedulers.background import apscheduler.schedulers.background
import daemon_lib.ansiiprint as ansiiprint import daemon_lib.ansiiprint as ansiiprint
@ -142,25 +143,30 @@ def zk_listener(state):
zk_conn.add_listener(zk_listener) zk_conn.add_listener(zk_listener)
# Cleanup function # Cleanup function
def cleanup(signum, frame): def cleanup():
ansiiprint.echo('Terminating daemon', '', 'e') ansiiprint.echo('Cleaning up', '', 'e')
# Stop keepalive thread
stopKeepaliveTimer(update_timer)
# Set stop state in Zookeeper # Set stop state in Zookeeper
zkhandler.writedata(zk_conn, {'/routers/{}/daemonstate'.format(myhostname): 'stop'}) zkhandler.writedata(zk_conn, {'/routers/{}/daemonstate'.format(myhostname): 'stop'})
if this_router.name == this_router.primary_router:
zkhandler.writedata(zk_conn, {'/routers': 'none'})
# Wait for everything to flush
time.sleep(3)
# Close the Zookeeper connection # Close the Zookeeper connection
try: try:
zk_conn.stop() zk_conn.stop()
zk_conn.close() zk_conn.close()
except: except:
pass pass
# Stop keepalive thread
stopKeepaliveTimer(update_timer)
# Exit
sys.exit(0)
# Handle signals gracefully ansiiprint.echo('Terminating daemon', '', 'e')
signal.signal(signal.SIGTERM, cleanup)
signal.signal(signal.SIGINT, cleanup) atexit.register(cleanup)
signal.signal(signal.SIGQUIT, cleanup)
# Gather useful data about our host for staticdata # Gather useful data about our host for staticdata
# Static data format: 'cpu_count', 'arch', 'os', 'kernel' # Static data format: 'cpu_count', 'arch', 'os', 'kernel'
@ -230,7 +236,6 @@ def updaterouters(new_router_list):
# Set up our update function # Set up our update function
this_router = t_router[myhostname] this_router = t_router[myhostname]
update_zookeeper = this_router.update_zookeeper update_zookeeper = this_router.update_zookeeper
update_zookeeper()
@zk_conn.ChildrenWatch('/networks') @zk_conn.ChildrenWatch('/networks')
def updatenetworks(new_network_list): def updatenetworks(new_network_list):
@ -269,6 +274,6 @@ update_timer = createKeepaliveTimer()
# Tick loop # Tick loop
while True: while True:
try: try:
time.sleep(0.1) time.sleep(0.5)
except: except:
break break

View File

@ -40,6 +40,7 @@ class RouterInstance():
self.config = config self.config = config
self.this_router = this_router self.this_router = this_router
self.name = name self.name = name
self.primary_router = None
self.daemon_state = 'stop' self.daemon_state = 'stop'
self.network_state = 'secondary' self.network_state = 'secondary'
self.t_router = t_router self.t_router = t_router
@ -52,26 +53,59 @@ class RouterInstance():
# Zookeeper handlers for changed states # Zookeeper handlers for changed states
@zk_conn.DataWatch('/routers/{}/daemonstate'.format(self.name)) @zk_conn.DataWatch('/routers/{}/daemonstate'.format(self.name))
def watch_hypervisor_daemonstate(data, stat, event=""): def watch_router_daemonstate(data, stat, event=''):
try: try:
self.daemon_state = data.decode('ascii') data = data.decode('ascii')
except AttributeError: except AttributeError:
self.daemon_state = 'stop' data = 'stop'
if data != self.daemon_state:
self.daemon_state = data
@zk_conn.DataWatch('/routers/{}/networkstate'.format(self.name)) @zk_conn.DataWatch('/routers/{}/networkstate'.format(self.name))
def watch_hypervisor_networkstate(data, stat, event=""): def watch_router_networkstate(data, stat, event=''):
try: try:
self.network_state = data.decode('ascii') data = data.decode('ascii')
except AttributeError: except AttributeError:
self.network_state = 'secondary' data = 'secondary'
# toggle state management of this router if data != self.network_state:
if s_network != {}: # If there's no network list, we're too early in startup self.network_state = data
if self.name == self.this_router: if self.name == self.this_router:
if self.network_state == 'secondary':
self.become_secondary()
if self.network_state == 'primary': if self.network_state == 'primary':
self.become_primary() self.become_primary()
else:
self.become_secondary()
@zk_conn.DataWatch('/routers')
def watch_primary_router(data, stat, event=''):
try:
data = data.decode('ascii')
except AttributeError:
data = 'none'
# toggle state management of this router
if data != self.primary_router:
if data == 'none':
if self.name == self.this_router:
if self.daemon_state == 'run' and self.network_state != 'primary':
# Contend for primary
ansiiprint.echo('Contending for primary', '', 'i')
zkhandler.writedata(self.zk_conn, {
'/routers': self.name
})
elif data == self.this_router:
if self.name == self.this_router:
zkhandler.writedata(self.zk_conn, {
'/routers/{}/networkstate'.format(self.name): 'primary',
})
self.primary_router = data
else:
if self.name == self.this_router:
zkhandler.writedata(self.zk_conn, {
'/routers/{}/networkstate'.format(self.name): 'secondary',
})
self.primary_router = data
# Get value functions # Get value functions
def getname(self): def getname(self):
@ -96,43 +130,19 @@ class RouterInstance():
self.network_list.append(s_network[network].getvni()) self.network_list.append(s_network[network].getvni())
def become_secondary(self): def become_secondary(self):
time.sleep(1)
ansiiprint.echo('Setting router {} to secondary state'.format(self.name), '', 'i') ansiiprint.echo('Setting router {} to secondary state'.format(self.name), '', 'i')
ansiiprint.echo('Network list: {}'.format(', '.join(self.network_list)), '', 'c') ansiiprint.echo('Network list: {}'.format(', '.join(self.network_list)), '', 'c')
for router in self.t_router:
if self.t_router[router].getname() != self.this_router:
if self.t_router[router].getnetworkstate() != 'primary':
zkhandler.writedata(self.zk_conn, { '/routers/{}/networkstate'.format(self.t_router[router].getname()): 'primary' })
time.sleep(2)
for network in self.s_network: for network in self.s_network:
self.s_network[network].stopDHCPServer() self.s_network[network].stopDHCPServer()
self.s_network[network].removeGatewayAddress() self.s_network[network].removeGatewayAddress()
def set_secondary(self):
result = zkhandler.writedata(self.zk_conn, {
'/routers/{}/networkstate'.format(self.name): 'secondary'
})
if not result:
time.sleep(1)
def become_primary(self): def become_primary(self):
ansiiprint.echo('Setting router {} to primary state.'.format(self.name), '', 'i') ansiiprint.echo('Setting router {} to primary state.'.format(self.name), '', 'i')
ansiiprint.echo('Network list: {}'.format(', '.join(self.network_list)), '', 'c') ansiiprint.echo('Network list: {}'.format(', '.join(self.network_list)), '', 'c')
for network in self.s_network: for network in self.s_network:
self.s_network[network].createGatewayAddress() self.s_network[network].createGatewayAddress()
self.s_network[network].startDHCPServer() self.s_network[network].startDHCPServer()
for router in self.t_router:
if self.t_router[router].getname() != self.this_router:
if self.t_router[router].getnetworkstate() != 'secondary':
zkhandler.writedata(self.zk_conn, { '/routers/{}/networkstate'.format(self.t_router[router].getname()): 'secondary' })
def set_primary(self):
result = zkhandler.writedata(self.zk_conn, {
'/routers': self.name,
'/routers/{}/networkstate'.format(self.name): 'primary',
})
if not result:
time.sleep(1)
def update_zookeeper(self): def update_zookeeper(self):
# Get past state and update if needed # Get past state and update if needed
@ -181,7 +191,7 @@ class RouterInstance():
fence_thread.start() fence_thread.start()
# Update the arrays # Update the arrays
if router_daemon_state == 'run' and router_network_state != 'secondary' and router_name not in self.primary_router_list: if router_daemon_state == 'run' and router_network_state == 'primary' and router_name not in self.primary_router_list:
self.primary_router_list.append(router_name) self.primary_router_list.append(router_name)
try: try:
self.secondary_router_list.remove(router_name) self.secondary_router_list.remove(router_name)
@ -191,17 +201,7 @@ class RouterInstance():
self.inactive_router_list.remove(router_name) self.inactive_router_list.remove(router_name)
except ValueError: except ValueError:
pass pass
if router_daemon_state != 'run' and router_network_state != 'secondary' and router_name not in self.inactive_router_list: if router_daemon_state == 'run' and router_network_state == 'secondary' and router_name not in self.secondary_router_list:
self.inactive_router_list.append(router_name)
try:
self.primary_router_list.remove(router_name)
except ValueError:
pass
try:
self.secondary_router_list.remove(router_name)
except ValueError:
pass
if router_network_state == 'secondary' and router_name not in self.secondary_router_list:
self.secondary_router_list.append(router_name) self.secondary_router_list.append(router_name)
try: try:
self.primary_router_list.remove(router_name) self.primary_router_list.remove(router_name)
@ -211,15 +211,16 @@ class RouterInstance():
self.inactive_router_list.remove(router_name) self.inactive_router_list.remove(router_name)
except ValueError: except ValueError:
pass pass
if router_daemon_state != 'run' and router_name not in self.inactive_router_list:
# Try to set ourself primary if there is no primary in the cluster self.inactive_router_list.append(router_name)
cluster_has_primary = False try:
for router in self.t_router: self.primary_router_list.remove(router_name)
if self.t_router[router].getnetworkstate() == 'primary': except ValueError:
cluster_has_primary = True pass
break try:
if not cluster_has_primary: self.secondary_router_list.remove(router_name)
self.set_primary() except ValueError:
pass
# Display cluster information to the terminal # Display cluster information to the terminal
ansiiprint.echo('{}Cluster status{}'.format(ansiiprint.purple(), ansiiprint.end()), '', 't') ansiiprint.echo('{}Cluster status{}'.format(ansiiprint.purple(), ansiiprint.end()), '', 't')