Properly correct handling of primary during shutdown
This commit is contained in:
parent
1483db3c2d
commit
a86fd06184
|
@ -23,12 +23,13 @@
|
||||||
import kazoo.client
|
import kazoo.client
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
import signal
|
|
||||||
import socket
|
import socket
|
||||||
import psutil
|
import psutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import time
|
import time
|
||||||
import configparser
|
import configparser
|
||||||
|
import signal
|
||||||
|
import atexit
|
||||||
import apscheduler.schedulers.background
|
import apscheduler.schedulers.background
|
||||||
|
|
||||||
import daemon_lib.ansiiprint as ansiiprint
|
import daemon_lib.ansiiprint as ansiiprint
|
||||||
|
@ -142,25 +143,30 @@ def zk_listener(state):
|
||||||
zk_conn.add_listener(zk_listener)
|
zk_conn.add_listener(zk_listener)
|
||||||
|
|
||||||
# Cleanup function
|
# Cleanup function
|
||||||
def cleanup(signum, frame):
|
def cleanup():
|
||||||
ansiiprint.echo('Terminating daemon', '', 'e')
|
ansiiprint.echo('Cleaning up', '', 'e')
|
||||||
|
|
||||||
|
# Stop keepalive thread
|
||||||
|
stopKeepaliveTimer(update_timer)
|
||||||
|
|
||||||
# Set stop state in Zookeeper
|
# Set stop state in Zookeeper
|
||||||
zkhandler.writedata(zk_conn, { '/routers/{}/daemonstate'.format(myhostname): 'stop' })
|
zkhandler.writedata(zk_conn, {'/routers/{}/daemonstate'.format(myhostname): 'stop'})
|
||||||
|
if this_router.name == this_router.primary_router:
|
||||||
|
zkhandler.writedata(zk_conn, {'/routers': 'none'})
|
||||||
|
|
||||||
|
# Wait for everything to flush
|
||||||
|
time.sleep(3)
|
||||||
|
|
||||||
# Close the Zookeeper connection
|
# Close the Zookeeper connection
|
||||||
try:
|
try:
|
||||||
zk_conn.stop()
|
zk_conn.stop()
|
||||||
zk_conn.close()
|
zk_conn.close()
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
# Stop keepalive thread
|
|
||||||
stopKeepaliveTimer(update_timer)
|
|
||||||
# Exit
|
|
||||||
sys.exit(0)
|
|
||||||
|
|
||||||
# Handle signals gracefully
|
ansiiprint.echo('Terminating daemon', '', 'e')
|
||||||
signal.signal(signal.SIGTERM, cleanup)
|
|
||||||
signal.signal(signal.SIGINT, cleanup)
|
atexit.register(cleanup)
|
||||||
signal.signal(signal.SIGQUIT, cleanup)
|
|
||||||
|
|
||||||
# Gather useful data about our host for staticdata
|
# Gather useful data about our host for staticdata
|
||||||
# Static data format: 'cpu_count', 'arch', 'os', 'kernel'
|
# Static data format: 'cpu_count', 'arch', 'os', 'kernel'
|
||||||
|
@ -230,7 +236,6 @@ def updaterouters(new_router_list):
|
||||||
# Set up our update function
|
# Set up our update function
|
||||||
this_router = t_router[myhostname]
|
this_router = t_router[myhostname]
|
||||||
update_zookeeper = this_router.update_zookeeper
|
update_zookeeper = this_router.update_zookeeper
|
||||||
update_zookeeper()
|
|
||||||
|
|
||||||
@zk_conn.ChildrenWatch('/networks')
|
@zk_conn.ChildrenWatch('/networks')
|
||||||
def updatenetworks(new_network_list):
|
def updatenetworks(new_network_list):
|
||||||
|
@ -269,6 +274,6 @@ update_timer = createKeepaliveTimer()
|
||||||
# Tick loop
|
# Tick loop
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
time.sleep(0.1)
|
time.sleep(0.5)
|
||||||
except:
|
except:
|
||||||
break
|
break
|
||||||
|
|
|
@ -40,6 +40,7 @@ class RouterInstance():
|
||||||
self.config = config
|
self.config = config
|
||||||
self.this_router = this_router
|
self.this_router = this_router
|
||||||
self.name = name
|
self.name = name
|
||||||
|
self.primary_router = None
|
||||||
self.daemon_state = 'stop'
|
self.daemon_state = 'stop'
|
||||||
self.network_state = 'secondary'
|
self.network_state = 'secondary'
|
||||||
self.t_router = t_router
|
self.t_router = t_router
|
||||||
|
@ -52,26 +53,59 @@ class RouterInstance():
|
||||||
|
|
||||||
# Zookeeper handlers for changed states
|
# Zookeeper handlers for changed states
|
||||||
@zk_conn.DataWatch('/routers/{}/daemonstate'.format(self.name))
|
@zk_conn.DataWatch('/routers/{}/daemonstate'.format(self.name))
|
||||||
def watch_hypervisor_daemonstate(data, stat, event=""):
|
def watch_router_daemonstate(data, stat, event=''):
|
||||||
try:
|
try:
|
||||||
self.daemon_state = data.decode('ascii')
|
data = data.decode('ascii')
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
self.daemon_state = 'stop'
|
data = 'stop'
|
||||||
|
|
||||||
|
if data != self.daemon_state:
|
||||||
|
self.daemon_state = data
|
||||||
|
|
||||||
@zk_conn.DataWatch('/routers/{}/networkstate'.format(self.name))
|
@zk_conn.DataWatch('/routers/{}/networkstate'.format(self.name))
|
||||||
def watch_hypervisor_networkstate(data, stat, event=""):
|
def watch_router_networkstate(data, stat, event=''):
|
||||||
try:
|
try:
|
||||||
self.network_state = data.decode('ascii')
|
data = data.decode('ascii')
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
self.network_state = 'secondary'
|
data = 'secondary'
|
||||||
|
|
||||||
# toggle state management of this router
|
if data != self.network_state:
|
||||||
if s_network != {}: # If there's no network list, we're too early in startup
|
self.network_state = data
|
||||||
if self.name == self.this_router:
|
if self.name == self.this_router:
|
||||||
if self.network_state == 'secondary':
|
|
||||||
self.become_secondary()
|
|
||||||
if self.network_state == 'primary':
|
if self.network_state == 'primary':
|
||||||
self.become_primary()
|
self.become_primary()
|
||||||
|
else:
|
||||||
|
self.become_secondary()
|
||||||
|
|
||||||
|
@zk_conn.DataWatch('/routers')
|
||||||
|
def watch_primary_router(data, stat, event=''):
|
||||||
|
try:
|
||||||
|
data = data.decode('ascii')
|
||||||
|
except AttributeError:
|
||||||
|
data = 'none'
|
||||||
|
|
||||||
|
# toggle state management of this router
|
||||||
|
if data != self.primary_router:
|
||||||
|
if data == 'none':
|
||||||
|
if self.name == self.this_router:
|
||||||
|
if self.daemon_state == 'run' and self.network_state != 'primary':
|
||||||
|
# Contend for primary
|
||||||
|
ansiiprint.echo('Contending for primary', '', 'i')
|
||||||
|
zkhandler.writedata(self.zk_conn, {
|
||||||
|
'/routers': self.name
|
||||||
|
})
|
||||||
|
elif data == self.this_router:
|
||||||
|
if self.name == self.this_router:
|
||||||
|
zkhandler.writedata(self.zk_conn, {
|
||||||
|
'/routers/{}/networkstate'.format(self.name): 'primary',
|
||||||
|
})
|
||||||
|
self.primary_router = data
|
||||||
|
else:
|
||||||
|
if self.name == self.this_router:
|
||||||
|
zkhandler.writedata(self.zk_conn, {
|
||||||
|
'/routers/{}/networkstate'.format(self.name): 'secondary',
|
||||||
|
})
|
||||||
|
self.primary_router = data
|
||||||
|
|
||||||
# Get value functions
|
# Get value functions
|
||||||
def getname(self):
|
def getname(self):
|
||||||
|
@ -96,43 +130,19 @@ class RouterInstance():
|
||||||
self.network_list.append(s_network[network].getvni())
|
self.network_list.append(s_network[network].getvni())
|
||||||
|
|
||||||
def become_secondary(self):
|
def become_secondary(self):
|
||||||
|
time.sleep(1)
|
||||||
ansiiprint.echo('Setting router {} to secondary state'.format(self.name), '', 'i')
|
ansiiprint.echo('Setting router {} to secondary state'.format(self.name), '', 'i')
|
||||||
ansiiprint.echo('Network list: {}'.format(', '.join(self.network_list)), '', 'c')
|
ansiiprint.echo('Network list: {}'.format(', '.join(self.network_list)), '', 'c')
|
||||||
for router in self.t_router:
|
|
||||||
if self.t_router[router].getname() != self.this_router:
|
|
||||||
if self.t_router[router].getnetworkstate() != 'primary':
|
|
||||||
zkhandler.writedata(self.zk_conn, { '/routers/{}/networkstate'.format(self.t_router[router].getname()): 'primary' })
|
|
||||||
time.sleep(2)
|
|
||||||
for network in self.s_network:
|
for network in self.s_network:
|
||||||
self.s_network[network].stopDHCPServer()
|
self.s_network[network].stopDHCPServer()
|
||||||
self.s_network[network].removeGatewayAddress()
|
self.s_network[network].removeGatewayAddress()
|
||||||
|
|
||||||
def set_secondary(self):
|
|
||||||
result = zkhandler.writedata(self.zk_conn, {
|
|
||||||
'/routers/{}/networkstate'.format(self.name): 'secondary'
|
|
||||||
})
|
|
||||||
if not result:
|
|
||||||
time.sleep(1)
|
|
||||||
|
|
||||||
def become_primary(self):
|
def become_primary(self):
|
||||||
ansiiprint.echo('Setting router {} to primary state.'.format(self.name), '', 'i')
|
ansiiprint.echo('Setting router {} to primary state.'.format(self.name), '', 'i')
|
||||||
ansiiprint.echo('Network list: {}'.format(', '.join(self.network_list)), '', 'c')
|
ansiiprint.echo('Network list: {}'.format(', '.join(self.network_list)), '', 'c')
|
||||||
for network in self.s_network:
|
for network in self.s_network:
|
||||||
self.s_network[network].createGatewayAddress()
|
self.s_network[network].createGatewayAddress()
|
||||||
self.s_network[network].startDHCPServer()
|
self.s_network[network].startDHCPServer()
|
||||||
for router in self.t_router:
|
|
||||||
if self.t_router[router].getname() != self.this_router:
|
|
||||||
if self.t_router[router].getnetworkstate() != 'secondary':
|
|
||||||
zkhandler.writedata(self.zk_conn, { '/routers/{}/networkstate'.format(self.t_router[router].getname()): 'secondary' })
|
|
||||||
|
|
||||||
def set_primary(self):
|
|
||||||
result = zkhandler.writedata(self.zk_conn, {
|
|
||||||
'/routers': self.name,
|
|
||||||
'/routers/{}/networkstate'.format(self.name): 'primary',
|
|
||||||
})
|
|
||||||
if not result:
|
|
||||||
time.sleep(1)
|
|
||||||
|
|
||||||
|
|
||||||
def update_zookeeper(self):
|
def update_zookeeper(self):
|
||||||
# Get past state and update if needed
|
# Get past state and update if needed
|
||||||
|
@ -181,7 +191,7 @@ class RouterInstance():
|
||||||
fence_thread.start()
|
fence_thread.start()
|
||||||
|
|
||||||
# Update the arrays
|
# Update the arrays
|
||||||
if router_daemon_state == 'run' and router_network_state != 'secondary' and router_name not in self.primary_router_list:
|
if router_daemon_state == 'run' and router_network_state == 'primary' and router_name not in self.primary_router_list:
|
||||||
self.primary_router_list.append(router_name)
|
self.primary_router_list.append(router_name)
|
||||||
try:
|
try:
|
||||||
self.secondary_router_list.remove(router_name)
|
self.secondary_router_list.remove(router_name)
|
||||||
|
@ -191,17 +201,7 @@ class RouterInstance():
|
||||||
self.inactive_router_list.remove(router_name)
|
self.inactive_router_list.remove(router_name)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
if router_daemon_state != 'run' and router_network_state != 'secondary' and router_name not in self.inactive_router_list:
|
if router_daemon_state == 'run' and router_network_state == 'secondary' and router_name not in self.secondary_router_list:
|
||||||
self.inactive_router_list.append(router_name)
|
|
||||||
try:
|
|
||||||
self.primary_router_list.remove(router_name)
|
|
||||||
except ValueError:
|
|
||||||
pass
|
|
||||||
try:
|
|
||||||
self.secondary_router_list.remove(router_name)
|
|
||||||
except ValueError:
|
|
||||||
pass
|
|
||||||
if router_network_state == 'secondary' and router_name not in self.secondary_router_list:
|
|
||||||
self.secondary_router_list.append(router_name)
|
self.secondary_router_list.append(router_name)
|
||||||
try:
|
try:
|
||||||
self.primary_router_list.remove(router_name)
|
self.primary_router_list.remove(router_name)
|
||||||
|
@ -211,15 +211,16 @@ class RouterInstance():
|
||||||
self.inactive_router_list.remove(router_name)
|
self.inactive_router_list.remove(router_name)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
|
if router_daemon_state != 'run' and router_name not in self.inactive_router_list:
|
||||||
# Try to set ourself primary if there is no primary in the cluster
|
self.inactive_router_list.append(router_name)
|
||||||
cluster_has_primary = False
|
try:
|
||||||
for router in self.t_router:
|
self.primary_router_list.remove(router_name)
|
||||||
if self.t_router[router].getnetworkstate() == 'primary':
|
except ValueError:
|
||||||
cluster_has_primary = True
|
pass
|
||||||
break
|
try:
|
||||||
if not cluster_has_primary:
|
self.secondary_router_list.remove(router_name)
|
||||||
self.set_primary()
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
# Display cluster information to the terminal
|
# Display cluster information to the terminal
|
||||||
ansiiprint.echo('{}Cluster status{}'.format(ansiiprint.purple(), ansiiprint.end()), '', 't')
|
ansiiprint.echo('{}Cluster status{}'.format(ansiiprint.purple(), ansiiprint.end()), '', 't')
|
||||||
|
|
Loading…
Reference in New Issue