#!/usr/bin/env python3

# NodeInstance.py - Class implementing a PVC node in pvcnoded
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2020 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

import time

from threading import Thread

import pvcnoded.zkhandler as zkhandler
import pvcnoded.common as common


class NodeInstance(object):
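    # A NodeInstance mirrors the Zookeeper state of a single PVC node: it watches the
    # node's keys for changes and carries out the resulting transitions (primary and
    # secondary coordinator hand-off, VM flush and unflush) on the local node.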
    # Initialization function
    def __init__(self, name, this_node, zk_conn, config, logger, d_node, d_network, d_domain, dns_aggregator, metadata_api):
        # Passed-in variables on creation
        self.name = name
        self.this_node = this_node
        self.zk_conn = zk_conn
        self.config = config
        self.logger = logger
        # Which node is primary
        self.primary_node = None
        # States
        self.daemon_mode = zkhandler.readdata(self.zk_conn, '/nodes/{}/daemonmode'.format(self.name))
        self.daemon_state = 'stop'
        self.router_state = 'client'
        self.domain_state = 'ready'
        # Object lists
        self.d_node = d_node
        self.d_network = d_network
        self.d_domain = d_domain
        self.dns_aggregator = dns_aggregator
        self.metadata_api = metadata_api
        # Printable lists
        self.active_node_list = []
        self.flushed_node_list = []
        self.inactive_node_list = []
        self.network_list = []
        self.domain_list = []
        # Node resources
        self.domains_count = 0
        self.memused = 0
        self.memfree = 0
        self.memalloc = 0
        self.vcpualloc = 0
        # Floating IP configurations
        if self.config['enable_networking']:
            self.upstream_dev = self.config['upstream_dev']
            self.upstream_floatingipaddr = self.config['upstream_floating_ip'].split('/')[0]
            self.upstream_ipaddr, self.upstream_cidrnetmask = self.config['upstream_dev_ip'].split('/')
            self.vni_dev = self.config['vni_dev']
            self.vni_floatingipaddr = self.config['vni_floating_ip'].split('/')[0]
            self.vni_ipaddr, self.vni_cidrnetmask = self.config['vni_dev_ip'].split('/')
            self.storage_dev = self.config['storage_dev']
            self.storage_floatingipaddr = self.config['storage_floating_ip'].split('/')[0]
            self.storage_ipaddr, self.storage_cidrnetmask = self.config['storage_dev_ip'].split('/')
        else:
            self.upstream_dev = None
            self.upstream_floatingipaddr = None
            self.upstream_ipaddr = None
            self.upstream_cidrnetmask = None
            self.vni_dev = None
            self.vni_floatingipaddr = None
            self.vni_ipaddr = None
            self.vni_cidrnetmask = None
            self.storage_dev = None
            self.storage_floatingipaddr = None
            self.storage_ipaddr = None
            self.storage_cidrnetmask = None
        # Threads
        self.flush_thread = None
        # Flags
        self.flush_stopper = False

        # Zookeeper handlers for changed states
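        # Each handler below is a kazoo DataWatch callback: it fires once when registered
        # and again every time the watched key changes. Returning False on a DELETED event
        # unregisters the watch, since this instance is about to be reaped in Daemon.py.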
        @self.zk_conn.DataWatch('/nodes/{}/daemonstate'.format(self.name))
        def watch_node_daemonstate(data, stat, event=''):
            if event and event.type == 'DELETED':
                # The key has been deleted after existing before; terminate this watcher
                # because this class instance is about to be reaped in Daemon.py
                return False

            try:
                data = data.decode('ascii')
            except AttributeError:
                data = 'stop'

            if data != self.daemon_state:
                self.daemon_state = data

        @self.zk_conn.DataWatch('/nodes/{}/routerstate'.format(self.name))
        def watch_node_routerstate(data, stat, event=''):
            if event and event.type == 'DELETED':
                # The key has been deleted after existing before; terminate this watcher
                # because this class instance is about to be reaped in Daemon.py
                return False

            try:
                data = data.decode('ascii')
            except AttributeError:
                data = 'client'

            if self.name == self.this_node and self.daemon_mode == 'coordinator':
                # We're a coordinator so we care about networking
                if data != self.router_state:
                    self.router_state = data
                    if self.config['enable_networking']:
                        if self.router_state == 'takeover':
                            self.logger.out('Setting node {} to primary state'.format(self.name), state='i')
                            transition_thread = Thread(target=self.become_primary, args=(), kwargs={})
                            transition_thread.start()
                        if self.router_state == 'relinquish':
                            # Skip becoming secondary unless already running
                            if self.daemon_state == 'run' or self.daemon_state == 'shutdown':
                                self.logger.out('Setting node {} to secondary state'.format(self.name), state='i')
                                transition_thread = Thread(target=self.become_secondary, args=(), kwargs={})
                                transition_thread.start()
                            else:
                                # We did nothing, so just become secondary state
                                zkhandler.writedata(self.zk_conn, {'/nodes/{}/routerstate'.format(self.name): 'secondary'})
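
        # 'takeover' and 'relinquish' are transient router states: become_primary() and
        # become_secondary() write the final 'primary' or 'secondary' value back to
        # Zookeeper once their synchronization phases complete.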
        @self.zk_conn.DataWatch('/nodes/{}/domainstate'.format(self.name))
        def watch_node_domainstate(data, stat, event=''):
            if event and event.type == 'DELETED':
                # The key has been deleted after existing before; terminate this watcher
                # because this class instance is about to be reaped in Daemon.py
                return False

            try:
                data = data.decode('ascii')
            except AttributeError:
                data = 'unknown'

            if data != self.domain_state:
                self.domain_state = data

                # toggle state management of this node
                if self.name == self.this_node:
                    # Stop any existing flush jobs
                    if self.flush_thread is not None:
                        self.logger.out('Waiting for previous migration to complete'.format(self.name), state='i')
                        self.flush_stopper = True
                        while self.flush_stopper:
                            time.sleep(0.1)

                    # Do flushing in a thread so it doesn't block the migrates out
                    if self.domain_state == 'flush':
                        self.flush_thread = Thread(target=self.flush, args=(), kwargs={})
                        self.flush_thread.start()
                    # Do unflushing in a thread so it doesn't block the migrates in
                    if self.domain_state == 'unflush':
                        self.flush_thread = Thread(target=self.unflush, args=(), kwargs={})
                        self.flush_thread.start()
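
        # The watchers below mirror per-node resource counters (memory, vCPU allocation,
        # running domains) from Zookeeper into this object as they change.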
        @self.zk_conn.DataWatch('/nodes/{}/memfree'.format(self.name))
        def watch_node_memfree(data, stat, event=''):
            if event and event.type == 'DELETED':
                # The key has been deleted after existing before; terminate this watcher
                # because this class instance is about to be reaped in Daemon.py
                return False

            try:
                data = data.decode('ascii')
            except AttributeError:
                data = 0

            if data != self.memfree:
                self.memfree = data

        @self.zk_conn.DataWatch('/nodes/{}/memused'.format(self.name))
        def watch_node_memused(data, stat, event=''):
            if event and event.type == 'DELETED':
                # The key has been deleted after existing before; terminate this watcher
                # because this class instance is about to be reaped in Daemon.py
                return False

            try:
                data = data.decode('ascii')
            except AttributeError:
                data = 0

            if data != self.memused:
                self.memused = data

        @self.zk_conn.DataWatch('/nodes/{}/memalloc'.format(self.name))
        def watch_node_memalloc(data, stat, event=''):
            if event and event.type == 'DELETED':
                # The key has been deleted after existing before; terminate this watcher
                # because this class instance is about to be reaped in Daemon.py
                return False

            try:
                data = data.decode('ascii')
            except AttributeError:
                data = 0

            if data != self.memalloc:
                self.memalloc = data

        @self.zk_conn.DataWatch('/nodes/{}/vcpualloc'.format(self.name))
        def watch_node_vcpualloc(data, stat, event=''):
            if event and event.type == 'DELETED':
                # The key has been deleted after existing before; terminate this watcher
                # because this class instance is about to be reaped in Daemon.py
                return False

            try:
                data = data.decode('ascii')
            except AttributeError:
                data = 0

            if data != self.vcpualloc:
                self.vcpualloc = data

        @self.zk_conn.DataWatch('/nodes/{}/runningdomains'.format(self.name))
        def watch_node_runningdomains(data, stat, event=''):
            if event and event.type == 'DELETED':
                # The key has been deleted after existing before; terminate this watcher
                # because this class instance is about to be reaped in Daemon.py
                return False

            try:
                data = data.decode('ascii').split()
            except AttributeError:
                data = []

            if data != self.domain_list:
                self.domain_list = data

        @self.zk_conn.DataWatch('/nodes/{}/domainscount'.format(self.name))
        def watch_node_domainscount(data, stat, event=''):
            if event and event.type == 'DELETED':
                # The key has been deleted after existing before; terminate this watcher
                # because this class instance is about to be reaped in Daemon.py
                return False

            try:
                data = data.decode('ascii')
            except AttributeError:
                data = 0

            if data != self.domains_count:
                self.domains_count = data

    # Update value functions
    def update_node_list(self, d_node):
        self.d_node = d_node

    def update_network_list(self, d_network):
        self.d_network = d_network
        network_list = []
        for network in self.d_network:
            network_list.append(d_network[network].vni)
        self.network_list = network_list

    def update_domain_list(self, d_domain):
        self.d_domain = d_domain

    ######
    # Phases of node transition
    #
    # Current Primary                    Candidate Secondary
    #   -> secondary                       -> primary
    #
    # def become_secondary()             def become_primary()
    #
    # A ----------------------------------------------------------------- SYNC (candidate)
    # B ----------------------------------------------------------------- SYNC (current)
    # 1. Stop DNS aggregator                                              ||
    # 2. Stop DHCP servers                                                ||
    #   2a) network 1                                                     ||
    #   2b) network 2                                                     ||
    #   etc.                                                              ||
    # 3. Stop client API                                                  ||
    # 4. Stop metadata API                                                ||
    #                                                                     --
    # C ----------------------------------------------------------------- SYNC (candidate)
    # 5. Remove upstream floating IP     1. Add upstream floating IP      ||
    #                                                                     --
    # D ----------------------------------------------------------------- SYNC (candidate)
    # 6. Remove cluster floating IP      2. Add cluster floating IP       ||
    #                                                                     --
    # E ----------------------------------------------------------------- SYNC (candidate)
    # 7. Remove metadata floating IP     3. Add metadata floating IP      ||
    #                                                                     --
    # F ----------------------------------------------------------------- SYNC (candidate)
    # 8. Remove gateway IPs              4. Add gateway IPs               ||
    #   8a) network 1                      4a) network 1                  ||
    #   8b) network 2                      4b) network 2                  ||
    #   etc.                               etc.                           ||
    #                                                                     --
    # G ----------------------------------------------------------------- SYNC (candidate)
    #                                    5. Transition Patroni primary    ||
    #                                    6. Start client API              ||
    #                                    7. Start metadata API            ||
    #                                    8. Start DHCP servers            ||
    #                                      8a) network 1                  ||
    #                                      8b) network 2                  ||
    #                                      etc.                           ||
    #                                    9. Start DNS aggregator          ||
    #                                                                     --
    ######
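    # Each phase boundary above is a Zookeeper lock hand-off on /locks/primary_node: the
    # writer for a phase holds the lock while it performs its steps, and the peer blocks
    # on a read lock for the same phase, keeping the two transitions in lock-step.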

    def become_primary(self):
        """
        Acquire primary coordinator status from a peer node
        """
        # Lock the primary node until transition is complete
        primary_lock = zkhandler.exclusivelock(self.zk_conn, '/primary_node')
        primary_lock.acquire()

        # Ensure our lock key is populated
        zkhandler.writedata(self.zk_conn, {'/locks/primary_node': ''})

        # Synchronize nodes A (I am writer)
        lock = zkhandler.writelock(self.zk_conn, '/locks/primary_node')
        self.logger.out('Acquiring write lock for synchronization phase A', state='i')
        lock.acquire()
        self.logger.out('Acquired write lock for synchronization phase A', state='o')
        time.sleep(1)  # Time for reader to acquire the lock
        self.logger.out('Releasing write lock for synchronization phase A', state='i')
        zkhandler.writedata(self.zk_conn, {'/locks/primary_node': ''})
        lock.release()
        self.logger.out('Released write lock for synchronization phase A', state='o')
        time.sleep(0.1)  # Time for new writer to acquire the lock

        # Synchronize nodes B (I am reader)
        lock = zkhandler.readlock(self.zk_conn, '/locks/primary_node')
        self.logger.out('Acquiring read lock for synchronization phase B', state='i')
        lock.acquire()
        self.logger.out('Acquired read lock for synchronization phase B', state='o')
        self.logger.out('Releasing read lock for synchronization phase B', state='i')
        lock.release()
        self.logger.out('Released read lock for synchronization phase B', state='o')

        # Synchronize nodes C (I am writer)
        lock = zkhandler.writelock(self.zk_conn, '/locks/primary_node')
        self.logger.out('Acquiring write lock for synchronization phase C', state='i')
        lock.acquire()
        self.logger.out('Acquired write lock for synchronization phase C', state='o')
        time.sleep(0.5)  # Time for reader to acquire the lock
        # 1. Add Upstream floating IP
        self.logger.out(
            'Creating floating upstream IP {}/{} on interface {}'.format(
                self.upstream_floatingipaddr,
                self.upstream_cidrnetmask,
                'brupstream'
            ),
            state='o'
        )
        common.createIPAddress(self.upstream_floatingipaddr, self.upstream_cidrnetmask, 'brupstream')
        self.logger.out('Releasing write lock for synchronization phase C', state='i')
        zkhandler.writedata(self.zk_conn, {'/locks/primary_node': ''})
        lock.release()
        self.logger.out('Released write lock for synchronization phase C', state='o')

        # Synchronize nodes D (I am writer)
        lock = zkhandler.writelock(self.zk_conn, '/locks/primary_node')
        self.logger.out('Acquiring write lock for synchronization phase D', state='i')
        lock.acquire()
        self.logger.out('Acquired write lock for synchronization phase D', state='o')
        time.sleep(0.2)  # Time for reader to acquire the lock
        # 2. Add Cluster & Storage floating IP
        self.logger.out(
            'Creating floating management IP {}/{} on interface {}'.format(
                self.vni_floatingipaddr,
                self.vni_cidrnetmask,
                'brcluster'
            ),
            state='o'
        )
        common.createIPAddress(self.vni_floatingipaddr, self.vni_cidrnetmask, 'brcluster')
        self.logger.out(
            'Creating floating storage IP {}/{} on interface {}'.format(
                self.storage_floatingipaddr,
                self.storage_cidrnetmask,
                'brstorage'
            ),
            state='o'
        )
        common.createIPAddress(self.storage_floatingipaddr, self.storage_cidrnetmask, 'brstorage')
        self.logger.out('Releasing write lock for synchronization phase D', state='i')
        zkhandler.writedata(self.zk_conn, {'/locks/primary_node': ''})
        lock.release()
        self.logger.out('Released write lock for synchronization phase D', state='o')

        # Synchronize nodes E (I am writer)
        lock = zkhandler.writelock(self.zk_conn, '/locks/primary_node')
        self.logger.out('Acquiring write lock for synchronization phase E', state='i')
        lock.acquire()
        self.logger.out('Acquired write lock for synchronization phase E', state='o')
        time.sleep(0.2)  # Time for reader to acquire the lock
        # 3. Add Metadata link-local IP
        self.logger.out(
            'Creating Metadata link-local IP {}/{} on interface {}'.format(
                '169.254.169.254',
                '32',
                'lo'
            ),
            state='o'
        )
        common.createIPAddress('169.254.169.254', '32', 'lo')
        self.logger.out('Releasing write lock for synchronization phase E', state='i')
        zkhandler.writedata(self.zk_conn, {'/locks/primary_node': ''})
        lock.release()
        self.logger.out('Released write lock for synchronization phase E', state='o')

        # Synchronize nodes F (I am writer)
        lock = zkhandler.writelock(self.zk_conn, '/locks/primary_node')
        self.logger.out('Acquiring write lock for synchronization phase F', state='i')
        lock.acquire()
        self.logger.out('Acquired write lock for synchronization phase F', state='o')
        time.sleep(0.2)  # Time for reader to acquire the lock
        # 4. Add gateway IPs
        for network in self.d_network:
            self.d_network[network].createGateways()
        self.logger.out('Releasing write lock for synchronization phase F', state='i')
        zkhandler.writedata(self.zk_conn, {'/locks/primary_node': ''})
        lock.release()
        self.logger.out('Released write lock for synchronization phase F', state='o')

        # Synchronize nodes G (I am writer)
        lock = zkhandler.writelock(self.zk_conn, '/locks/primary_node')
        self.logger.out('Acquiring write lock for synchronization phase G', state='i')
        lock.acquire()
        self.logger.out('Acquired write lock for synchronization phase G', state='o')
        time.sleep(0.2)  # Time for reader to acquire the lock
        # 5. Transition Patroni primary
        self.logger.out('Setting Patroni leader to this node', state='i')
        tick = 1
        patroni_failed = True
        # As long as we're in takeover, keep trying to set the Patroni leader to us
        while self.router_state == 'takeover':
            # Switch Patroni leader to the local instance
            retcode, stdout, stderr = common.run_os_command(
                """
                patronictl
                    -c /etc/patroni/config.yml
                    -d zookeeper://localhost:2181
                    switchover
                    --candidate {}
                    --force
                    pvc
                """.format(self.name)
            )

            # Combine the stdout and stderr and strip the output
            # Patronictl's output is pretty junky
            if stderr:
                stdout += stderr
            stdout = stdout.strip()

            # Handle our current Patroni leader being us
            if stdout and stdout.split('\n')[-1].split() == ["Error:", "Switchover", "target", "and", "source", "are", "the", "same."]:
                self.logger.out('Failed to switch Patroni leader to ourselves; this is fine\n{}'.format(stdout), state='w')
                patroni_failed = False
                break
            # Handle a failed switchover
            elif stdout and (stdout.split('\n')[-1].split()[:2] == ["Switchover", "failed,"] or stdout.strip().split('\n')[-1].split()[:1] == ["Error"]):
                if tick > 4:
                    self.logger.out('Failed to switch Patroni leader after 5 tries; aborting', state='e')
                    break
                else:
                    self.logger.out('Failed to switch Patroni leader; retrying [{}/5]\n{}\n'.format(tick, stdout), state='e')
                    tick += 1
                    time.sleep(5)
            # Otherwise, we succeeded
            else:
                self.logger.out('Successfully switched Patroni leader\n{}'.format(stdout), state='o')
                patroni_failed = False
                time.sleep(0.2)
                break
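
        # patroni_failed now records whether the switchover ever succeeded; it is checked
        # again in step 9 before starting the DNS aggregator.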
        # 6. Start client API (and provisioner worker)
        if self.config['enable_api']:
            self.logger.out('Starting PVC API client service', state='i')
            common.run_os_command("systemctl start pvcapid.service")
            self.logger.out('Starting PVC Provisioner Worker service', state='i')
            common.run_os_command("systemctl start pvcapid-worker.service")
        # 7. Start metadata API; just continue if we fail
        self.metadata_api.start()
        # 8. Start DHCP servers
        for network in self.d_network:
            self.d_network[network].startDHCPServer()
        # 9. Start DNS aggregator; just continue if we fail
        if not patroni_failed:
            self.dns_aggregator.start_aggregator()
        else:
            self.logger.out('Not starting DNS aggregator due to Patroni failures', state='e')
        self.logger.out('Releasing write lock for synchronization phase G', state='i')
        zkhandler.writedata(self.zk_conn, {'/locks/primary_node': ''})
        lock.release()
        self.logger.out('Released write lock for synchronization phase G', state='o')

        # Wait 2 seconds for everything to stabilize before we declare all-done
        time.sleep(2)
        primary_lock.release()
        zkhandler.writedata(self.zk_conn, {'/nodes/{}/routerstate'.format(self.name): 'primary'})
        self.logger.out('Node {} transitioned to primary state'.format(self.name), state='o')

    def become_secondary(self):
        """
        Relinquish primary coordinator status to a peer node
        """
        time.sleep(0.2)  # Initial delay for the first writer to grab the lock

        # Synchronize nodes A (I am reader)
        lock = zkhandler.readlock(self.zk_conn, '/locks/primary_node')
        self.logger.out('Acquiring read lock for synchronization phase A', state='i')
        lock.acquire()
        self.logger.out('Acquired read lock for synchronization phase A', state='o')
        self.logger.out('Releasing read lock for synchronization phase A', state='i')
        lock.release()
        self.logger.out('Released read lock for synchronization phase A', state='o')

        # Synchronize nodes B (I am writer)
        lock = zkhandler.writelock(self.zk_conn, '/locks/primary_node')
        self.logger.out('Acquiring write lock for synchronization phase B', state='i')
        lock.acquire()
        self.logger.out('Acquired write lock for synchronization phase B', state='o')
        time.sleep(0.2)  # Time for reader to acquire the lock
        # 1. Stop DNS aggregator
        self.dns_aggregator.stop_aggregator()
        # 2. Stop DHCP servers
        for network in self.d_network:
            self.d_network[network].stopDHCPServer()
        self.logger.out('Releasing write lock for synchronization phase B', state='i')
        zkhandler.writedata(self.zk_conn, {'/locks/primary_node': ''})
        lock.release()
        self.logger.out('Released write lock for synchronization phase B', state='o')
        # 3. Stop client API
        if self.config['enable_api']:
            self.logger.out('Stopping PVC API client service', state='i')
            common.run_os_command("systemctl stop pvcapid.service")
        # 4. Stop metadata API
        self.metadata_api.stop()
        time.sleep(0.1)  # Time for new writer to acquire the lock

        # Synchronize nodes C (I am reader)
        lock = zkhandler.readlock(self.zk_conn, '/locks/primary_node')
        self.logger.out('Acquiring read lock for synchronization phase C', state='i')
        lock.acquire()
        self.logger.out('Acquired read lock for synchronization phase C', state='o')
        # 5. Remove Upstream floating IP
        self.logger.out(
            'Removing floating upstream IP {}/{} from interface {}'.format(
                self.upstream_floatingipaddr,
                self.upstream_cidrnetmask,
                'brupstream'
            ),
            state='o'
        )
        common.removeIPAddress(self.upstream_floatingipaddr, self.upstream_cidrnetmask, 'brupstream')
        self.logger.out('Releasing read lock for synchronization phase C', state='i')
        lock.release()
        self.logger.out('Released read lock for synchronization phase C', state='o')

        # Synchronize nodes D (I am reader)
        lock = zkhandler.readlock(self.zk_conn, '/locks/primary_node')
        self.logger.out('Acquiring read lock for synchronization phase D', state='i')
        lock.acquire()
        self.logger.out('Acquired read lock for synchronization phase D', state='o')
        # 6. Remove Cluster & Storage floating IP
        self.logger.out(
            'Removing floating management IP {}/{} from interface {}'.format(
                self.vni_floatingipaddr,
                self.vni_cidrnetmask,
                'brcluster'
            ),
            state='o'
        )
        common.removeIPAddress(self.vni_floatingipaddr, self.vni_cidrnetmask, 'brcluster')
        self.logger.out(
            'Removing floating storage IP {}/{} from interface {}'.format(
                self.storage_floatingipaddr,
                self.storage_cidrnetmask,
                'brstorage'
            ),
            state='o'
        )
        common.removeIPAddress(self.storage_floatingipaddr, self.storage_cidrnetmask, 'brstorage')
        self.logger.out('Releasing read lock for synchronization phase D', state='i')
        lock.release()
        self.logger.out('Released read lock for synchronization phase D', state='o')

        # Synchronize nodes E (I am reader)
        lock = zkhandler.readlock(self.zk_conn, '/locks/primary_node')
        self.logger.out('Acquiring read lock for synchronization phase E', state='i')
        lock.acquire()
        self.logger.out('Acquired read lock for synchronization phase E', state='o')
        # 7. Remove Metadata link-local IP
        self.logger.out(
            'Removing Metadata link-local IP {}/{} from interface {}'.format(
                '169.254.169.254',
                '32',
                'lo'
            ),
            state='o'
        )
        common.removeIPAddress('169.254.169.254', '32', 'lo')
        self.logger.out('Releasing read lock for synchronization phase E', state='i')
        lock.release()
        self.logger.out('Released read lock for synchronization phase E', state='o')

        # Synchronize nodes F (I am reader)
        lock = zkhandler.readlock(self.zk_conn, '/locks/primary_node')
        self.logger.out('Acquiring read lock for synchronization phase F', state='i')
        lock.acquire()
        self.logger.out('Acquired read lock for synchronization phase F', state='o')
        # 8. Remove gateway IPs
        for network in self.d_network:
            self.d_network[network].removeGateways()
        self.logger.out('Releasing read lock for synchronization phase F', state='i')
        lock.release()
        self.logger.out('Released read lock for synchronization phase F', state='o')

        # Synchronize nodes G (I am reader)
        lock = zkhandler.readlock(self.zk_conn, '/locks/primary_node')
        self.logger.out('Acquiring read lock for synchronization phase G', state='i')
        try:
            lock.acquire(timeout=60)  # Don't wait forever and completely block us
            self.logger.out('Acquired read lock for synchronization phase G', state='o')
        except Exception:
            pass
        self.logger.out('Releasing read lock for synchronization phase G', state='i')
        lock.release()
        self.logger.out('Released read lock for synchronization phase G', state='o')

        # Wait 2 seconds for everything to stabilize before we declare all-done
        time.sleep(2)
        zkhandler.writedata(self.zk_conn, {'/nodes/{}/routerstate'.format(self.name): 'secondary'})
        self.logger.out('Node {} transitioned to secondary state'.format(self.name), state='o')

    # Flush all VMs on the host
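    # flush() and unflush() run in a worker thread started by the domainstate watcher;
    # setting self.flush_stopper asks a running job to abort early.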
    def flush(self):
        # Begin flush
        self.logger.out('Flushing node "{}" of running VMs'.format(self.name), state='i')
        self.logger.out('VM list: {}'.format(', '.join(self.domain_list)), state='i')
        fixed_domain_list = self.domain_list.copy()
        for dom_uuid in fixed_domain_list:
            # Allow us to cancel the operation
            if self.flush_stopper:
                self.logger.out('Aborting node flush'.format(self.name), state='i')
                self.flush_thread = None
                self.flush_stopper = False
                return

            self.logger.out('Selecting target to migrate VM "{}"'.format(dom_uuid), state='i')

            # Don't replace the previous node if the VM is already migrated
            if zkhandler.readdata(self.zk_conn, '/domains/{}/lastnode'.format(dom_uuid)):
                current_node = zkhandler.readdata(self.zk_conn, '/domains/{}/lastnode'.format(dom_uuid))
            else:
                current_node = zkhandler.readdata(self.zk_conn, '/domains/{}/node'.format(dom_uuid))

            target_node = common.findTargetNode(self.zk_conn, self.config, self.logger, dom_uuid)
            if target_node == current_node:
                target_node = None

            if target_node is None:
                self.logger.out('Failed to find migration target for VM "{}"; shutting down and setting autostart flag'.format(dom_uuid), state='e')
                zkhandler.writedata(self.zk_conn, {'/domains/{}/state'.format(dom_uuid): 'shutdown'})
                zkhandler.writedata(self.zk_conn, {'/domains/{}/node_autostart'.format(dom_uuid): 'True'})
            else:
                self.logger.out('Migrating VM "{}" to node "{}"'.format(dom_uuid, target_node), state='i')
                zkhandler.writedata(self.zk_conn, {
                    '/domains/{}/state'.format(dom_uuid): 'migrate',
                    '/domains/{}/node'.format(dom_uuid): target_node,
                    '/domains/{}/lastnode'.format(dom_uuid): current_node
                })

            # Wait for the VM to migrate so the next VM's free RAM count is accurate (they migrate in serial anyways)
            ticks = 0
            while zkhandler.readdata(self.zk_conn, '/domains/{}/state'.format(dom_uuid)) in ['migrate', 'unmigrate', 'shutdown']:
                ticks += 1
                if ticks > 600:
                    # Abort if we've waited 120 seconds; the VM is likely stuck, so just continue
                    break
                time.sleep(0.2)

        zkhandler.writedata(self.zk_conn, {'/nodes/{}/runningdomains'.format(self.name): ''})
        zkhandler.writedata(self.zk_conn, {'/nodes/{}/domainstate'.format(self.name): 'flushed'})
        self.flush_thread = None
        self.flush_stopper = False
        return

    def unflush(self):
        self.logger.out('Restoring node {} to active service.'.format(self.name), state='i')
        fixed_domain_list = self.d_domain.copy()
        for dom_uuid in fixed_domain_list:
            # Allow us to cancel the operation
            if self.flush_stopper:
                self.logger.out('Aborting node unflush'.format(self.name), state='i')
                self.flush_thread = None
                self.flush_stopper = False
                return

            # Handle autostarts
            autostart = zkhandler.readdata(self.zk_conn, '/domains/{}/node_autostart'.format(dom_uuid))
            node = zkhandler.readdata(self.zk_conn, '/domains/{}/node'.format(dom_uuid))
            if autostart == 'True' and node == self.name:
                self.logger.out('Starting autostart VM "{}"'.format(dom_uuid), state='i')
                zkhandler.writedata(self.zk_conn, {
                    '/domains/{}/state'.format(dom_uuid): 'start',
                    '/domains/{}/node'.format(dom_uuid): self.name,
                    '/domains/{}/lastnode'.format(dom_uuid): '',
                    '/domains/{}/node_autostart'.format(dom_uuid): 'False'
                })
                continue

            try:
                last_node = zkhandler.readdata(self.zk_conn, '/domains/{}/lastnode'.format(dom_uuid))
            except Exception:
                continue

            if last_node != self.name:
                continue

            self.logger.out('Setting unmigration for VM "{}"'.format(dom_uuid), state='i')
            zkhandler.writedata(self.zk_conn, {
                '/domains/{}/state'.format(dom_uuid): 'migrate',
                '/domains/{}/node'.format(dom_uuid): self.name,
                '/domains/{}/lastnode'.format(dom_uuid): ''
            })

            # Wait for the VM to migrate back
            while zkhandler.readdata(self.zk_conn, '/domains/{}/state'.format(dom_uuid)) in ['migrate', 'unmigrate', 'shutdown']:
                time.sleep(0.1)

        zkhandler.writedata(self.zk_conn, {'/nodes/{}/domainstate'.format(self.name): 'ready'})
        self.flush_thread = None
        self.flush_stopper = False
        return