Implement cluster maintenance mode
Implements a "maintenance mode" for PVC clusters. For now, the only thing this mode does is disable node fencing while the state is true. This allows the administrator to tell PVC that network connectivity, etc. might be interrupted, so that PVC does not fence nodes during that time. Closes #70
This commit is contained in:
parent
4cda5ebb52
commit
b6474198a4
|
@ -47,6 +47,7 @@ def initialize_cluster():
|
||||||
transaction = zk_conn.transaction()
|
transaction = zk_conn.transaction()
|
||||||
transaction.create('/primary_node', 'none'.encode('ascii'))
|
transaction.create('/primary_node', 'none'.encode('ascii'))
|
||||||
transaction.create('/upstream_ip', 'none'.encode('ascii'))
|
transaction.create('/upstream_ip', 'none'.encode('ascii'))
|
||||||
|
transaction.create('/maintenance', 'False'.encode('ascii'))
|
||||||
transaction.create('/nodes', ''.encode('ascii'))
|
transaction.create('/nodes', ''.encode('ascii'))
|
||||||
transaction.create('/domains', ''.encode('ascii'))
|
transaction.create('/domains', ''.encode('ascii'))
|
||||||
transaction.create('/networks', ''.encode('ascii'))
|
transaction.create('/networks', ''.encode('ascii'))
|
||||||
|
@ -69,7 +70,7 @@ def initialize_cluster():
|
||||||
return True
|
return True
|
||||||
|
|
||||||
#
|
#
|
||||||
# Status function
|
# Cluster functions
|
||||||
#
|
#
|
||||||
def cluster_status():
|
def cluster_status():
|
||||||
"""
|
"""
|
||||||
|
@ -81,6 +82,24 @@ def cluster_status():
|
||||||
|
|
||||||
return retdata, 200
|
return retdata, 200
|
||||||
|
|
||||||
|
def cluster_maintenance(maint_state='false'):
    """
    Set the cluster in or out of maintenance state.

    maint_state is handed unchanged to pvc_cluster.set_maintenance(), which
    enables maintenance mode only for the exact string 'true'.

    Returns a tuple of (response body, HTTP status code). The body is a dict
    with a single 'message' key; the code is 200 when set_maintenance()
    reports success and 400 otherwise.
    """
    zk_conn = pvc_common.startZKConnection(config['coordinators'])
    try:
        retflag, retmsg = pvc_cluster.set_maintenance(zk_conn, maint_state)
    finally:
        # Always release the Zookeeper connection, even if the call raises
        pvc_common.stopZKConnection(zk_conn)

    retdata = {
        'message': retmsg
    }
    if retflag:
        retcode = 200
    else:
        retcode = 400

    return retdata, retcode
|
||||||
|
|
||||||
#
|
#
|
||||||
# Node functions
|
# Node functions
|
||||||
#
|
#
|
||||||
|
|
|
@ -390,6 +390,37 @@ class API_Status(Resource):
|
||||||
description: Bad request
|
description: Bad request
|
||||||
"""
|
"""
|
||||||
return api_helper.cluster_status()
|
return api_helper.cluster_status()
|
||||||
|
|
||||||
|
# POST handler of the API_Status resource: switches cluster maintenance mode.
# The request argument 'state' is restricted to the strings 'true' or 'false'.
@RequestParser([
    { 'name': 'state', 'choices': ('true', 'false'), 'required': True, 'helpmsg': "A valid state must be specified" }
])
@Authenticator
def post(self, reqargs):
    """
    Set the cluster maintenance mode
    ---
    tags:
      - node
    parameters:
      - in: query
        name: state
        type: boolean
        required: true
        description: The cluster maintenance state
    responses:
      200:
        description: OK
        schema:
          type: object
          id: Message
      400:
        description: Bad request
        schema:
          type: object
          id: Message
    """
    # 'state' is declared required above, so the 'false' default is presumably
    # only a defensive fallback; the value is forwarded as a string, not a bool.
    return api_helper.cluster_maintenance(reqargs.get('state', 'false'))
|
||||||
|
|
||||||
api.add_resource(API_Status, '/status')
|
api.add_resource(API_Status, '/status')
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -42,6 +42,26 @@ def initialize(config):
|
||||||
|
|
||||||
return retstatus, response.json()['message']
|
return retstatus, response.json()['message']
|
||||||
|
|
||||||
|
def maintenance_mode(config, state):
    """
    Switch PVC cluster maintenance mode on or off through the API

    API endpoint: POST /api/v1/status
    API arguments: {state}={state}
    API schema: {json_data_object}

    Returns a (success, message) tuple: success is True only for an HTTP 200
    response, and message is the 'message' field of the JSON response body.
    """
    response = call_api(config, 'post', '/status', params={'state': state})

    # Only a 200 counts as success; the message is returned either way
    succeeded = response.status_code == 200
    return succeeded, response.json()['message']
|
||||||
|
|
||||||
def get_info(config):
|
def get_info(config):
|
||||||
"""
|
"""
|
||||||
Get status of the PVC cluster
|
Get status of the PVC cluster
|
||||||
|
@ -67,6 +87,8 @@ def format_info(cluster_information, oformat):
|
||||||
# Plain formatting, i.e. human-readable
|
# Plain formatting, i.e. human-readable
|
||||||
if cluster_information['health'] == 'Optimal':
|
if cluster_information['health'] == 'Optimal':
|
||||||
health_colour = ansiprint.green()
|
health_colour = ansiprint.green()
|
||||||
|
elif cluster_information['health'] == 'Maintenance':
|
||||||
|
health_colour = ansiprint.blue()
|
||||||
else:
|
else:
|
||||||
health_colour = ansiprint.yellow()
|
health_colour = ansiprint.yellow()
|
||||||
|
|
||||||
|
|
|
@ -3054,15 +3054,40 @@ def provisioner_status(job):
|
||||||
cleanup(retcode, retdata)
|
cleanup(retcode, retdata)
|
||||||
|
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# pvc maintenance
|
||||||
|
###############################################################################
|
||||||
|
@click.group(name='maintenance', short_help='Manage PVC cluster maintenance state.', context_settings=CONTEXT_SETTINGS)
def cli_maintenance():
    """
    Manage the maintenance mode of the PVC cluster.
    """
    # Nothing to do when the configuration is usable; subcommands proceed normally
    if not config.get('badcfg', None):
        return

    # Without a usable configuration no subcommand of this group can run
    click.echo('No cluster specified and no local pvc-api.yaml configuration found. Use "pvc cluster" to add a cluster API to connect to.')
    exit(1)
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# pvc maintenance on
|
||||||
|
###############################################################################
|
||||||
|
@click.command(name='on', short_help='Enable cluster maintenance mode.')
def maintenance_on():
    """
    Enable maintenance mode on the PVC cluster.
    """
    # Request the 'true' state from the API and hand the outcome to cleanup()
    success, message = pvc_cluster.maintenance_mode(config, 'true')
    cleanup(success, message)
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# pvc maintenance off
|
||||||
|
###############################################################################
|
||||||
|
@click.command(name='off', short_help='Disable cluster maintenance mode.')
def maintenance_off():
    """
    Disable maintenance mode on the PVC cluster.
    """
    # Request the 'false' state from the API and hand the outcome to cleanup()
    success, message = pvc_cluster.maintenance_mode(config, 'false')
    cleanup(success, message)
|
||||||
|
|
||||||
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
|
@ -3291,12 +3316,16 @@ cli_provisioner.add_command(provisioner_profile)
|
||||||
cli_provisioner.add_command(provisioner_create)
|
cli_provisioner.add_command(provisioner_create)
|
||||||
cli_provisioner.add_command(provisioner_status)
|
cli_provisioner.add_command(provisioner_status)
|
||||||
|
|
||||||
|
cli_maintenance.add_command(maintenance_on)
|
||||||
|
cli_maintenance.add_command(maintenance_off)
|
||||||
|
|
||||||
cli.add_command(cli_cluster)
|
cli.add_command(cli_cluster)
|
||||||
cli.add_command(cli_node)
|
cli.add_command(cli_node)
|
||||||
cli.add_command(cli_vm)
|
cli.add_command(cli_vm)
|
||||||
cli.add_command(cli_network)
|
cli.add_command(cli_network)
|
||||||
cli.add_command(cli_storage)
|
cli.add_command(cli_storage)
|
||||||
cli.add_command(cli_provisioner)
|
cli.add_command(cli_provisioner)
|
||||||
|
cli.add_command(cli_maintenance)
|
||||||
cli.add_command(status_cluster)
|
cli.add_command(status_cluster)
|
||||||
cli.add_command(init_cluster)
|
cli.add_command(init_cluster)
|
||||||
|
|
||||||
|
|
|
@ -20,9 +20,10 @@
|
||||||
#
|
#
|
||||||
###############################################################################
|
###############################################################################
|
||||||
|
|
||||||
import click
|
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
from distutils.util import strtobool
|
||||||
|
|
||||||
import client_lib.ansiprint as ansiprint
|
import client_lib.ansiprint as ansiprint
|
||||||
import client_lib.zkhandler as zkhandler
|
import client_lib.zkhandler as zkhandler
|
||||||
import client_lib.common as common
|
import client_lib.common as common
|
||||||
|
@ -31,7 +32,24 @@ import client_lib.node as pvc_node
|
||||||
import client_lib.network as pvc_network
|
import client_lib.network as pvc_network
|
||||||
import client_lib.ceph as pvc_ceph
|
import client_lib.ceph as pvc_ceph
|
||||||
|
|
||||||
|
def set_maintenance(zk_conn, maint_state):
    """
    Write the cluster maintenance state to the '/maintenance' Zookeeper key.

    Only the exact string 'true' enables maintenance mode; any other value
    of maint_state writes 'false' (normal mode).

    Returns a (success, message) tuple. success is False, with a failure
    message, when the write raises an exception.
    """
    if maint_state == 'true':
        new_state = 'true'
        message = 'Successfully set cluster in maintenance mode'
    else:
        new_state = 'false'
        message = 'Successfully set cluster in normal mode'

    try:
        zkhandler.writedata(zk_conn, {'/maintenance': new_state})
    except Exception:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt and SystemExit are not silently swallowed
        return False, 'Failed to set cluster maintenance state'

    return True, message
|
||||||
|
|
||||||
def getClusterInformation(zk_conn):
|
def getClusterInformation(zk_conn):
|
||||||
|
# Get cluster maintenance state
|
||||||
|
try:
|
||||||
|
maint_state = zkhandler.readdata(zk_conn, '/maintenance')
|
||||||
|
except:
|
||||||
|
maint_state = 'false'
|
||||||
|
|
||||||
# Get node information object list
|
# Get node information object list
|
||||||
retcode, node_list = pvc_node.get_list(zk_conn, None)
|
retcode, node_list = pvc_node.get_list(zk_conn, None)
|
||||||
|
|
||||||
|
@ -102,7 +120,9 @@ def getClusterInformation(zk_conn):
|
||||||
ceph_osd_report_status[index] = up_texts[ceph_osd_up] + ',' + in_texts[ceph_osd_in]
|
ceph_osd_report_status[index] = up_texts[ceph_osd_up] + ',' + in_texts[ceph_osd_in]
|
||||||
|
|
||||||
# Find out the overall cluster health; if any element of a healthy_status is false, it's unhealthy
|
# Find out the overall cluster health; if any element of a healthy_status is false, it's unhealthy
|
||||||
if False in node_healthy_status or False in vm_healthy_status or False in ceph_osd_healthy_status:
|
if maint_state == 'true':
|
||||||
|
cluster_health = 'Maintenance'
|
||||||
|
elif False in node_healthy_status or False in vm_healthy_status or False in ceph_osd_healthy_status:
|
||||||
cluster_health = 'Degraded'
|
cluster_health = 'Degraded'
|
||||||
else:
|
else:
|
||||||
cluster_health = 'Optimal'
|
cluster_health = 'Optimal'
|
||||||
|
@ -173,75 +193,3 @@ def get_info(zk_conn):
|
||||||
return True, cluster_information
|
return True, cluster_information
|
||||||
else:
|
else:
|
||||||
return False, 'ERROR: Failed to obtain cluster information!'
|
return False, 'ERROR: Failed to obtain cluster information!'
|
||||||
|
|
||||||
def format_info(cluster_information, oformat):
    """
    Print the cluster status in the requested output format.

    An oformat of 'json' prints compact JSON and 'json-pretty' prints JSON
    indented by 4 spaces, both via print(). Any other value emits the
    human-readable, ANSI-coloured report through click.echo(), followed by
    an empty line. Returns None in every case.
    """
    # Machine-readable output: dump the structure as-is and stop
    if oformat == 'json' or oformat == 'json-pretty':
        indent = 4 if oformat == 'json-pretty' else None
        print(json.dumps(cluster_information, indent=indent))
        return

    def other_states(counts, primary_state, pick_colour):
        # Render "count/total state" for every state except the total and the primary one
        return ''.join(
            ' {}/{} {}{}{}'.format(count, counts['total'], pick_colour(state), state, ansiprint.end())
            for state, count in counts.items()
            if state not in ('total', primary_state)
        )

    # Plain formatting, i.e. human-readable
    health_colour = ansiprint.green() if cluster_information['health'] == 'Optimal' else ansiprint.yellow()

    lines = [
        '{}PVC cluster status:{}'.format(ansiprint.bold(), ansiprint.end()),
        '',
        '{}Cluster health:{} {}{}{}'.format(ansiprint.purple(), ansiprint.end(), health_colour, cluster_information['health'], ansiprint.end()),
        '{}Primary node:{} {}'.format(ansiprint.purple(), ansiprint.end(), cluster_information['primary_node']),
        '{}Cluster upstream IP:{} {}'.format(ansiprint.purple(), ansiprint.end(), cluster_information['upstream_ip']),
        '',
        '{}Total nodes:{} {}'.format(ansiprint.purple(), ansiprint.end(), cluster_information['nodes']['total']),
        '{}Total VMs:{} {}'.format(ansiprint.purple(), ansiprint.end(), cluster_information['vms']['total']),
        '{}Total networks:{} {}'.format(ansiprint.purple(), ansiprint.end(), cluster_information['networks']),
        '{}Total OSDs:{} {}'.format(ansiprint.purple(), ansiprint.end(), cluster_information['osds']['total']),
        '{}Total pools:{} {}'.format(ansiprint.purple(), ansiprint.end(), cluster_information['pools']),
        '{}Total volumes:{} {}'.format(ansiprint.purple(), ansiprint.end(), cluster_information['volumes']),
        '{}Total snapshots:{} {}'.format(ansiprint.purple(), ansiprint.end(), cluster_information['snapshots']),
    ]

    # Node summary: the 'run,ready' count first, then every other state in yellow
    node_counts = cluster_information['nodes']
    nodes_string = '{}Nodes:{} {}/{} {}ready,run{}'.format(ansiprint.purple(), ansiprint.end(), node_counts['run,ready'], node_counts['total'], ansiprint.green(), ansiprint.end())
    nodes_string += other_states(node_counts, 'run,ready', lambda state: ansiprint.yellow())
    lines.extend(['', nodes_string])

    # VM summary: the 'start' count first; 'disable' is shown in blue, other states in yellow
    vm_counts = cluster_information['vms']
    vms_string = '{}VMs:{} {}/{} {}start{}'.format(ansiprint.purple(), ansiprint.end(), vm_counts['start'], vm_counts['total'], ansiprint.green(), ansiprint.end())
    vms_string += other_states(vm_counts, 'start', lambda state: ansiprint.blue() if state == 'disable' else ansiprint.yellow())
    lines.extend(['', vms_string])

    # OSD summary is only shown when the cluster has at least one OSD
    osd_counts = cluster_information['osds']
    if osd_counts['total'] > 0:
        osds_string = '{}Ceph OSDs:{} {}/{} {}up,in{}'.format(ansiprint.purple(), ansiprint.end(), osd_counts['up,in'], osd_counts['total'], ansiprint.green(), ansiprint.end())
        osds_string += other_states(osd_counts, 'up,in', lambda state: ansiprint.yellow())
        lines.extend(['', osds_string])

    click.echo('\n'.join(lines))
    click.echo('')
|
|
||||||
|
|
||||||
|
|
|
@ -8,7 +8,7 @@ X-Python3-Version: >= 3.2
|
||||||
|
|
||||||
Package: pvc-daemon
|
Package: pvc-daemon
|
||||||
Architecture: all
|
Architecture: all
|
||||||
Depends: systemd, pvc-client-common, python3-kazoo, python3-psutil, python3-apscheduler, python3-libvirt, python3-psycopg2, python3-dnspython, python3-yaml, ipmitool, libvirt-daemon-system, arping, vlan, bridge-utils, dnsmasq, nftables, pdns-server, pdns-backend-pgsql
|
Depends: systemd, pvc-client-common, python3-kazoo, python3-psutil, python3-apscheduler, python3-libvirt, python3-psycopg2, python3-dnspython, python3-yaml, python3-distutils, ipmitool, libvirt-daemon-system, arping, vlan, bridge-utils, dnsmasq, nftables, pdns-server, pdns-backend-pgsql
|
||||||
Suggests: pvc-client-api, pvc-client-cli
|
Suggests: pvc-client-api, pvc-client-cli
|
||||||
Description: Parallel Virtual Cluster virtualization daemon (Python 3)
|
Description: Parallel Virtual Cluster virtualization daemon (Python 3)
|
||||||
A KVM/Zookeeper/Ceph-based VM and private cloud manager
|
A KVM/Zookeeper/Ceph-based VM and private cloud manager
|
||||||
|
|
|
@ -3653,6 +3653,36 @@
|
||||||
"tags": [
|
"tags": [
|
||||||
"root"
|
"root"
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
"post": {
|
||||||
|
"description": "",
|
||||||
|
"parameters": [
|
||||||
|
{
|
||||||
|
"description": "The cluster maintenance state",
|
||||||
|
"in": "query",
|
||||||
|
"name": "state",
|
||||||
|
"required": true,
|
||||||
|
"type": "boolean"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"responses": {
|
||||||
|
"200": {
|
||||||
|
"description": "OK",
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/definitions/Message"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"400": {
|
||||||
|
"description": "Bad request",
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/definitions/Message"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"summary": "Set the cluster maintenance mode",
|
||||||
|
"tags": [
|
||||||
|
"node"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"/api/v1/storage/ceph/option": {
|
"/api/v1/storage/ceph/option": {
|
||||||
|
|
|
@ -42,6 +42,8 @@ import json
|
||||||
import ipaddress
|
import ipaddress
|
||||||
import apscheduler.schedulers.background
|
import apscheduler.schedulers.background
|
||||||
|
|
||||||
|
from distutils.util import strtobool
|
||||||
|
|
||||||
import pvcd.log as log
|
import pvcd.log as log
|
||||||
import pvcd.zkhandler as zkhandler
|
import pvcd.zkhandler as zkhandler
|
||||||
import pvcd.fencing as fencing
|
import pvcd.fencing as fencing
|
||||||
|
@ -112,6 +114,9 @@ try:
|
||||||
except IndexError:
|
except IndexError:
|
||||||
mynodeid = 1
|
mynodeid = 1
|
||||||
|
|
||||||
|
# Maintenance mode off by default
|
||||||
|
maintenance = False
|
||||||
|
|
||||||
# Gather useful data about our host
|
# Gather useful data about our host
|
||||||
# Static data format: 'cpu_count', 'arch', 'os', 'kernel'
|
# Static data format: 'cpu_count', 'arch', 'os', 'kernel'
|
||||||
staticdata = []
|
staticdata = []
|
||||||
|
@ -771,6 +776,15 @@ def update_nodes(new_node_list):
|
||||||
# Alias for our local node (passed to network and domain objects)
|
# Alias for our local node (passed to network and domain objects)
|
||||||
this_node = d_node[myhostname]
|
this_node = d_node[myhostname]
|
||||||
|
|
||||||
|
# Maintenance mode
|
||||||
|
@zk_conn.DataWatch('/maintenance')
def set_maintenance(_maintenance, stat, event=''):
    """
    Keep the module-level 'maintenance' flag in sync with the '/maintenance' key.

    Registered as a watch on '/maintenance' through zk_conn.DataWatch.
    _maintenance is the raw value of the key; it is decoded as ASCII and
    parsed with strtobool, so both 'False' and 'true'/'false' spellings are
    understood. stat and event are accepted but not used here.

    Any failure to decode or parse the value falls back to maintenance off.
    """
    global maintenance
    try:
        maintenance = bool(strtobool(_maintenance.decode('ascii')))
    except Exception:
        # Covers a missing value (presumably None when the key does not exist
        # -- TODO confirm) and unparsable content. Exception is caught rather
        # than using a bare except so KeyboardInterrupt/SystemExit still propagate.
        maintenance = False
|
||||||
|
|
||||||
# Primary node
|
# Primary node
|
||||||
@zk_conn.DataWatch('/primary_node')
|
@zk_conn.DataWatch('/primary_node')
|
||||||
def update_primary(new_primary, stat, event=''):
|
def update_primary(new_primary, stat, event=''):
|
||||||
|
@ -1271,6 +1285,7 @@ def update_zookeeper():
|
||||||
lv_conn.close()
|
lv_conn.close()
|
||||||
|
|
||||||
# Look for dead nodes and fence them
|
# Look for dead nodes and fence them
|
||||||
|
if not maintenance:
|
||||||
if debug:
|
if debug:
|
||||||
print("Look for dead nodes and fence them")
|
print("Look for dead nodes and fence them")
|
||||||
if config['daemon_mode'] == 'coordinator':
|
if config['daemon_mode'] == 'coordinator':
|
||||||
|
@ -1321,6 +1336,7 @@ def update_zookeeper():
|
||||||
)
|
)
|
||||||
if config['log_keepalive_cluster_details']:
|
if config['log_keepalive_cluster_details']:
|
||||||
logger.out(
|
logger.out(
|
||||||
|
'{bold}Maintenance:{nofmt} {maint} '
|
||||||
'{bold}Active VMs:{nofmt} {domcount} '
|
'{bold}Active VMs:{nofmt} {domcount} '
|
||||||
'{bold}Networks:{nofmt} {netcount} '
|
'{bold}Networks:{nofmt} {netcount} '
|
||||||
'{bold}Load:{nofmt} {load} '
|
'{bold}Load:{nofmt} {load} '
|
||||||
|
@ -1329,6 +1345,7 @@ def update_zookeeper():
|
||||||
'{bold}Free:{nofmt} {freemem}'.format(
|
'{bold}Free:{nofmt} {freemem}'.format(
|
||||||
bold=fmt_bold,
|
bold=fmt_bold,
|
||||||
nofmt=fmt_end,
|
nofmt=fmt_end,
|
||||||
|
maint=maintenance,
|
||||||
domcount=this_node.domains_count,
|
domcount=this_node.domains_count,
|
||||||
netcount=len(network_list),
|
netcount=len(network_list),
|
||||||
load=this_node.cpuload,
|
load=this_node.cpuload,
|
||||||
|
|
Loading…
Reference in New Issue