# pvc/daemon-common/cluster.py

#!/usr/bin/env python3

# cluster.py - PVC client function library, cluster management
# Part of the Parallel Virtual Cluster (PVC) system
#
#    Copyright (C) 2018-2020 Joshua M. Boniface <joshua@boniface.me>
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

import json

from distutils.util import strtobool

import daemon_lib.ansiprint as ansiprint
import daemon_lib.zkhandler as zkhandler
import daemon_lib.common as common
import daemon_lib.vm as pvc_vm
import daemon_lib.node as pvc_node
import daemon_lib.network as pvc_network
import daemon_lib.ceph as pvc_ceph

def set_maintenance(zk_conn, maint_state):
    """
    Set or clear the cluster-wide maintenance flag stored at '/maintenance'.

    zk_conn: connection handle passed through to zkhandler.writedata.
    maint_state: the string 'true' enables maintenance mode; any other value
                 writes 'false' (normal mode).

    Returns a (success, message) tuple. Any exception raised by the write is
    reported as (False, 'Failed to set cluster maintenance state').
    """
    if maint_state == 'true':
        new_value = 'true'
        success_message = 'Successfully set cluster in maintenance mode'
    else:
        new_value = 'false'
        success_message = 'Successfully set cluster in normal mode'

    try:
        # NOTE(review): the return value of writedata is ignored here; if it
        # signals failure by return value rather than by raising, that failure
        # is reported as success - confirm against zkhandler.
        zkhandler.writedata(zk_conn, {'/maintenance': new_value})
    except Exception:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        return False, 'Failed to set cluster maintenance state'

    return True, success_message

def _summarize_states(total, known_states, observed_states):
    """
    Build the per-state count summary for one class of cluster object.

    total: value stored under the 'total' key.
    known_states: state strings to report, in output order.
    observed_states: list holding the state string of each object.

    Returns a dict with 'total' plus one entry per known state that occurs at
    least once. Observed states that are not in known_states get no entry.
    """
    summary = {'total': total}
    for state in known_states:
        occurrences = observed_states.count(state)
        if occurrences > 0:
            summary[state] = occurrences
    return summary

def getClusterInformation(zk_conn):
    """
    Collect an overall health verdict and object counts for the cluster.

    zk_conn: connection handle passed through to the zkhandler, common and
             pvc_* helper functions.

    Returns a dict with the keys:
      'health':       'Maintenance', 'Degraded' or 'Optimal'
      'primary_node': result of common.getPrimaryNode
      'upstream_ip':  data read from '/upstream_ip'
      'nodes', 'vms', 'osds': {'total': N, '<state>': count, ...} summaries
      'networks', 'pools', 'volumes', 'snapshots': plain counts

    Only the '/maintenance' read is guarded; exceptions from any other helper
    call or from missing keys in the returned objects propagate to the caller.
    """
    # Get cluster maintenance state; if it cannot be read, assume normal mode
    try:
        maint_state = zkhandler.readdata(zk_conn, '/maintenance')
    except Exception:
        maint_state = 'false'

    # NOTE(review): the return codes of the get_list calls below are discarded,
    # so a failed listing is not detected here - confirm what the second
    # element holds on failure.

    # Get node information object list
    _, node_list = pvc_node.get_list(zk_conn, None)

    # Get vm information object list
    _, vm_list = pvc_vm.get_list(zk_conn, None, None, None)

    # Get network information object list
    _, network_list = pvc_network.get_list(zk_conn, None, None)

    # Get storage information object list
    _, ceph_osd_list = pvc_ceph.get_list_osd(zk_conn, None)
    _, ceph_pool_list = pvc_ceph.get_list_pool(zk_conn, None)
    _, ceph_volume_list = pvc_ceph.get_list_volume(zk_conn, None, None)
    _, ceph_snapshot_list = pvc_ceph.get_list_snapshot(zk_conn, None, None, None)

    # Determine, for each subsection, the total count
    node_count = len(node_list)
    vm_count = len(vm_list)
    network_count = len(network_list)
    ceph_osd_count = len(ceph_osd_list)
    ceph_pool_count = len(ceph_pool_list)
    ceph_volume_count = len(ceph_volume_list)
    ceph_snapshot_count = len(ceph_snapshot_list)

    # Determinations for node health
    node_healthy_status = []
    node_report_status = []
    for node in node_list:
        daemon_state = node['daemon_state']
        domain_state = node['domain_state']
        # A node is unhealthy only when the daemon is not 'run' AND the domain
        # state is not 'ready'.
        # NOTE(review): this means e.g. 'stop,ready' counts as healthy; confirm
        # whether "either one differs" was intended.
        node_healthy_status.append(daemon_state == 'run' or domain_state == 'ready')
        node_report_status.append(daemon_state + ',' + domain_state)

    # Determinations for VM health
    healthy_vm_states = ['start', 'disable', 'migrate', 'unmigrate', 'provision']
    vm_report_status = [vm['state'] for vm in vm_list]
    vm_healthy_status = [vm_state in healthy_vm_states for vm_state in vm_report_status]

    # Determinations for OSD health
    up_texts = {1: 'up', 0: 'down'}
    in_texts = {1: 'in', 0: 'out'}
    ceph_osd_healthy_status = []
    ceph_osd_report_status = []
    for ceph_osd in ceph_osd_list:
        # Missing stat keys are treated as down/out
        try:
            ceph_osd_up = ceph_osd['stats']['up']
        except KeyError:
            ceph_osd_up = 0

        try:
            ceph_osd_in = ceph_osd['stats']['in']
        except KeyError:
            ceph_osd_in = 0

        # An OSD is healthy only when it is both up and in
        ceph_osd_healthy_status.append(bool(ceph_osd_up and ceph_osd_in))
        ceph_osd_report_status.append(up_texts[ceph_osd_up] + ',' + in_texts[ceph_osd_in])

    # Find out the overall cluster health; maintenance mode takes precedence,
    # otherwise any unhealthy node, VM or OSD makes the cluster degraded
    if maint_state == 'true':
        cluster_health = 'Maintenance'
    elif all(node_healthy_status) and all(vm_healthy_status) and all(ceph_osd_healthy_status):
        cluster_health = 'Optimal'
    else:
        cluster_health = 'Degraded'

    # State lists; these fix which states are reported and in what order
    node_state_combinations = [
        'run,ready', 'run,flush', 'run,flushed', 'run,unflush',
        'init,ready', 'init,flush', 'init,flushed', 'init,unflush',
        'stop,ready', 'stop,flush', 'stop,flushed', 'stop,unflush'
    ]
    vm_state_combinations = [
        'start', 'restart', 'shutdown', 'stop', 'disable', 'fail', 'migrate', 'unmigrate', 'provision'
    ]
    ceph_osd_state_combinations = [
        'up,in', 'up,out', 'down,in', 'down,out'
    ]

    # Format the Node, VM and OSD states
    formatted_node_states = _summarize_states(node_count, node_state_combinations, node_report_status)
    formatted_vm_states = _summarize_states(vm_count, vm_state_combinations, vm_report_status)
    formatted_osd_states = _summarize_states(ceph_osd_count, ceph_osd_state_combinations, ceph_osd_report_status)

    # Format the status data
    cluster_information = {
        'health': cluster_health,
        'primary_node': common.getPrimaryNode(zk_conn),
        'upstream_ip': zkhandler.readdata(zk_conn, '/upstream_ip'),
        'nodes': formatted_node_states,
        'vms': formatted_vm_states,
        'networks': network_count,
        'osds': formatted_osd_states,
        'pools': ceph_pool_count,
        'volumes': ceph_volume_count,
        'snapshots': ceph_snapshot_count
    }

    return cluster_information

def get_info(zk_conn):
    """
    Thin wrapper around getClusterInformation that adds a success flag.

    Returns (True, cluster_information) when getClusterInformation returns a
    truthy value, otherwise (False, <error message>). Exceptions raised by
    getClusterInformation are not caught here.
    """
    info = getClusterInformation(zk_conn)
    if not info:
        return False, 'ERROR: Failed to obtain cluster information!'
    return True, info
Add cluster status command 2019-10-22 11:23:12 -04:00			`#!/usr/bin/env python3`

			`# cluster.py - PVC client function library, cluster management`
			`# Part of the Parallel Virtual Cluster (PVC) system`
			`#`
Update copyright header year to 2020 2020-01-08 19:38:02 -05:00			`# Copyright (C) 2018-2020 Joshua M. Boniface <joshua@boniface.me>`
Add cluster status command 2019-10-22 11:23:12 -04:00			`#`
			`# This program is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# This program is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with this program. If not, see <https://www.gnu.org/licenses/>.`
			`#`
			`###############################################################################`

			`import json`

Implement cluster maintenance mode Implements a "maintenance mode" for PVC clusters. For now, the only thing this mode does is disable node fencing while the state is true. This allows the administrator to tell PVC that network connectivity, etc. might be interrupted and to avoid fencing nodes. Closes #70 2020-01-09 10:53:27 -05:00			`from distutils.util import strtobool`

Rename API and common Debian packages Closes #79 2020-02-08 18:48:59 -05:00			`import daemon_lib.ansiprint as ansiprint`
			`import daemon_lib.zkhandler as zkhandler`
			`import daemon_lib.common as common`
			`import daemon_lib.vm as pvc_vm`
			`import daemon_lib.node as pvc_node`
			`import daemon_lib.network as pvc_network`
			`import daemon_lib.ceph as pvc_ceph`
Add cluster status command 2019-10-22 11:23:12 -04:00
Implement cluster maintenance mode Implements a "maintenance mode" for PVC clusters. For now, the only thing this mode does is disable node fencing while the state is true. This allows the administrator to tell PVC that network connectivity, etc. might be interrupted and to avoid fencing nodes. Closes #70 2020-01-09 10:53:27 -05:00			`def set_maintenance(zk_conn, maint_state):`
			`try:`
			`if maint_state == 'true':`
			`zkhandler.writedata(zk_conn, {'/maintenance': 'true'})`
			`return True, 'Successfully set cluster in maintenance mode'`
			`else:`
			`zkhandler.writedata(zk_conn, {'/maintenance': 'false'})`
			`return True, 'Successfully set cluster in normal mode'`
			`except:`
			`return False, 'Failed to set cluster maintenance state'`

Add cluster status command 2019-10-22 11:23:12 -04:00			`def getClusterInformation(zk_conn):`
Implement cluster maintenance mode Implements a "maintenance mode" for PVC clusters. For now, the only thing this mode does is disable node fencing while the state is true. This allows the administrator to tell PVC that network connectivity, etc. might be interrupted and to avoid fencing nodes. Closes #70 2020-01-09 10:53:27 -05:00			`# Get cluster maintenance state`
			`try:`
			`maint_state = zkhandler.readdata(zk_conn, '/maintenance')`
			`except:`
			`maint_state = 'false'`

Add cluster status command 2019-10-22 11:23:12 -04:00			`# Get node information object list`
			`retcode, node_list = pvc_node.get_list(zk_conn, None)`

			`# Get vm information object list`
			`retcode, vm_list = pvc_vm.get_list(zk_conn, None, None, None)`

			`# Get network information object list`
			`retcode, network_list = pvc_network.get_list(zk_conn, None, None)`

			`# Get storage information object list`
			`retcode, ceph_osd_list = pvc_ceph.get_list_osd(zk_conn, None)`
			`retcode, ceph_pool_list = pvc_ceph.get_list_pool(zk_conn, None)`
			`retcode, ceph_volume_list = pvc_ceph.get_list_volume(zk_conn, None, None)`
			`retcode, ceph_snapshot_list = pvc_ceph.get_list_snapshot(zk_conn, None, None, None)`

			`# Determine, for each subsection, the total count`
			`node_count = len(node_list)`
			`vm_count = len(vm_list)`
			`network_count = len(network_list)`
			`ceph_osd_count = len(ceph_osd_list)`
			`ceph_pool_count = len(ceph_pool_list)`
			`ceph_volume_count = len(ceph_volume_list)`
			`ceph_snapshot_count = len(ceph_snapshot_list)`

			`# Determinations for node health`
			`node_healthy_status = list(range(0, node_count))`
			`node_report_status = list(range(0, node_count))`
			`for index, node in enumerate(node_list):`
			`daemon_state = node['daemon_state']`
			`domain_state = node['domain_state']`
			`if daemon_state != 'run' and domain_state != 'ready':`
			`node_healthy_status[index] = False`
			`else:`
			`node_healthy_status[index] = True`
			`node_report_status[index] = daemon_state + ',' + domain_state`

			`# Determinations for VM health`
			`vm_healthy_status = list(range(0, vm_count))`
			`vm_report_status = list(range(0, vm_count))`
			`for index, vm in enumerate(vm_list):`
			`vm_state = vm['state']`
Add provision state for VMs 2020-01-08 17:40:02 -05:00			`if vm_state not in ['start', 'disable', 'migrate', 'unmigrate', 'provision']:`
Add cluster status command 2019-10-22 11:23:12 -04:00			`vm_healthy_status[index] = False`
			`else:`
			`vm_healthy_status[index] = True`
			`vm_report_status[index] = vm_state`

			`# Determinations for OSD health`
			`ceph_osd_healthy_status = list(range(0, ceph_osd_count))`
			`ceph_osd_report_status = list(range(0, ceph_osd_count))`
			`for index, ceph_osd in enumerate(ceph_osd_list):`
Better handle missing OSD stat keys 2019-10-22 13:59:28 -04:00			`try:`
			`ceph_osd_up = ceph_osd['stats']['up']`
			`except KeyError:`
			`ceph_osd_up = 0`

			`try:`
			`ceph_osd_in = ceph_osd['stats']['in']`
			`except KeyError:`
			`ceph_osd_in = 0`

Add cluster status command 2019-10-22 11:23:12 -04:00			`if not ceph_osd_up or not ceph_osd_in:`
			`ceph_osd_healthy_status[index] = False`
			`else:`
			`ceph_osd_healthy_status[index] = True`
			`up_texts = { 1: 'up', 0: 'down' }`
			`in_texts = { 1: 'in', 0: 'out' }`
Correct invalid variable name 2019-10-22 12:18:51 -04:00			`ceph_osd_report_status[index] = up_texts[ceph_osd_up] + ',' + in_texts[ceph_osd_in]`
Add cluster status command 2019-10-22 11:23:12 -04:00
			`# Find out the overall cluster health; if any element of a healthy_status is false, it's unhealthy`
Implement cluster maintenance mode Implements a "maintenance mode" for PVC clusters. For now, the only thing this mode does is disable node fencing while the state is true. This allows the administrator to tell PVC that network connectivity, etc. might be interrupted and to avoid fencing nodes. Closes #70 2020-01-09 10:53:27 -05:00			`if maint_state == 'true':`
			`cluster_health = 'Maintenance'`
			`elif False in node_healthy_status or False in vm_healthy_status or False in ceph_osd_healthy_status:`
Add cluster status command 2019-10-22 11:23:12 -04:00			`cluster_health = 'Degraded'`
			`else:`
			`cluster_health = 'Optimal'`

			`# State lists`
			`node_state_combinations = [`
			`'run,ready', 'run,flush', 'run,flushed', 'run,unflush',`
			`'init,ready', 'init,flush', 'init,flushed', 'init,unflush',`
			`'stop,ready', 'stop,flush', 'stop,flushed', 'stop,unflush'`
			`]`
			`vm_state_combinations = [`
Add provision state for VMs 2020-01-08 17:40:02 -05:00			`'start', 'restart', 'shutdown', 'stop', 'disable', 'fail', 'migrate', 'unmigrate', 'provision'`
Add cluster status command 2019-10-22 11:23:12 -04:00			`]`
			`ceph_osd_state_combinations = [`
			`'up,in', 'up,out', 'down,in', 'down,out'`
			`]`

			`# Format the Node states`
			`formatted_node_states = {'total': node_count}`
			`for state in node_state_combinations:`
			`state_count = 0`
			`for node_state in node_report_status:`
			`if node_state == state:`
			`state_count += 1`
			`if state_count > 0:`
			`formatted_node_states[state] = state_count`

			`# Format the VM states`
			`formatted_vm_states = {'total': vm_count}`
			`for state in vm_state_combinations:`
			`state_count = 0`
			`for vm_state in vm_report_status:`
			`if vm_state == state:`
			`state_count += 1`
			`if state_count > 0:`
			`formatted_vm_states[state] = state_count`

			`# Format the OSD states`
			`formatted_osd_states = {'total': ceph_osd_count}`
			`for state in ceph_osd_state_combinations:`
			`state_count = 0`
			`for ceph_osd_state in ceph_osd_report_status:`
			`if ceph_osd_state == state:`
			`state_count += 1`
			`if state_count > 0:`
			`formatted_osd_states[state] = state_count`

			`# Format the status data`
			`cluster_information = {`
			`'health': cluster_health,`
			`'primary_node': common.getPrimaryNode(zk_conn),`
			`'upstream_ip': zkhandler.readdata(zk_conn, '/upstream_ip'),`
			`'nodes': formatted_node_states,`
			`'vms': formatted_vm_states,`
			`'networks': network_count,`
			`'osds': formatted_osd_states,`
			`'pools': ceph_pool_count,`
			`'volumes': ceph_volume_count,`
			`'snapshots': ceph_snapshot_count`
			`}`

			`return cluster_information`

			`def get_info(zk_conn):`
			`# This is a thin wrapper function for naming purposes`
			`cluster_information = getClusterInformation(zk_conn)`
			`if cluster_information:`
			`return True, cluster_information`
			`else:`
			`return False, 'ERROR: Failed to obtain cluster information!'`