Add node log following functionality

This commit is contained in:
Joshua Boniface 2021-07-18 20:37:43 -04:00
commit bd7a773d6b
12 changed files with 370 additions and 14 deletions

View File

@ -834,6 +834,52 @@ class API_Node_DomainState(Resource):
api.add_resource(API_Node_DomainState, '/node/<node>/domain-state')
# /node/<node</log
class API_Node_Log(Resource):
@RequestParser([
{'name': 'lines'}
])
@Authenticator
def get(self, node, reqargs):
"""
Return the recent logs of {node}
---
tags:
- node
parameters:
- in: query
name: lines
type: integer
required: false
description: The number of lines to retrieve
responses:
200:
description: OK
schema:
type: object
id: NodeLog
properties:
name:
type: string
description: The name of the Node
data:
type: string
description: The recent log text
404:
description: Node not found
schema:
type: object
id: Message
"""
return api_helper.node_log(
node,
reqargs.get('lines', None)
)
api.add_resource(API_Node_Log, '/node/<node>/log')
##########################################################
# Client API - VM
##########################################################

View File

@ -307,6 +307,34 @@ def node_ready(zkhandler, node, wait):
return output, retcode
@ZKConnection(config)
def node_log(zkhandler, node, lines=None):
"""
Return the current logs for Node.
"""
# Default to 10 lines of log if not set
try:
lines = int(lines)
except TypeError:
lines = 10
retflag, retdata = pvc_node.get_node_log(zkhandler, node, lines)
if retflag:
retcode = 200
retdata = {
'name': node,
'data': retdata
}
else:
retcode = 400
retdata = {
'message': retdata
}
return retdata, retcode
#
# VM functions
#

View File

@ -19,6 +19,8 @@
#
###############################################################################
import time
import pvc.cli_lib.ansiprint as ansiprint
from pvc.cli_lib.common import call_api
@ -69,6 +71,89 @@ def node_domain_state(config, node, action, wait):
return retstatus, response.json().get('message', '')
def view_node_log(config, node, lines=100):
"""
Return node log lines from the API (and display them in a pager in the main CLI)
API endpoint: GET /node/{node}/log
API arguments: lines={lines}
API schema: {"name":"{node}","data":"{node_log}"}
"""
params = {
'lines': lines
}
response = call_api(config, 'get', '/node/{node}/log'.format(node=node), params=params)
if response.status_code != 200:
return False, response.json().get('message', '')
node_log = response.json()['data']
# Shrink the log buffer to length lines
shrunk_log = node_log.split('\n')[-lines:]
loglines = '\n'.join(shrunk_log)
return True, loglines
def follow_node_log(config, node, lines=10):
"""
Return and follow node log lines from the API
API endpoint: GET /node/{node}/log
API arguments: lines={lines}
API schema: {"name":"{nodename}","data":"{node_log}"}
"""
# We always grab 200 to match the follow call, but only _show_ `lines` number
params = {
'lines': 200
}
response = call_api(config, 'get', '/node/{node}/log'.format(node=node), params=params)
if response.status_code != 200:
return False, response.json().get('message', '')
# Shrink the log buffer to length lines
node_log = response.json()['data']
shrunk_log = node_log.split('\n')[-int(lines):]
loglines = '\n'.join(shrunk_log)
# Print the initial data and begin following
print(loglines, end='')
print('\n', end='')
while True:
# Grab the next line set (200 is a reasonable number of lines per half-second; any more are skipped)
try:
params = {
'lines': 200
}
response = call_api(config, 'get', '/node/{node}/log'.format(node=node), params=params)
new_node_log = response.json()['data']
except Exception:
break
# Split the new and old log strings into constitutent lines
old_node_loglines = node_log.split('\n')
new_node_loglines = new_node_log.split('\n')
# Set the node log to the new log value for the next iteration
node_log = new_node_log
# Get the difference between the two sets of lines
old_node_loglines_set = set(old_node_loglines)
diff_node_loglines = [x for x in new_node_loglines if x not in old_node_loglines_set]
# If there's a difference, print it out
if len(diff_node_loglines) > 0:
print('\n'.join(diff_node_loglines), end='')
print('\n', end='')
# Wait half a second
time.sleep(0.5)
return True, ''
def node_info(config, node):
"""
Get information about node

View File

@ -1215,9 +1215,9 @@ def follow_console_log(config, vm, lines=10):
API arguments: lines={lines}
API schema: {"name":"{vmname}","data":"{console_log}"}
"""
# We always grab 500 to match the follow call, but only _show_ `lines` number
# We always grab 200 to match the follow call, but only _show_ `lines` number
params = {
'lines': 500
'lines': 200
}
response = call_api(config, 'get', '/vm/{vm}/console'.format(vm=vm), params=params)
@ -1233,10 +1233,10 @@ def follow_console_log(config, vm, lines=10):
print(loglines, end='')
while True:
# Grab the next line set (500 is a reasonable number of lines per second; any more are skipped)
# Grab the next line set (200 is a reasonable number of lines per half-second; any more are skipped)
try:
params = {
'lines': 500
'lines': 200
}
response = call_api(config, 'get', '/vm/{vm}/console'.format(vm=vm), params=params)
new_console_log = response.json()['data']
@ -1245,8 +1245,10 @@ def follow_console_log(config, vm, lines=10):
# Split the new and old log strings into constitutent lines
old_console_loglines = console_log.split('\n')
new_console_loglines = new_console_log.split('\n')
# Set the console log to the new log value for the next iteration
console_log = new_console_log
# Remove the lines from the old log until we hit the first line of the new log; this
# ensures that the old log is a string that we can remove from the new log entirely
for index, line in enumerate(old_console_loglines, start=0):
@ -1261,8 +1263,8 @@ def follow_console_log(config, vm, lines=10):
# If there's a difference, print it out
if diff_console_log:
print(diff_console_log, end='')
# Wait a second
time.sleep(1)
# Wait half a second
time.sleep(0.5)
return True, ''

View File

@ -540,6 +540,43 @@ def node_unflush(node, wait):
cleanup(retcode, retmsg)
###############################################################################
# pvc node log
###############################################################################
@click.command(name='log', short_help='Show logs of a node.')
@click.argument(
'node'
)
@click.option(
'-l', '--lines', 'lines', default=None, show_default=False,
help='Display this many log lines from the end of the log buffer. [default: 1000; with follow: 10]'
)
@click.option(
'-f', '--follow', 'follow', is_flag=True, default=False,
help='Follow the log buffer; output may be delayed by a few seconds relative to the live system. The --lines value defaults to 10 for the initial output.'
)
@cluster_req
def node_log(node, lines, follow):
"""
Show node logs of virtual machine DOMAIN on its current node in a pager or continuously. DOMAIN may be a UUID or name. Note that migrating a VM to a different node will cause the log buffer to be overwritten by entries from the new node.
"""
# Set the default here so we can handle it
if lines is None:
if follow:
lines = 10
else:
lines = 1000
if follow:
retcode, retmsg = pvc_node.follow_node_log(config, node, lines)
else:
retcode, retmsg = pvc_node.view_node_log(config, node, lines)
click.echo_via_pager(retmsg)
retmsg = ''
cleanup(retcode, retmsg)
###############################################################################
# pvc node info
###############################################################################
@ -4707,6 +4744,7 @@ cli_node.add_command(node_primary)
cli_node.add_command(node_flush)
cli_node.add_command(node_ready)
cli_node.add_command(node_unflush)
cli_node.add_command(node_log)
cli_node.add_command(node_info)
cli_node.add_command(node_list)

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python3
# log.py - Output (stdout + logfile) functions
# log.py - PVC daemon logger functions
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2021 Joshua M. Boniface <joshua@boniface.me>
@ -19,7 +19,12 @@
#
###############################################################################
import datetime
from collections import deque
from threading import Thread
from queue import Queue
from datetime import datetime
from daemon_lib.zkhandler import ZKHandler
class Logger(object):
@ -77,17 +82,32 @@ class Logger(object):
self.last_colour = ''
self.last_prompt = ''
if self.config['zookeeper_logging']:
self.zookeeper_logger = ZookeeperLogger(config)
self.zookeeper_logger.start()
# Provide a hup function to close and reopen the writer
def hup(self):
self.writer.close()
self.writer = open(self.logfile, 'a', buffering=0)
# Provide a termination function so all messages are flushed before terminating the main daemon
def terminate(self):
if self.config['file_logging']:
self.writer.close()
if self.config['zookeeper_logging']:
self.out("Waiting for Zookeeper message queue to drain", state='s')
while not self.zookeeper_logger.queue.empty():
pass
self.zookeeper_logger.stop()
self.zookeeper_logger.join()
# Output function
def out(self, message, state=None, prefix=''):
# Get the date
if self.config['log_dates']:
date = '{} - '.format(datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S.%f'))
date = '{} '.format(datetime.now().strftime('%Y/%m/%d %H:%M:%S.%f'))
else:
date = ''
@ -123,6 +143,64 @@ class Logger(object):
if self.config['file_logging']:
self.writer.write(message + '\n')
# Log to Zookeeper
if self.config['zookeeper_logging']:
self.zookeeper_logger.queue.put(message)
# Set last message variables
self.last_colour = colour
self.last_prompt = prompt
class ZookeeperLogger(Thread):
"""
Defines a threaded writer for Zookeeper locks. Threading prevents the blocking of other
daemon events while the records are written. They will be eventually-consistent
"""
def __init__(self, config):
self.config = config
self.node = self.config['node']
self.max_lines = self.config['node_log_lines']
self.queue = Queue()
self.zkhandler = None
self.start_zkhandler()
self.zkhandler.write([(('logs', self.node), '')])
self.running = False
Thread.__init__(self, args=(), kwargs=None)
def start_zkhandler(self):
# We must open our own dedicated Zookeeper instance because we can't guarantee one already exists when this starts
if self.zkhandler is not None:
try:
self.zkhandler.disconnect()
except Exception:
pass
self.zkhandler = ZKHandler(self.config, logger=None)
self.zkhandler.connect(persistent=True)
def run(self):
self.running = True
# Get the logs that are currently in Zookeeper and populate our deque
logs = deque(self.zkhandler.read(('logs.messages', self.node)).split('\n'), self.max_lines)
while self.running:
# Get a new message
try:
message = self.queue.get(block=False)
if not message:
continue
except Exception:
continue
if not self.config['log_dates']:
# We want to log dates here, even if the log_dates config is not set
date = '{} '.format(datetime.now().strftime('%Y/%m/%d %H:%M:%S.%f'))
else:
date = ''
# Add the message to the deque
logs.append(f'{date}{message}')
# Write the updated messages into Zookeeper
self.zkhandler.write([(('logs.messages', self.node), '\n'.join(logs))])
return
def stop(self):
self.running = False

View File

@ -0,0 +1 @@
{"version": "4", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "cmd": "/cmd", "cmd.node": "/cmd/nodes", "cmd.domain": "/cmd/domains", "cmd.ceph": "/cmd/ceph", "logs": "/logs", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "network": {"vni": "", "type": "/nettype", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}}

View File

@ -182,6 +182,24 @@ def ready_node(zkhandler, node, wait=False):
return True, retmsg
def get_node_log(zkhandler, node, lines=2000):
# Verify node is valid
if not common.verifyNode(zkhandler, node):
return False, 'ERROR: No node named "{}" is present in the cluster.'.format(node)
# Get the data from ZK
node_log = zkhandler.read(('logs.messages', node))
if node_log is None:
return True, ''
# Shrink the log buffer to length lines
shrunk_log = node_log.split('\n')[-lines:]
loglines = '\n'.join(shrunk_log)
return True, loglines
def get_info(zkhandler, node):
# Verify node is valid
if not common.verifyNode(zkhandler, node):

View File

@ -466,7 +466,7 @@ class ZKHandler(object):
#
class ZKSchema(object):
# Current version
_version = 3
_version = 4
# Root for doing nested keys
_schema_root = ''
@ -490,6 +490,7 @@ class ZKSchema(object):
'cmd.node': f'{_schema_root}/cmd/nodes',
'cmd.domain': f'{_schema_root}/cmd/domains',
'cmd.ceph': f'{_schema_root}/cmd/ceph',
'logs': '/logs',
'node': f'{_schema_root}/nodes',
'domain': f'{_schema_root}/domains',
'network': f'{_schema_root}/networks',
@ -500,6 +501,11 @@ class ZKSchema(object):
'volume': f'{_schema_root}/ceph/volumes',
'snapshot': f'{_schema_root}/ceph/snapshots',
},
# The schema of an individual logs entry (/logs/{node_name})
'logs': {
'node': '', # The root key
'messages': '/messages',
},
# The schema of an individual node entry (/nodes/{node_name})
'node': {
'name': '', # The root key
@ -771,7 +777,7 @@ class ZKSchema(object):
logger.out(f'Key not found: {self.path(kpath)}', state='w')
result = False
for elem in ['node', 'domain', 'network', 'osd', 'pool']:
for elem in ['logs', 'node', 'domain', 'network', 'osd', 'pool']:
# First read all the subelements of the key class
for child in zkhandler.zk_conn.get_children(self.path(f'base.{elem}')):
# For each key in the schema for that particular elem
@ -850,7 +856,7 @@ class ZKSchema(object):
data = ''
zkhandler.zk_conn.create(self.path(kpath), data.encode(zkhandler.encoding))
for elem in ['node', 'domain', 'network', 'osd', 'pool']:
for elem in ['logs', 'node', 'domain', 'network', 'osd', 'pool']:
# First read all the subelements of the key class
for child in zkhandler.zk_conn.get_children(self.path(f'base.{elem}')):
# For each key in the schema for that particular elem

View File

@ -144,6 +144,19 @@
},
"type": "object"
},
"NodeLog": {
"properties": {
"data": {
"description": "The recent log text",
"type": "string"
},
"name": {
"description": "The name of the Node",
"type": "string"
}
},
"type": "object"
},
"VMLog": {
"properties": {
"data": {
@ -2665,6 +2678,38 @@
]
}
},
"/api/v1/node/{node}/log": {
"get": {
"description": "",
"parameters": [
{
"description": "The number of lines to retrieve",
"in": "query",
"name": "lines",
"required": false,
"type": "integer"
}
],
"responses": {
"200": {
"description": "OK",
"schema": {
"$ref": "#/definitions/NodeLog"
}
},
"404": {
"description": "Node not found",
"schema": {
"$ref": "#/definitions/Message"
}
}
},
"summary": "Return the recent logs of {node}",
"tags": [
"node"
]
}
},
"/api/v1/provisioner/create": {
"post": {
"description": "Note: Starts a background job in the pvc-provisioner-worker Celery worker while returning a task ID; the task ID can be used to query the \"GET /provisioner/status/<task_id>\" endpoint for the job status",

View File

@ -140,6 +140,8 @@ pvc:
file_logging: True
# stdout_logging: Enable or disable logging to stdout (i.e. journald)
stdout_logging: True
# zookeeper_logging: Enable ot disable logging to Zookeeper (for `pvc node log` functionality)
zookeeper_logging: True
# log_colours: Enable or disable ANSI colours in log output
log_colours: True
# log_dates: Enable or disable date strings in log output
@ -152,10 +154,12 @@ pvc:
log_keepalive_storage_details: True
# console_log_lines: Number of console log lines to store in Zookeeper per VM
console_log_lines: 1000
# node_log_lines: Number of node log lines to store in Zookeeper per node
node_log_lines: 2000
# networking: PVC networking configuration
# OPTIONAL if enable_networking: False
networking:
# bridge_device: Underlying device to use for bridged vLAN networks; usually the device underlying <cluster>
# bridge_device: Underlying device to use for bridged vLAN networks; usually the device of <cluster>
bridge_device: ens4
# sriov_enable: Enable or disable (default if absent) SR-IOV network support
sriov_enable: False

View File

@ -147,6 +147,7 @@ def readConfig(pvcnoded_config_file, myhostname):
# Handle the basic config (hypervisor-only)
try:
config_general = {
'node': o_config['pvc']['node'],
'coordinators': o_config['pvc']['cluster']['coordinators'],
'enable_hypervisor': o_config['pvc']['functions']['enable_hypervisor'],
'enable_networking': o_config['pvc']['functions']['enable_networking'],
@ -157,12 +158,14 @@ def readConfig(pvcnoded_config_file, myhostname):
'console_log_directory': o_config['pvc']['system']['configuration']['directories']['console_log_directory'],
'file_logging': o_config['pvc']['system']['configuration']['logging']['file_logging'],
'stdout_logging': o_config['pvc']['system']['configuration']['logging']['stdout_logging'],
'zookeeper_logging': o_config['pvc']['system']['configuration']['logging'].get('zookeeper_logging', False),
'log_colours': o_config['pvc']['system']['configuration']['logging']['log_colours'],
'log_dates': o_config['pvc']['system']['configuration']['logging']['log_dates'],
'log_keepalives': o_config['pvc']['system']['configuration']['logging']['log_keepalives'],
'log_keepalive_cluster_details': o_config['pvc']['system']['configuration']['logging']['log_keepalive_cluster_details'],
'log_keepalive_storage_details': o_config['pvc']['system']['configuration']['logging']['log_keepalive_storage_details'],
'console_log_lines': o_config['pvc']['system']['configuration']['logging']['console_log_lines'],
'node_log_lines': o_config['pvc']['system']['configuration']['logging'].get('node_log_lines', 0),
'vm_shutdown_timeout': int(o_config['pvc']['system']['intervals']['vm_shutdown_timeout']),
'keepalive_interval': int(o_config['pvc']['system']['intervals']['keepalive_interval']),
'fence_intervals': int(o_config['pvc']['system']['intervals']['fence_intervals']),
@ -694,7 +697,7 @@ else:
# Cleanup function
def cleanup():
global zkhandler, update_timer, d_domain
global logger, zkhandler, update_timer, d_domain
logger.out('Terminating pvcnoded and cleaning up', state='s')
@ -758,6 +761,8 @@ def cleanup():
pass
logger.out('Terminated pvc daemon', state='s')
logger.terminate()
os._exit(0)