Compare commits

...

8 Commits

Author SHA1 Message Date
dcb9c0d12c Improve fence handling conditions
Use the intermediate output text when judging the fence status, rather
than the retcode of the stop as this should be more reliable.
2024-05-08 10:55:15 -04:00
f6e856bf98 Fix debug output on timeout 2024-05-06 10:49:57 -04:00
f1fe0c63f5 Bump version to 0.9.97 2024-04-19 10:32:16 -04:00
ab944f9b95 Add RBD snap purge during volume removal
Fixes #180
2024-04-19 10:31:11 -04:00
9714ac20b2 Update formatting for Black 24.4.0 2024-04-19 10:26:06 -04:00
79ad09ae59 Switch virtual memory free to allocated
Avoids incorrect reporting if cache/buffers exceeds normal.
2024-04-19 10:25:33 -04:00
4c6aabec6a Fix bug if d_network changes 2024-04-05 14:05:51 -04:00
559400ed90 Explicitly set --lines to integer type 2024-03-13 13:01:02 -04:00
18 changed files with 78 additions and 35 deletions

View File

@ -1 +1 @@
0.9.96
0.9.97

View File

@ -1,5 +1,12 @@
## PVC Changelog
###### [v0.9.97](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.97)
* [Client CLI] Ensures --lines is always an integer value
* [Node Daemon] Fixes a bug if d_network changes during iteration
* [Node Daemon] Moves to using available instead of free memory for node reporting
* [API Daemon] Fixes a bug if lingering RBD snapshots exist when removing a volume (#180)
###### [v0.9.96](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.96)
* [API Daemon] Fixes a bug when reporting node stats

View File

@ -27,7 +27,7 @@ from distutils.util import strtobool as dustrtobool
import daemon_lib.config as cfg
# Daemon version
version = "0.9.96"
version = "0.9.97"
# API version
API_VERSION = 1.0

View File

@ -671,9 +671,9 @@ def cli_cluster_maintenance_off():
@format_opt(
{
"pretty": cli_cluster_task_format_pretty,
"raw": lambda d: "\n".join([t["id"] for t in d])
if isinstance(d, list)
else d["state"],
"raw": lambda d: (
"\n".join([t["id"] for t in d]) if isinstance(d, list) else d["state"]
),
"json": lambda d: jdumps(d),
"json-pretty": lambda d: jdumps(d, indent=2),
}
@ -892,6 +892,7 @@ def cli_node_ready(
"--lines",
"lines",
default=None,
type=int,
show_default=False,
help="Display this many log lines from the end of the log buffer. [default: 1000; with follow: 10]",
)
@ -2516,6 +2517,7 @@ def cli_vm_volume_remove(domain, volume, live_flag, restart_flag):
"--lines",
"lines",
default=None,
type=int,
show_default=False,
help="Display this many log lines from the end of the log buffer. [default: 1000; with follow: 10]",
)

View File

@ -580,9 +580,11 @@ def cli_cluster_fault_list_format_long(CLI_CONFIG, fault_data):
fault_id=fault["id"],
fault_status=fault["status"].title(),
fault_health_delta=f"-{fault['health_delta']}%",
fault_acknowledged_at=fault["acknowledged_at"]
if fault["acknowledged_at"] != ""
else "N/A",
fault_acknowledged_at=(
fault["acknowledged_at"]
if fault["acknowledged_at"] != ""
else "N/A"
),
fault_last_reported=fault["last_reported"],
fault_first_reported=fault["first_reported"],
)

View File

@ -108,9 +108,10 @@ class UploadProgressBar(object):
class ErrorResponse(requests.Response):
def __init__(self, json_data, status_code):
def __init__(self, json_data, status_code, headers):
self.json_data = json_data
self.status_code = status_code
self.headers = headers
def json(self):
return self.json_data
@ -206,7 +207,7 @@ def call_api(
except Exception as e:
message = "Failed to connect to the API: {}".format(e)
code = response.status_code if response else 504
response = ErrorResponse({"message": message}, code)
response = ErrorResponse({"message": message}, code, None)
# Display debug output
if config["debug"]:

View File

@ -1765,9 +1765,9 @@ def format_info(config, domain_information, long_output):
tags_name=tag["name"],
tags_type=tag["type"],
tags_protected=str(tag["protected"]),
tags_protected_colour=ansiprint.green()
if tag["protected"]
else ansiprint.blue(),
tags_protected_colour=(
ansiprint.green() if tag["protected"] else ansiprint.blue()
),
end=ansiprint.end(),
)
)

View File

@ -2,7 +2,7 @@ from setuptools import setup
setup(
name="pvc",
version="0.9.96",
version="0.9.97",
packages=["pvc.cli", "pvc.lib"],
install_requires=[
"Click",

View File

@ -320,7 +320,11 @@ def get_list_osd(zkhandler, limit=None, is_fuzzy=True):
#
def getPoolInformation(zkhandler, pool):
# Parse the stats data
(pool_stats_raw, tier, pgs,) = zkhandler.read_many(
(
pool_stats_raw,
tier,
pgs,
) = zkhandler.read_many(
[
("pool.stats", pool),
("pool.tier", pool),
@ -824,10 +828,22 @@ def remove_volume(zkhandler, pool, name):
name, pool
)
# 1. Remove volume snapshots
# 1a. Remove PVC-managed volume snapshots
for snapshot in zkhandler.children(("snapshot", f"{pool}/{name}")):
remove_snapshot(zkhandler, pool, name, snapshot)
# 1b. Purge any remaining volume snapshots
retcode, stdout, stderr = common.run_os_command(
"rbd snap purge {}/{}".format(pool, name)
)
if retcode:
return (
False,
'ERROR: Failed to purge snapshots from RBD volume "{}" in pool "{}": {}'.format(
name, pool, stderr
),
)
# 2. Remove the volume
retcode, stdout, stderr = common.run_os_command("rbd rm {}/{}".format(pool, name))
if retcode:

View File

@ -244,9 +244,9 @@ def get_parsed_configuration(config_file):
]
][0]
config_cluster_networks_specific[
f"{network_type}_dev_ip"
] = f"{list(network.hosts())[address_id]}/{network.prefixlen}"
config_cluster_networks_specific[f"{network_type}_dev_ip"] = (
f"{list(network.hosts())[address_id]}/{network.prefixlen}"
)
config = {**config, **config_cluster_networks_specific}

9
debian/changelog vendored
View File

@ -1,3 +1,12 @@
pvc (0.9.97-0) unstable; urgency=high
* [Client CLI] Ensures --lines is always an integer value
* [Node Daemon] Fixes a bug if d_network changes during iteration
* [Node Daemon] Moves to using available instead of free memory for node reporting
* [API Daemon] Fixes a bug if lingering RBD snapshots exist when removing a volume (#180)
-- Joshua M. Boniface <joshua@boniface.me> Fri, 19 Apr 2024 10:32:16 -0400
pvc (0.9.96-0) unstable; urgency=high
* [API Daemon] Fixes a bug when reporting node stats

View File

@ -33,7 +33,7 @@ import os
import signal
# Daemon version
version = "0.9.96"
version = "0.9.97"
##########################################################

View File

@ -49,7 +49,7 @@ import re
import json
# Daemon version
version = "0.9.96"
version = "0.9.97"
##########################################################

View File

@ -231,7 +231,7 @@ class NetstatsInstance(object):
# Get a list of all active interfaces
net_root_path = "/sys/class/net"
all_ifaces = list()
for (_, dirnames, _) in walk(net_root_path):
for _, dirnames, _ in walk(net_root_path):
all_ifaces.extend(dirnames)
all_ifaces.sort()

View File

@ -521,7 +521,7 @@ class NodeInstance(object):
self.logger.out("Acquired write lock for synchronization phase F", state="o")
time.sleep(0.2) # Time for reader to acquire the lock
# 4. Add gateway IPs
for network in self.d_network:
for network in self.d_network.copy():
self.d_network[network].createGateways()
self.logger.out("Releasing write lock for synchronization phase F", state="i")
self.zkhandler.write([("base.config.primary_node.sync_lock", "")])

View File

@ -253,12 +253,16 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
state="i",
prefix=f"fencing {node_name}",
)
ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(
(
ipmi_intermediate_status_retcode,
ipmi_intermediate_status_stdout,
ipmi_intermediate_status_stderr,
) = common.run_os_command(
f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
)
if ipmi_status_retcode == 0:
if ipmi_intermediate_status_retcode == 0:
logger.out(
f"Current chassis power state is: {ipmi_status_stdout.strip()}",
f"Current chassis power state is: {ipmi_intermediate_status_stdout.strip()}",
state="i",
prefix=f"fencing {node_name}",
)
@ -299,12 +303,14 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
state="i",
prefix=f"fencing {node_name}",
)
ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(
f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
ipmi_final_status_retcode, ipmi_final_status_stdout, ipmi_final_status_stderr = (
common.run_os_command(
f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
)
)
if ipmi_stop_retcode == 0:
if ipmi_status_stdout.strip() == "Chassis Power is on":
if ipmi_intermediate_status_stdout.strip() == "Chassis power is off":
if ipmi_final_status_stdout.strip() == "Chassis Power is on":
# We successfully rebooted the node and it is powered on; this is a successful fence
logger.out(
"Successfully rebooted dead node; proceeding with fence recovery action",
@ -312,7 +318,7 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
prefix=f"fencing {node_name}",
)
return True
elif ipmi_status_stdout.strip() == "Chassis Power is off":
elif ipmi_final_status_stdout.strip() == "Chassis Power is off":
# We successfully rebooted the node but it is powered off; this might be expected or not, but the node is confirmed off so we can call it a successful fence
logger.out(
"Chassis power is in confirmed off state after successfuly IPMI reboot; proceeding with fence recovery action",
@ -323,13 +329,13 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
else:
# We successfully rebooted the node but it is in some unknown power state; since this might indicate a silent failure, we must call it a failed fence
logger.out(
f"Chassis power is in an unknown state ({ipmi_status_stdout.strip()}) after successful IPMI reboot; NOT proceeding fence recovery action",
f"Chassis power is in an unknown state ({ipmi_final_status_stdout.strip()}) after successful IPMI reboot; NOT proceeding fence recovery action",
state="e",
prefix=f"fencing {node_name}",
)
return False
else:
if ipmi_status_stdout.strip() == "Chassis Power is off":
if ipmi_final_status_stdout.strip() == "Chassis Power is off":
# We failed to reboot the node but it is powered off; it has probably suffered a serious hardware failure, but the node is confirmed off so we can call it a successful fence
logger.out(
"Chassis power is in confirmed off state after failed IPMI reboot; proceeding with fence recovery action",

View File

@ -743,7 +743,7 @@ def node_keepalive(logger, config, zkhandler, this_node, netstats):
# Get node performance statistics
this_node.memtotal = int(psutil.virtual_memory().total / 1024 / 1024)
this_node.memused = int(psutil.virtual_memory().used / 1024 / 1024)
this_node.memfree = int(psutil.virtual_memory().free / 1024 / 1024)
this_node.memfree = int(psutil.virtual_memory().available / 1024 / 1024)
this_node.cpuload = round(os.getloadavg()[0], 2)
# Get node network statistics via netstats instance

View File

@ -44,7 +44,7 @@ from daemon_lib.vmbuilder import (
)
# Daemon version
version = "0.9.96"
version = "0.9.97"
config = cfg.get_configuration()