Compare commits: c08c3b2d7d...a6f8500309 (2 commits)

Author | SHA1 | Date
---|---|---
| a6f8500309 |
| ebec1332e9 |
daemon-common/migrations/versions/15.json (Normal file, 1 line added)

@@ -0,0 +1 @@
+{"version": "15", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.fence_lock": "/config/fence_lock", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "logs": "/logs", "faults": "/faults", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.health": "/ceph/health", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "faults": {"id": "", "last_time": "/last_time", "first_time": "/first_time", "ack_time": "/ack_time", "status": "/status", "delta": "/delta", "message": "/message"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", "monitoring.health": "/monitoring_health", "network.stats": "/network_stats"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.migrate_max_downtime": "/migration_max_downtime", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock", "snapshots": "/snapshots"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "domain_snapshot": {"name": "", "timestamp": "/timestamp", "xml": "/xml", "rbd_snapshots": "/rbdsnaplist"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": "/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "is_split": "/is_split", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}}
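Note: the migration file is a flat map from friendly key names to Zookeeper paths, one sub-map per object class, so a dotted key such as base.config.fence_lock resolves to /config/fence_lock. A minimal sketch of that lookup, reading the new file directly; the resolve() helper and the local file path are illustrative only and not part of this changeset:

import json

# Illustrative sketch: resolve a dotted schema key against the version-15 map.
# The relative path assumes the repository checkout layout shown above.
with open("daemon-common/migrations/versions/15.json") as fh:
    schema = json.load(fh)


def resolve(schema_map, dotted_key):
    # "base.config.fence_lock" -> section "base", key "config.fence_lock"
    section, key = dotted_key.split(".", 1)
    return schema_map[section][key]


print(resolve(schema, "base.config.fence_lock"))  # /config/fence_lock
print(resolve(schema, "base.schema.version"))     # /schema/version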
@@ -30,7 +30,7 @@ from kazoo.client import KazooClient, KazooState
 from kazoo.exceptions import NoNodeError
 
 
-SCHEMA_ROOT_PATH = "/usr/share/pvc/daemon_lib/migrations/versions"
+SCHEMA_ROOT_PATH = "daemon_lib/migrations/versions"
 
 
 #
@@ -576,7 +576,7 @@ class ZKHandler(object):
 #
 class ZKSchema(object):
     # Current version
-    _version = 14
+    _version = 15
 
     # Root for doing nested keys
     _schema_root = ""
@@ -592,6 +592,7 @@ class ZKSchema(object):
         "schema.version": f"{_schema_root}/schema/version",
         "config": f"{_schema_root}/config",
         "config.maintenance": f"{_schema_root}/config/maintenance",
+        "config.fence_lock": f"{_schema_root}/config/fence_lock",
         "config.primary_node": f"{_schema_root}/config/primary_node",
         "config.primary_node.sync_lock": f"{_schema_root}/config/primary_node/sync_lock",
         "config.upstream_ip": f"{_schema_root}/config/upstream_ip",
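Note: the new config.fence_lock path exists solely to back a cluster-wide exclusive lock, which the fence monitor below acquires via zkhandler.exclusivelock("base.config.fence_lock") so that only one coordinator processes fences per pass. A rough sketch of the same pattern using kazoo's lock recipe directly; the connection string and lock identifier below are placeholders, not values from this changeset:

from kazoo.client import KazooClient
from kazoo.exceptions import LockTimeout

# Illustrative sketch of an exclusive Zookeeper lock on the fence_lock key.
zk = KazooClient(hosts="127.0.0.1:2181")  # placeholder Zookeeper endpoint
zk.start()

lock = zk.Lock("/config/fence_lock", identifier="this-node")  # placeholder identifier
try:
    # Give up just before the next keepalive would fire (e.g. 4s with a 5s interval)
    lock.acquire(timeout=4)
    # ... scan nodes and fence any that look dead ...
except LockTimeout:
    pass  # another coordinator already holds the lock; skip this pass
finally:
    if lock.is_acquired:
        lock.release()
    zk.stop()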
@@ -21,15 +21,72 @@
 
 import time
 
+from kazoo.exceptions import LockTimeout
+
 import daemon_lib.common as common
 
 from daemon_lib.vm import vm_worker_flush_locks
 
 
 #
-# Fence thread entry function
+# Fence monitor thread entrypoint
 #
-def fence_node(node_name, zkhandler, config, logger):
+def fence_monitor(zkhandler, config, logger):
+    # Attempt to acquire an exclusive lock on the fence_lock key
+    # If it is already held, we'll abort since another node is processing fences
+    lock = zkhandler.exclusivelock("base.config.fence_lock")
+
+    try:
+        lock.acquire(timeout=config["keepalive_interval"] - 1)
+
+        for node_name in zkhandler.children("base.node"):
+            try:
+                node_daemon_state = zkhandler.read(("node.state.daemon", node_name))
+                node_keepalive = int(zkhandler.read(("node.keepalive", node_name)))
+            except Exception:
+                node_daemon_state = "unknown"
+                node_keepalive = 0
+
+            node_deadtime = int(time.time()) - (
+                int(config["keepalive_interval"]) * int(config["fence_intervals"])
+            )
+            if node_keepalive < node_deadtime and node_daemon_state == "run":
+                logger.out(
+                    f"Node {node_name} seems dead; starting monitor for fencing",
+                    state="w",
+                )
+                zk_lock = zkhandler.writelock(("node.state.daemon", node_name))
+                with zk_lock:
+                    # Ensures that, if we lost the lock race and come out of waiting,
+                    # we won't try to trigger our own fence thread.
+                    if zkhandler.read(("node.state.daemon", node_name)) != "dead":
+                        # Write the updated data after we start the fence thread
+                        zkhandler.write([(("node.state.daemon", node_name), "dead")])
+                        # Start the fence monitoring task for this node
+                        # NOTE: This is not a subthread and is designed to block this for loop
+                        # This ensures that only one node is ever being fenced at a time
+                        fence_node(zkhandler, config, logger, node_name)
+            else:
+                logger.out(
+                    f"Node {node_name} is OK; last checkin is {node_deadtime - node_keepalive}s from threshold, node state is '{node_daemon_state}'",
+                    state="d",
+                    prefix="fence-thread",
+                )
+    except LockTimeout:
+        logger.out(
+            "Fence monitor thread failed to acquire exclusive lock; skipping", state="i"
+        )
+    except Exception as e:
+        logger.out(f"Fence monitor thread failed: {e}", state="w")
+    finally:
+        # We're finished, so release the global lock
+        lock.release()
+
+
+#
+# Fence action function
+#
+def fence_node(zkhandler, config, logger, node_name):
     # We allow exactly 6 saving throws (30 seconds) for the host to come back online or we kill it
     failcount_limit = 6
     failcount = 0
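Note on the deadtime check in fence_monitor: a node becomes eligible for fencing once its last keepalive timestamp is older than keepalive_interval * fence_intervals seconds while its daemon state still reads "run". A toy walk-through of that arithmetic, using purely illustrative values (a 5-second keepalive_interval and 6 fence_intervals, i.e. a 30-second window; the real values come from the daemon configuration):

import time

# Illustrative values only; the real ones come from config["keepalive_interval"]
# and config["fence_intervals"].
keepalive_interval = 5
fence_intervals = 6

now = int(time.time())
node_keepalive = now - 45        # pretend the node last checked in 45s ago
node_daemon_state = "run"

node_deadtime = now - (keepalive_interval * fence_intervals)  # now - 30s
if node_keepalive < node_deadtime and node_daemon_state == "run":
    print("node seems dead; it would be marked 'dead' and fenced")
else:
    print(f"node OK; {node_deadtime - node_keepalive}s from threshold")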
@@ -202,6 +259,9 @@ def migrateFromFencedNode(zkhandler, node_name, config, logger):
 
     # Loop through the VMs
     for dom_uuid in dead_node_running_domains:
+        if dom_uuid in ["0", 0]:
+            # Skip the invalid "0" UUID we sometimes get
+            continue
         try:
             fence_migrate_vm(dom_uuid)
         except Exception as e:
@@ -877,44 +877,12 @@ def node_keepalive(logger, config, zkhandler, this_node, netstats):
         )
 
     # Look for dead nodes and fence them
-    if not this_node.maintenance:
+    if not this_node.maintenance and config["daemon_mode"] == "coordinator":
         logger.out(
             "Look for dead nodes and fence them", state="d", prefix="main-thread"
         )
-        if config["daemon_mode"] == "coordinator":
-            for node_name in zkhandler.children("base.node"):
-                try:
-                    node_daemon_state = zkhandler.read(("node.state.daemon", node_name))
-                    node_keepalive = int(zkhandler.read(("node.keepalive", node_name)))
-                except Exception:
-                    node_daemon_state = "unknown"
-                    node_keepalive = 0
-
-                # Handle deadtime and fencng if needed
-                # (A node is considered dead when its keepalive timer is >6*keepalive_interval seconds
-                # out-of-date while in 'start' state)
-                node_deadtime = int(time.time()) - (
-                    int(config["keepalive_interval"]) * int(config["fence_intervals"])
-                )
-                if node_keepalive < node_deadtime and node_daemon_state == "run":
-                    logger.out(
-                        "Node {} seems dead - starting monitor for fencing".format(
-                            node_name
-                        ),
-                        state="w",
-                    )
-                    zk_lock = zkhandler.writelock(("node.state.daemon", node_name))
-                    with zk_lock:
-                        # Ensures that, if we lost the lock race and come out of waiting,
-                        # we won't try to trigger our own fence thread.
-                        if zkhandler.read(("node.state.daemon", node_name)) != "dead":
-                            fence_thread = Thread(
-                                target=pvcnoded.util.fencing.fence_node,
-                                args=(node_name, zkhandler, config, logger),
-                                kwargs={},
-                            )
-                            fence_thread.start()
-                            # Write the updated data after we start the fence thread
-                            zkhandler.write(
-                                [(("node.state.daemon", node_name), "dead")]
-                            )
+        fence_monitor_thread = Thread(
+            target=pvcnoded.util.fencing.fence_monitor,
+            args=(zkhandler, config, logger),
+        )
+        fence_monitor_thread.start()
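Note: with this change the keepalive path no longer inspects other nodes itself; on coordinators it hands the work to fence_monitor in a background thread, so a slow fence can never stall the keepalive loop, and the fence_lock above serializes monitor passes across coordinators. A bare-bones sketch of that hand-off; the function and argument names below are stand-ins, not the daemon's real entry points:

from threading import Thread

# Illustrative sketch: fire-and-forget launch of one monitor pass per keepalive tick.
def fence_monitor(zkhandler, config, logger):
    ...  # acquire base.config.fence_lock, scan nodes, fence any that are dead


def on_keepalive_tick(zkhandler, config, logger, this_node):
    if this_node.maintenance or config["daemon_mode"] != "coordinator":
        return
    Thread(target=fence_monitor, args=(zkhandler, config, logger)).start()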