Compare commits: v0.9.61...8896c6914c (329 commits)
.flake8

@@ -6,7 +6,7 @@
 ignore = W503, E501
 extend-ignore = E203
 # We exclude the Debian, migrations, and provisioner examples
-exclude = debian,api-daemon/migrations/versions,api-daemon/provisioner/examples
+exclude = debian,api-daemon/migrations/versions,api-daemon/provisioner/examples,node-daemon/monitoring
 # Set the max line length to 88 for Black
 max-line-length = 88
 
@@ -448,14 +448,40 @@ class API_Status(Resource):
               type: object
               id: ClusterStatus
               properties:
-                health:
+                cluster_health:
+                  type: object
+                  properties:
+                    health:
+                      type: integer
+                      description: The overall health (%) of the cluster
+                      example: 100
+                    messages:
+                      type: array
+                      description: A list of health event strings
+                      items:
+                        type: string
+                        example: "hv1: plugin 'nics': bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps"
+                node_health:
+                  type: object
+                  properties:
+                    hvX:
+                      type: object
+                      description: A node entry for per-node health details, one per node in the cluster
+                      properties:
+                        health:
+                          type: integer
+                          description: The health (%) of the node
+                          example: 100
+                        messages:
+                          type: array
+                          description: A list of health event strings
+                          items:
+                            type: string
+                            example: "'nics': bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps"
+                maintenance:
                   type: string
-                  description: The overall cluster health
-                  example: Optimal
-                storage_health:
-                  type: string
-                  description: The overall storage cluster health
-                  example: Optimal
+                  description: Whether the cluster is in maintenance mode or not (string boolean)
+                  example: true
                 primary_node:
                   type: string
                   description: The current primary coordinator node
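Put together, the reworked ClusterStatus schema above describes a response shaped roughly like the following sketch; the node name "hv1", the health value, and the message text are illustrative only, mirroring the examples in the docstring:

```python
# Hypothetical cluster status payload matching the new ClusterStatus schema;
# values and node names are illustrative, not taken from a real cluster.
cluster_status_example = {
    "cluster_health": {
        "health": 90,
        "messages": [
            "hv1: plugin 'nics': bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps",
        ],
    },
    "node_health": {
        "hv1": {
            "health": 90,
            "messages": [
                "'nics': bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps",
            ],
        },
    },
    "maintenance": "false",
    "primary_node": "hv1",
}
```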
@@ -605,6 +631,38 @@ class API_Node_Root(Resource):
                   arch:
                     type: string
                     description: The architecture of the CPU
+                  health:
+                    type: integer
+                    description: The overall health (%) of the node
+                    example: 100
+                  health_plugins:
+                    type: array
+                    description: A list of health plugin names currently loaded on the node
+                    items:
+                      type: string
+                      example: "nics"
+                  health_details:
+                    type: array
+                    description: A list of health plugin results
+                    items:
+                      type: object
+                      properties:
+                        name:
+                          type: string
+                          description: The name of the health plugin
+                          example: nics
+                        last_run:
+                          type: integer
+                          description: The UNIX timestamp (s) of the last plugin run
+                          example: 1676786078
+                        health_delta:
+                          type: integer
+                          description: The health delta (negatively applied to the health percentage) of the plugin's current state
+                          example: 10
+                        message:
+                          type: string
+                          description: The output message of the plugin
+                          example: "bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps"
                   load:
                     type: number
                     format: float
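For a single node object, the three new fields documented above nest as in this sketch; the values follow the schema examples and the node itself is hypothetical:

```python
# Illustrative slice of a node object with the new health fields; plugin
# name and message text are the examples from the schema above.
node_health_fields_example = {
    "health": 90,
    "health_plugins": ["nics"],
    "health_details": [
        {
            "name": "nics",
            "last_run": 1676786078,
            "health_delta": 10,
            "message": "bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps",
        },
    ],
}
```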
@@ -125,81 +125,56 @@ def format_info(cluster_information, oformat):
         return json.dumps(cluster_information, indent=4)
 
     # Plain formatting, i.e. human-readable
-    if cluster_information["health"] == "Optimal":
-        health_colour = ansiprint.green()
-    elif cluster_information["health"] == "Maintenance":
+    if cluster_information["maintenance"] == "true":
         health_colour = ansiprint.blue()
-    else:
+    elif cluster_information["cluster_health"]["health"] > 90:
+        health_colour = ansiprint.green()
+    elif cluster_information["cluster_health"]["health"] > 50:
         health_colour = ansiprint.yellow()
-
-    if cluster_information["storage_health"] == "Optimal":
-        storage_health_colour = ansiprint.green()
-    elif cluster_information["storage_health"] == "Maintenance":
-        storage_health_colour = ansiprint.blue()
     else:
-        storage_health_colour = ansiprint.yellow()
+        health_colour = ansiprint.red()
 
     ainformation = []
 
-    if oformat == "short":
-        ainformation.append(
-            "{}PVC cluster status:{}".format(ansiprint.bold(), ansiprint.end())
-        )
-        ainformation.append(
-            "{}Cluster health:{} {}{}{}".format(
-                ansiprint.purple(),
-                ansiprint.end(),
-                health_colour,
-                cluster_information["health"],
-                ansiprint.end(),
-            )
-        )
-        if cluster_information["health_msg"]:
-            for line in cluster_information["health_msg"]:
-                ainformation.append(" > {}".format(line))
-        ainformation.append(
-            "{}Storage health:{} {}{}{}".format(
-                ansiprint.purple(),
-                ansiprint.end(),
-                storage_health_colour,
-                cluster_information["storage_health"],
-                ansiprint.end(),
-            )
-        )
-        if cluster_information["storage_health_msg"]:
-            for line in cluster_information["storage_health_msg"]:
-                ainformation.append(" > {}".format(line))
-
-        return "\n".join(ainformation)
-
     ainformation.append(
         "{}PVC cluster status:{}".format(ansiprint.bold(), ansiprint.end())
     )
     ainformation.append("")
 
+    health_text = f"{cluster_information['cluster_health']['health']}%"
+    if cluster_information["maintenance"] == "true":
+        health_text += " (maintenance on)"
+
     ainformation.append(
         "{}Cluster health:{} {}{}{}".format(
             ansiprint.purple(),
             ansiprint.end(),
             health_colour,
-            cluster_information["health"],
+            health_text,
             ansiprint.end(),
         )
     )
-    if cluster_information["health_msg"]:
-        for line in cluster_information["health_msg"]:
-            ainformation.append(" > {}".format(line))
-    ainformation.append(
-        "{}Storage health:{} {}{}{}".format(
-            ansiprint.purple(),
-            ansiprint.end(),
-            storage_health_colour,
-            cluster_information["storage_health"],
-            ansiprint.end(),
-        )
-    )
-    if cluster_information["storage_health_msg"]:
-        for line in cluster_information["storage_health_msg"]:
-            ainformation.append(" > {}".format(line))
+    if cluster_information["cluster_health"]["messages"]:
+        health_messages = "\n > ".join(
+            sorted(cluster_information["cluster_health"]["messages"])
+        )
+        ainformation.append(
+            "{}Health messages:{} > {}".format(
+                ansiprint.purple(),
+                ansiprint.end(),
+                health_messages,
+            )
+        )
+    else:
+        ainformation.append(
+            "{}Health messages:{} N/A".format(
+                ansiprint.purple(),
+                ansiprint.end(),
+            )
+        )
+
+    if oformat == "short":
+        return "\n".join(ainformation)
 
     ainformation.append("")
     ainformation.append(
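The colour selection in the hunk above reduces to a small threshold map. Here is a minimal standalone sketch of that logic, returning colour names instead of ansiprint escape codes:

```python
def cluster_health_colour(health_pct, maintenance):
    """Sketch of the new CLI colour thresholds; returns a colour name
    rather than an ansiprint escape sequence."""
    if maintenance == "true":
        return "blue"    # maintenance mode always renders blue
    if health_pct > 90:
        return "green"   # healthy
    if health_pct > 50:
        return "yellow"  # degraded
    return "red"         # severely degraded


print(cluster_health_colour(100, "false"))  # green
print(cluster_health_colour(75, "false"))   # yellow
print(cluster_health_colour(40, "true"))    # blue (maintenance wins)
```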
@@ -215,6 +215,19 @@ def node_list(
 # Output display functions
 #
 def getOutputColours(node_information):
+    node_health = node_information.get("health", "N/A")
+    if isinstance(node_health, int):
+        if node_health <= 50:
+            health_colour = ansiprint.red()
+        elif node_health <= 90:
+            health_colour = ansiprint.yellow()
+        elif node_health <= 100:
+            health_colour = ansiprint.green()
+        else:
+            health_colour = ansiprint.blue()
+    else:
+        health_colour = ansiprint.blue()
+
     if node_information["daemon_state"] == "run":
         daemon_state_colour = ansiprint.green()
     elif node_information["daemon_state"] == "stop":
@@ -251,6 +264,7 @@ def getOutputColours(node_information):
         mem_provisioned_colour = ""
 
     return (
+        health_colour,
         daemon_state_colour,
         coordinator_state_colour,
         domain_state_colour,
@@ -261,6 +275,7 @@ def getOutputColours(node_information):
 
 def format_info(node_information, long_output):
     (
+        health_colour,
         daemon_state_colour,
         coordinator_state_colour,
         domain_state_colour,
@@ -273,14 +288,56 @@ def format_info(node_information, long_output):
     # Basic information
     ainformation.append(
         "{}Name:{} {}".format(
-            ansiprint.purple(), ansiprint.end(), node_information["name"]
+            ansiprint.purple(),
+            ansiprint.end(),
+            node_information["name"],
         )
     )
     ainformation.append(
         "{}PVC Version:{} {}".format(
-            ansiprint.purple(), ansiprint.end(), node_information["pvc_version"]
+            ansiprint.purple(),
+            ansiprint.end(),
+            node_information["pvc_version"],
         )
     )
+
+    node_health = node_information.get("health", "N/A")
+    if isinstance(node_health, int):
+        node_health_text = f"{node_health}%"
+    else:
+        node_health_text = node_health
+    ainformation.append(
+        "{}Health:{} {}{}{}".format(
+            ansiprint.purple(),
+            ansiprint.end(),
+            health_colour,
+            node_health_text,
+            ansiprint.end(),
+        )
+    )
+
+    node_health_details = node_information.get("health_details", [])
+    if long_output:
+        node_health_messages = "\n ".join(
+            [f"{plugin['name']}: {plugin['message']}" for plugin in node_health_details]
+        )
+    else:
+        node_health_messages = "\n ".join(
+            [
+                f"{plugin['name']}: {plugin['message']}"
+                for plugin in node_health_details
+                if int(plugin.get("health_delta", 0)) > 0
+            ]
+        )
+
+    if len(node_health_messages) > 0:
+        ainformation.append(
+            "{}Health Plugin Details:{} {}".format(
+                ansiprint.purple(), ansiprint.end(), node_health_messages
+            )
+        )
+    ainformation.append("")
+
     ainformation.append(
         "{}Daemon State:{} {}{}{}".format(
             ansiprint.purple(),
@@ -308,11 +365,6 @@ def format_info(node_information, long_output):
             ansiprint.end(),
         )
     )
-    ainformation.append(
-        "{}Active VM Count:{} {}".format(
-            ansiprint.purple(), ansiprint.end(), node_information["domains_count"]
-        )
-    )
     if long_output:
         ainformation.append("")
         ainformation.append(
@@ -331,6 +383,11 @@ def format_info(node_information, long_output):
             )
         )
         ainformation.append("")
+    ainformation.append(
+        "{}Active VM Count:{} {}".format(
+            ansiprint.purple(), ansiprint.end(), node_information["domains_count"]
+        )
+    )
     ainformation.append(
         "{}Host CPUs:{} {}".format(
             ansiprint.purple(), ansiprint.end(), node_information["vcpu"]["total"]
@@ -397,6 +454,7 @@ def format_list(node_list, raw):
     # Determine optimal column widths
     node_name_length = 5
     pvc_version_length = 8
+    health_length = 7
     daemon_state_length = 7
     coordinator_state_length = 12
     domain_state_length = 7
@@ -417,6 +475,15 @@ def format_list(node_list, raw):
         _pvc_version_length = len(node_information.get("pvc_version", "N/A")) + 1
         if _pvc_version_length > pvc_version_length:
             pvc_version_length = _pvc_version_length
+        # node_health column
+        node_health = node_information.get("health", "N/A")
+        if isinstance(node_health, int):
+            node_health_text = f"{node_health}%"
+        else:
+            node_health_text = node_health
+        _health_length = len(node_health_text) + 1
+        if _health_length > health_length:
+            health_length = _health_length
         # daemon_state column
         _daemon_state_length = len(node_information["daemon_state"]) + 1
         if _daemon_state_length > daemon_state_length:
@@ -466,7 +533,10 @@ def format_list(node_list, raw):
     # Format the string (header)
     node_list_output.append(
         "{bold}{node_header: <{node_header_length}} {state_header: <{state_header_length}} {resource_header: <{resource_header_length}} {memory_header: <{memory_header_length}}{end_bold}".format(
-            node_header_length=node_name_length + pvc_version_length + 1,
+            node_header_length=node_name_length
+            + pvc_version_length
+            + health_length
+            + 2,
             state_header_length=daemon_state_length
             + coordinator_state_length
             + domain_state_length
@@ -484,7 +554,14 @@ def format_list(node_list, raw):
             bold=ansiprint.bold(),
             end_bold=ansiprint.end(),
             node_header="Nodes "
-            + "".join(["-" for _ in range(6, node_name_length + pvc_version_length)]),
+            + "".join(
+                [
+                    "-"
+                    for _ in range(
+                        6, node_name_length + pvc_version_length + health_length + 1
+                    )
+                ]
+            ),
             state_header="States "
             + "".join(
                 [
@@ -526,12 +603,13 @@ def format_list(node_list, raw):
     )
 
     node_list_output.append(
-        "{bold}{node_name: <{node_name_length}} {node_pvc_version: <{pvc_version_length}} \
+        "{bold}{node_name: <{node_name_length}} {node_pvc_version: <{pvc_version_length}} {node_health: <{health_length}} \
 {daemon_state_colour}{node_daemon_state: <{daemon_state_length}}{end_colour} {coordinator_state_colour}{node_coordinator_state: <{coordinator_state_length}}{end_colour} {domain_state_colour}{node_domain_state: <{domain_state_length}}{end_colour} \
 {node_domains_count: <{domains_count_length}} {node_cpu_count: <{cpu_count_length}} {node_load: <{load_length}} \
 {node_mem_total: <{mem_total_length}} {node_mem_used: <{mem_used_length}} {node_mem_free: <{mem_free_length}} {node_mem_allocated: <{mem_alloc_length}} {node_mem_provisioned: <{mem_prov_length}}{end_bold}".format(
             node_name_length=node_name_length,
             pvc_version_length=pvc_version_length,
+            health_length=health_length,
             daemon_state_length=daemon_state_length,
             coordinator_state_length=coordinator_state_length,
             domain_state_length=domain_state_length,
@@ -551,6 +629,7 @@ def format_list(node_list, raw):
             end_colour="",
             node_name="Name",
             node_pvc_version="Version",
+            node_health="Health",
             node_daemon_state="Daemon",
             node_coordinator_state="Coordinator",
             node_domain_state="Domain",
@@ -568,19 +647,28 @@ def format_list(node_list, raw):
     # Format the string (elements)
     for node_information in sorted(node_list, key=lambda n: n["name"]):
         (
+            health_colour,
             daemon_state_colour,
            coordinator_state_colour,
             domain_state_colour,
             mem_allocated_colour,
             mem_provisioned_colour,
         ) = getOutputColours(node_information)
+
+        node_health = node_information.get("health", "N/A")
+        if isinstance(node_health, int):
+            node_health_text = f"{node_health}%"
+        else:
+            node_health_text = node_health
+
         node_list_output.append(
-            "{bold}{node_name: <{node_name_length}} {node_pvc_version: <{pvc_version_length}} \
+            "{bold}{node_name: <{node_name_length}} {node_pvc_version: <{pvc_version_length}} {health_colour}{node_health: <{health_length}}{end_colour} \
 {daemon_state_colour}{node_daemon_state: <{daemon_state_length}}{end_colour} {coordinator_state_colour}{node_coordinator_state: <{coordinator_state_length}}{end_colour} {domain_state_colour}{node_domain_state: <{domain_state_length}}{end_colour} \
 {node_domains_count: <{domains_count_length}} {node_cpu_count: <{cpu_count_length}} {node_load: <{load_length}} \
 {node_mem_total: <{mem_total_length}} {node_mem_used: <{mem_used_length}} {node_mem_free: <{mem_free_length}} {mem_allocated_colour}{node_mem_allocated: <{mem_alloc_length}}{end_colour} {mem_provisioned_colour}{node_mem_provisioned: <{mem_prov_length}}{end_colour}{end_bold}".format(
                 node_name_length=node_name_length,
                 pvc_version_length=pvc_version_length,
+                health_length=health_length,
                 daemon_state_length=daemon_state_length,
                 coordinator_state_length=coordinator_state_length,
                 domain_state_length=domain_state_length,
@@ -594,6 +682,7 @@ def format_list(node_list, raw):
                 mem_prov_length=mem_prov_length,
                 bold="",
                 end_bold="",
+                health_colour=health_colour,
                 daemon_state_colour=daemon_state_colour,
                 coordinator_state_colour=coordinator_state_colour,
                 domain_state_colour=domain_state_colour,
@@ -602,6 +691,7 @@ def format_list(node_list, raw):
                 end_colour=ansiprint.end(),
                 node_name=node_information["name"],
                 node_pvc_version=node_information.get("pvc_version", "N/A"),
+                node_health=node_health_text,
                 node_daemon_state=node_information["daemon_state"],
                 node_coordinator_state=node_information["coordinator_state"],
                 node_domain_state=node_information["domain_state"],
@@ -697,15 +697,29 @@ def node_log(node, lines, follow):
     default=False,
     help="Display more detailed information.",
 )
+@click.option(
+    "-f",
+    "--format",
+    "oformat",
+    default="plain",
+    show_default=True,
+    type=click.Choice(["plain", "json", "json-pretty"]),
+    help="Output format of node status information.",
+)
 @cluster_req
-def node_info(node, long_output):
+def node_info(node, long_output, oformat):
     """
     Show information about node NODE. If unspecified, defaults to this host.
     """
 
     retcode, retdata = pvc_node.node_info(config, node)
     if retcode:
-        retdata = pvc_node.format_info(retdata, long_output)
+        if oformat == "json":
+            retdata = json.dumps(retdata)
+        elif oformat == "json-pretty":
+            retdata = json.dumps(retdata, indent=4)
+        else:
+            retdata = pvc_node.format_info(retdata, long_output)
     cleanup(retcode, retdata)
 
@@ -158,6 +158,19 @@ def get_status(zkhandler):
     return True, status_data
 
 
+def get_health(zkhandler):
+    primary_node = zkhandler.read("base.config.primary_node")
+    ceph_health = zkhandler.read("base.storage.health").rstrip()
+
+    # Create a data structure for the information
+    status_data = {
+        "type": "health",
+        "primary_node": primary_node,
+        "ceph_data": ceph_health,
+    }
+
+    return True, status_data
+
+
 def get_util(zkhandler):
     primary_node = zkhandler.read("base.config.primary_node")
     ceph_df = zkhandler.read("base.storage.util").rstrip()
@@ -19,7 +19,7 @@
 #
 ###############################################################################
 
-import re
+from json import loads
 
 import daemon_lib.common as common
 import daemon_lib.vm as pvc_vm
@@ -44,13 +44,152 @@ def set_maintenance(zkhandler, maint_state):
     return True, "Successfully set cluster in normal mode"
 
 
+def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list):
+    health_delta_map = {
+        "node_stopped": 50,
+        "node_flushed": 10,
+        "vm_stopped": 10,
+        "osd_out": 50,
+        "osd_down": 10,
+        "memory_overprovisioned": 50,
+        "ceph_err": 50,
+        "ceph_warn": 10,
+    }
+
+    # Generate total cluster health numbers
+    cluster_health_value = 100
+    cluster_health_messages = list()
+
+    for index, node in enumerate(node_list):
+        # Apply node health values to total health number
+        try:
+            node_health_int = int(node["health"])
+        except Exception:
+            node_health_int = 100
+        cluster_health_value -= 100 - node_health_int
+
+        for entry in node["health_details"]:
+            if entry["health_delta"] > 0:
+                cluster_health_messages.append(
+                    f"{node['name']}: plugin '{entry['name']}': {entry['message']}"
+                )
+
+        # Handle unhealthy node states
+        if node["daemon_state"] not in ["run"]:
+            cluster_health_value -= health_delta_map["node_stopped"]
+            cluster_health_messages.append(
+                f"cluster: Node {node['name']} in {node['daemon_state'].upper()} daemon state"
+            )
+        elif node["domain_state"] not in ["ready"]:
+            cluster_health_value -= health_delta_map["node_flushed"]
+            cluster_health_messages.append(
+                f"cluster: Node {node['name']} in {node['domain_state'].upper()} domain state"
+            )
+
+    for index, vm in enumerate(vm_list):
+        # Handle unhealthy VM states
+        if vm["state"] in ["stop", "fail"]:
+            cluster_health_value -= health_delta_map["vm_stopped"]
+            cluster_health_messages.append(
+                f"cluster: VM {vm['name']} in {vm['state'].upper()} state"
+            )
+
+    for index, ceph_osd in enumerate(ceph_osd_list):
+        in_texts = {1: "in", 0: "out"}
+        up_texts = {1: "up", 0: "down"}
+
+        # Handle unhealthy OSD states
+        if in_texts[ceph_osd["stats"]["in"]] not in ["in"]:
+            cluster_health_value -= health_delta_map["osd_out"]
+            cluster_health_messages.append(
+                f"cluster: Ceph OSD {ceph_osd['id']} in {in_texts[ceph_osd['stats']['in']].upper()} state"
+            )
+        elif up_texts[ceph_osd["stats"]["up"]] not in ["up"]:
+            cluster_health_value -= health_delta_map["osd_down"]
+            cluster_health_messages.append(
+                f"cluster: Ceph OSD {ceph_osd['id']} in {up_texts[ceph_osd['stats']['up']].upper()} state"
+            )
+
+    # Check for (n-1) overprovisioning
+    # Assume X nodes. If the total VM memory allocation (counting only running VMss) is greater than
+    # the total memory of the (n-1) smallest nodes, trigger this warning.
+    n_minus_1_total = 0
+    alloc_total = 0
+    node_largest_index = None
+    node_largest_count = 0
+    for index, node in enumerate(node_list):
+        node_mem_total = node["memory"]["total"]
+        node_mem_alloc = node["memory"]["allocated"]
+        alloc_total += node_mem_alloc
+        # Determine if this node is the largest seen so far
+        if node_mem_total > node_largest_count:
+            node_largest_index = index
+            node_largest_count = node_mem_total
+    n_minus_1_node_list = list()
+    for index, node in enumerate(node_list):
+        if index == node_largest_index:
+            continue
+        n_minus_1_node_list.append(node)
+    for index, node in enumerate(n_minus_1_node_list):
+        n_minus_1_total += node["memory"]["total"]
+    if alloc_total > n_minus_1_total:
+        cluster_health_value -= health_delta_map["memory_overprovisioned"]
+        cluster_health_messages.append(
+            f"cluster: Total memory is OVERPROVISIONED ({alloc_total} > {n_minus_1_total} @ N-1)"
+        )
+
+    # Check Ceph cluster health
+    ceph_health = loads(zkhandler.read("base.storage.health"))
+    ceph_health_status = ceph_health["status"]
+    ceph_health_entries = ceph_health["checks"].keys()
+
+    ceph_health_status_map = {
+        "HEALTH_ERR": "ERROR",
+        "HEALTH_WARN": "WARNING",
+    }
+    for entry in ceph_health_entries:
+        cluster_health_messages.append(
+            f"cluster: Ceph {ceph_health_status_map[ceph_health['checks'][entry]['severity']]} {entry}: {ceph_health['checks'][entry]['summary']['message']}"
+        )
+
+    if ceph_health_status == "HEALTH_ERR":
+        cluster_health_value -= health_delta_map["ceph_err"]
+    elif ceph_health_status == "HEALTH_WARN":
+        cluster_health_value -= health_delta_map["ceph_warn"]
+
+    if cluster_health_value < 0:
+        cluster_health_value = 0
+
+    cluster_health = {
+        "health": cluster_health_value,
+        "messages": cluster_health_messages,
+    }
+
+    return cluster_health
+
+
+def getNodeHealth(zkhandler, node_list):
+    node_health = dict()
+    for index, node in enumerate(node_list):
+        node_health_messages = list()
+        node_health_value = node["health"]
+        for entry in node["health_details"]:
+            if entry["health_delta"] > 0:
+                node_health_messages.append(f"'{entry['name']}': {entry['message']}")
+
+        node_health_entry = {
+            "health": node_health_value,
+            "messages": node_health_messages,
+        }
+
+        node_health[node["name"]] = node_health_entry
+
+    return node_health
+
+
 def getClusterInformation(zkhandler):
     # Get cluster maintenance state
-    maint_state = zkhandler.read("base.config.maintenance")
-
-    # List of messages to display to the clients
-    cluster_health_msg = []
-    storage_health_msg = []
+    maintenance_state = zkhandler.read("base.config.maintenance")
 
     # Get node information object list
     retcode, node_list = pvc_node.get_list(zkhandler, None)
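As a rough worked example of the scoring above, consider a hypothetical three-node cluster with one node at 90% health, one flushed node, and one down OSD: the value starts at 100 and loses 10 for the degraded node, 10 for the flushed node, and 10 for the down OSD.

```python
# Minimal sketch of the cluster health arithmetic with hypothetical inputs;
# the real getClusterHealth() also inspects VM states, N-1 memory
# overprovisioning, and the Ceph health checks.
health_delta_map = {"node_flushed": 10, "osd_down": 10}

cluster_health_value = 100
for node_health in [100, 90, 100]:               # per-node health percentages
    cluster_health_value -= 100 - node_health    # -10 for the 90% node

cluster_health_value -= health_delta_map["node_flushed"]  # one flushed node
cluster_health_value -= health_delta_map["osd_down"]      # one down OSD

cluster_health_value = max(cluster_health_value, 0)        # floor at 0
print(cluster_health_value)  # 70
```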
@@ -78,135 +217,6 @@ def getClusterInformation(zkhandler):
     ceph_volume_count = len(ceph_volume_list)
     ceph_snapshot_count = len(ceph_snapshot_list)
 
-    # Determinations for general cluster health
-    cluster_healthy_status = True
-    # Check for (n-1) overprovisioning
-    # Assume X nodes. If the total VM memory allocation (counting only running VMss) is greater than
-    # the total memory of the (n-1) smallest nodes, trigger this warning.
-    n_minus_1_total = 0
-    alloc_total = 0
-
-    node_largest_index = None
-    node_largest_count = 0
-    for index, node in enumerate(node_list):
-        node_mem_total = node["memory"]["total"]
-        node_mem_alloc = node["memory"]["allocated"]
-        alloc_total += node_mem_alloc
-
-        # Determine if this node is the largest seen so far
-        if node_mem_total > node_largest_count:
-            node_largest_index = index
-            node_largest_count = node_mem_total
-    n_minus_1_node_list = list()
-    for index, node in enumerate(node_list):
-        if index == node_largest_index:
-            continue
-        n_minus_1_node_list.append(node)
-    for index, node in enumerate(n_minus_1_node_list):
-        n_minus_1_total += node["memory"]["total"]
-    if alloc_total > n_minus_1_total:
-        cluster_healthy_status = False
-        cluster_health_msg.append(
-            "Total VM memory ({}) is overprovisioned (max {}) for (n-1) failure scenarios".format(
-                alloc_total, n_minus_1_total
-            )
-        )
-
-    # Determinations for node health
-    node_healthy_status = list(range(0, node_count))
-    node_report_status = list(range(0, node_count))
-    for index, node in enumerate(node_list):
-        daemon_state = node["daemon_state"]
-        domain_state = node["domain_state"]
-        if daemon_state != "run" and domain_state != "ready":
-            node_healthy_status[index] = False
-            cluster_health_msg.append(
-                "Node '{}' in {},{} state".format(
-                    node["name"], daemon_state, domain_state
-                )
-            )
-        else:
-            node_healthy_status[index] = True
-        node_report_status[index] = daemon_state + "," + domain_state
-
-    # Determinations for VM health
-    vm_healthy_status = list(range(0, vm_count))
-    vm_report_status = list(range(0, vm_count))
-    for index, vm in enumerate(vm_list):
-        vm_state = vm["state"]
-        if vm_state not in ["start", "disable", "migrate", "unmigrate", "provision"]:
-            vm_healthy_status[index] = False
-            cluster_health_msg.append(
-                "VM '{}' in {} state".format(vm["name"], vm_state)
-            )
-        else:
-            vm_healthy_status[index] = True
-        vm_report_status[index] = vm_state
-
-    # Determinations for OSD health
-    ceph_osd_healthy_status = list(range(0, ceph_osd_count))
-    ceph_osd_report_status = list(range(0, ceph_osd_count))
-    for index, ceph_osd in enumerate(ceph_osd_list):
-        try:
-            ceph_osd_up = ceph_osd["stats"]["up"]
-        except KeyError:
-            ceph_osd_up = 0
-
-        try:
-            ceph_osd_in = ceph_osd["stats"]["in"]
-        except KeyError:
-            ceph_osd_in = 0
-
-        up_texts = {1: "up", 0: "down"}
-        in_texts = {1: "in", 0: "out"}
-
-        if not ceph_osd_up or not ceph_osd_in:
-            ceph_osd_healthy_status[index] = False
-            cluster_health_msg.append(
-                "OSD {} in {},{} state".format(
-                    ceph_osd["id"], up_texts[ceph_osd_up], in_texts[ceph_osd_in]
-                )
-            )
-        else:
-            ceph_osd_healthy_status[index] = True
-        ceph_osd_report_status[index] = (
-            up_texts[ceph_osd_up] + "," + in_texts[ceph_osd_in]
-        )
-
-    # Find out the overall cluster health; if any element of a healthy_status is false, it's unhealthy
-    if maint_state == "true":
-        cluster_health = "Maintenance"
-    elif (
-        cluster_healthy_status is False
-        or False in node_healthy_status
-        or False in vm_healthy_status
-        or False in ceph_osd_healthy_status
-    ):
-        cluster_health = "Degraded"
-    else:
-        cluster_health = "Optimal"
-
-    # Find out our storage health from Ceph
-    ceph_status = zkhandler.read("base.storage").split("\n")
-    ceph_health = ceph_status[2].split()[-1]
-
-    # Parse the status output to get the health indicators
-    line_record = False
-    for index, line in enumerate(ceph_status):
-        if re.search("services:", line):
-            line_record = False
-        if line_record and len(line.strip()) > 0:
-            storage_health_msg.append(line.strip())
-        if re.search("health:", line):
-            line_record = True
-
-    if maint_state == "true":
-        storage_health = "Maintenance"
-    elif ceph_health != "HEALTH_OK":
-        storage_health = "Degraded"
-    else:
-        storage_health = "Optimal"
-
     # State lists
     node_state_combinations = [
         "run,ready",
@@ -237,13 +247,19 @@ def getClusterInformation(zkhandler):
         "unmigrate",
         "provision",
     ]
-    ceph_osd_state_combinations = ["up,in", "up,out", "down,in", "down,out"]
+    ceph_osd_state_combinations = [
+        "up,in",
+        "up,out",
+        "down,in",
+        "down,out",
+    ]
 
     # Format the Node states
     formatted_node_states = {"total": node_count}
     for state in node_state_combinations:
         state_count = 0
-        for node_state in node_report_status:
+        for node in node_list:
+            node_state = f"{node['daemon_state']},{node['domain_state']}"
             if node_state == state:
                 state_count += 1
         if state_count > 0:
@@ -253,17 +269,20 @@ def getClusterInformation(zkhandler):
     formatted_vm_states = {"total": vm_count}
     for state in vm_state_combinations:
         state_count = 0
-        for vm_state in vm_report_status:
-            if vm_state == state:
+        for vm in vm_list:
+            if vm["state"] == state:
                 state_count += 1
         if state_count > 0:
             formatted_vm_states[state] = state_count
 
     # Format the OSD states
+    up_texts = {1: "up", 0: "down"}
+    in_texts = {1: "in", 0: "out"}
     formatted_osd_states = {"total": ceph_osd_count}
     for state in ceph_osd_state_combinations:
         state_count = 0
-        for ceph_osd_state in ceph_osd_report_status:
+        for ceph_osd in ceph_osd_list:
+            ceph_osd_state = f"{up_texts[ceph_osd['stats']['up']]},{in_texts[ceph_osd['stats']['in']]}"
             if ceph_osd_state == state:
                 state_count += 1
         if state_count > 0:
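The state-count structures built by these loops end up looking something like the following sketch; the counts and state combinations are hypothetical, and combinations with a zero count are omitted by the `if state_count > 0` checks:

```python
# Hypothetical state summaries for a small cluster; keys other than "total"
# are the daemon_state,domain_state / VM state / up,in combinations counted
# by the loops above.
formatted_node_states = {"total": 3, "run,ready": 3}
formatted_vm_states = {"total": 5, "start": 4, "stop": 1}
formatted_osd_states = {"total": 4, "up,in": 3, "down,out": 1}
```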
@@ -271,10 +290,11 @@ def getClusterInformation(zkhandler):
 
     # Format the status data
     cluster_information = {
-        "health": cluster_health,
-        "health_msg": cluster_health_msg,
-        "storage_health": storage_health,
-        "storage_health_msg": storage_health_msg,
+        "cluster_health": getClusterHealth(
+            zkhandler, node_list, vm_list, ceph_osd_list
+        ),
+        "node_health": getNodeHealth(zkhandler, node_list),
+        "maintenance": maintenance_state,
         "primary_node": common.getPrimaryNode(zkhandler),
         "upstream_ip": zkhandler.read("base.config.upstream_ip"),
         "nodes": formatted_node_states,
daemon-common/migrations/versions/9.json (new file)

@@ -0,0 +1 @@
+{"version": "9", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "cmd": "/cmd", "cmd.node": "/cmd/nodes", "cmd.domain": "/cmd/domains", "cmd.ceph": "/cmd/ceph", "logs": "/logs", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.health": "/ceph/health", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", "monitoring.health": "/monitoring_health"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": "/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}}
@@ -21,6 +21,7 @@
 
 import time
 import re
+import json
 
 import daemon_lib.common as common
 
@@ -49,6 +50,44 @@ def getNodeInformation(zkhandler, node_name):
         zkhandler.read(("node.count.provisioned_domains", node_name))
     )
     node_running_domains = zkhandler.read(("node.running_domains", node_name)).split()
+    try:
+        node_health = int(zkhandler.read(("node.monitoring.health", node_name)))
+    except Exception:
+        node_health = "N/A"
+    try:
+        node_health_plugins = zkhandler.read(
+            ("node.monitoring.plugins", node_name)
+        ).split()
+    except Exception:
+        node_health_plugins = list()
+
+    node_health_details = list()
+    for plugin in node_health_plugins:
+        plugin_last_run = zkhandler.read(
+            ("node.monitoring.data", node_name, "monitoring_plugin.last_run", plugin)
+        )
+        plugin_health_delta = zkhandler.read(
+            (
+                "node.monitoring.data",
+                node_name,
+                "monitoring_plugin.health_delta",
+                plugin,
+            )
+        )
+        plugin_message = zkhandler.read(
+            ("node.monitoring.data", node_name, "monitoring_plugin.message", plugin)
+        )
+        plugin_data = zkhandler.read(
+            ("node.monitoring.data", node_name, "monitoring_plugin.data", plugin)
+        )
+        plugin_output = {
+            "name": plugin,
+            "last_run": int(plugin_last_run),
+            "health_delta": int(plugin_health_delta),
+            "message": plugin_message,
+            "data": json.loads(plugin_data),
+        }
+        node_health_details.append(plugin_output)
+
     # Construct a data structure to represent the data
     node_information = {
@@ -61,10 +100,16 @@ def getNodeInformation(zkhandler, node_name):
         "kernel": node_kernel,
         "os": node_os,
         "arch": node_arch,
+        "health": node_health,
+        "health_plugins": node_health_plugins,
+        "health_details": node_health_details,
         "load": node_load,
         "domains_count": node_domains_count,
         "running_domains": node_running_domains,
-        "vcpu": {"total": node_cpu_count, "allocated": node_vcpu_allocated},
+        "vcpu": {
+            "total": node_cpu_count,
+            "allocated": node_vcpu_allocated,
+        },
         "memory": {
             "total": node_mem_total,
             "allocated": node_mem_allocated,
@@ -540,7 +540,7 @@ class ZKHandler(object):
 #
 class ZKSchema(object):
     # Current version
-    _version = 8
+    _version = 9
 
     # Root for doing nested keys
     _schema_root = ""
@@ -569,6 +569,7 @@ class ZKSchema(object):
         "domain": f"{_schema_root}/domains",
         "network": f"{_schema_root}/networks",
         "storage": f"{_schema_root}/ceph",
+        "storage.health": f"{_schema_root}/ceph/health",
         "storage.util": f"{_schema_root}/ceph/util",
         "osd": f"{_schema_root}/ceph/osds",
         "pool": f"{_schema_root}/ceph/pools",
@@ -608,6 +609,18 @@ class ZKSchema(object):
         "sriov": "/sriov",
         "sriov.pf": "/sriov/pf",
         "sriov.vf": "/sriov/vf",
+        "monitoring.plugins": "/monitoring_plugins",
+        "monitoring.data": "/monitoring_data",
+        "monitoring.health": "/monitoring_health",
+    },
+    # The schema of an individual monitoring plugin data entry (/nodes/{node_name}/monitoring_data/{plugin})
+    "monitoring_plugin": {
+        "name": "",  # The root key
+        "last_run": "/last_run",
+        "health_delta": "/health_delta",
+        "message": "/message",
+        "data": "/data",
+        "runtime": "/runtime",
     },
     # The schema of an individual SR-IOV PF entry (/nodes/{node_name}/sriov/pf/{pf})
     "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"},  # The root key
@ -874,9 +887,10 @@ class ZKSchema(object):
|
|||||||
if not zkhandler.zk_conn.exists(nkipath):
|
if not zkhandler.zk_conn.exists(nkipath):
|
||||||
result = False
|
result = False
|
||||||
|
|
||||||
# One might expect child keys under node (specifically, sriov.pf and sriov.vf) to be
|
# One might expect child keys under node (specifically, sriov.pf, sriov.vf,
|
||||||
# managed here as well, but those are created automatically every time pvcnoded starts
|
# monitoring.data) to be managed here as well, but those are created
|
||||||
# and thus never need to be validated or applied.
|
# automatically every time pvcnoded started and thus never need to be validated
|
||||||
|
# or applied.
|
||||||
|
|
||||||
# These two have several children layers that must be parsed through
|
# These two have several children layers that must be parsed through
|
||||||
for elem in ["volume"]:
|
for elem in ["volume"]:
|
||||||
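The new schema keys above map each plugin's result onto per-node ZooKeeper paths of the form `/nodes/{node_name}/monitoring_data/{plugin}/{key}`. As a rough, hypothetical illustration only (PVC resolves these through `ZKSchema` and `zkhandler.read()`, not a standalone helper like this), composing one such path might look like:

```python
# Illustrative sketch: build the ZooKeeper path for one monitoring plugin key,
# following the layout documented above (/nodes/{node_name}/monitoring_data/{plugin}).
# The helper name and the empty schema root are assumptions for this sketch.
def monitoring_plugin_path(node_name: str, plugin: str, key: str) -> str:
    schema_root = ""  # ZKSchema._schema_root is the empty string in the schema above
    base = f"{schema_root}/nodes/{node_name}/monitoring_data/{plugin}"
    suffix = {
        "name": "",               # the root key
        "last_run": "/last_run",
        "health_delta": "/health_delta",
        "message": "/message",
        "data": "/data",
        "runtime": "/runtime",
    }[key]
    return f"{base}{suffix}"


# e.g. monitoring_plugin_path("hv1", "nics", "message")
# -> "/nodes/hv1/monitoring_data/nics/message"
```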
1 debian/pvc-daemon-node.install vendored
@@ -5,3 +5,4 @@ node-daemon/pvcnoded.service lib/systemd/system
 node-daemon/pvc.target lib/systemd/system
 node-daemon/pvcautoready.service lib/systemd/system
 node-daemon/monitoring usr/share/pvc
+node-daemon/plugins usr/share/pvc
@@ -132,6 +132,7 @@ pvc:
      target_selector: mem
  configuration:
    directories:
+     plugin_directory: "/usr/share/pvc/plugins"
      dynamic_directory: "/run/pvc"
      log_directory: "/var/log/pvc"
      console_log_directory: "/var/log/libvirt"
@@ -142,7 +143,7 @@ pvc:
      log_dates: True
      log_keepalives: True
      log_keepalive_cluster_details: True
-     log_keepalive_storage_details: True
+     log_keepalive_plugin_details: True
      console_log_lines: 1000
  networking:
    bridge_device: ens4
@@ -367,6 +368,12 @@ For most clusters, `mem` should be sufficient, but others may be used based on t
 * `memprov` looks at the provisioned memory, not the allocated memory; thus, stopped or disabled VMs are counted towards a node's memory for this selector, even though their memory is not actively in use.
 * `load` looks at the system load of the node in general, ignoring load in any particular VMs; if any VM's CPU usage changes, this value would be affected. This might be preferable on clusters with some very CPU intensive VMs.

+#### `system` → `configuration` → `directories` → `plugin_directory`
+
+* *optional*
+
+The directory to load node health plugins from. Defaults to `/usr/share/pvc/plugins` if unset as per default packaging; should only be overridden by advanced users.
+
 #### `system` → `configuration` → `directories` → `dynamic_directory`

 * *required*
@@ -421,11 +428,11 @@ Whether to log keepalive messages or not.

 Whether to log node status information during keepalives or not.

-#### `system` → `configuration` → `logging` → `log_keepalive_storage_details`
+#### `system` → `configuration` → `logging` → `log_keepalive_plugin_details`

 * *required*

-Whether to log storage cluster status information during keepalives or not.
+Whether to log node health plugin status information during keepalives or not.

 #### `system` → `configuration` → `logging` → `console_log_lines`
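To make the documented default concrete, here is a minimal, hypothetical sketch of the fallback described for `plugin_directory` (it assumes the configuration has been loaded into a nested dict following the `pvc` → `system` → `configuration` → `directories` headings above; the function name is not part of pvcnoded):

```python
# Illustrative only: apply the documented default for plugin_directory when the
# option is absent from a parsed pvcnoded configuration dict.
def get_plugin_directory(config: dict) -> str:
    directories = (
        config.get("pvc", {})
        .get("system", {})
        .get("configuration", {})
        .get("directories", {})
    )
    # Default matches the packaged location documented above
    return directories.get("plugin_directory", "/usr/share/pvc/plugins")
```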
@@ -15,15 +15,57 @@
     },
     "ClusterStatus": {
       "properties": {
-        "health": {
-          "description": "The overall cluster health",
-          "example": "Optimal",
+        "cluster_health": {
+          "properties": {
+            "health": {
+              "description": "The overall health (%) of the cluster",
+              "example": 100,
+              "type": "integer"
+            },
+            "messages": {
+              "description": "A list of health event strings",
+              "items": {
+                "example": "hv1: plugin 'nics': bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps",
+                "type": "string"
+              },
+              "type": "array"
+            }
+          },
+          "type": "object"
+        },
+        "maintenance": {
+          "description": "Whether the cluster is in maintenance mode or not (string boolean)",
+          "example": true,
           "type": "string"
         },
         "networks": {
           "description": "The total number of networks in the cluster",
           "type": "integer"
         },
+        "node_health": {
+          "properties": {
+            "hvX": {
+              "description": "A node entry for per-node health details, one per node in the cluster",
+              "properties": {
+                "health": {
+                  "description": "The health (%) of the node",
+                  "example": 100,
+                  "type": "integer"
+                },
+                "messages": {
+                  "description": "A list of health event strings",
+                  "items": {
+                    "example": "'nics': bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps",
+                    "type": "string"
+                  },
+                  "type": "array"
+                }
+              },
+              "type": "object"
+            }
+          },
+          "type": "object"
+        },
         "nodes": {
           "properties": {
             "state-combination": {
@@ -65,11 +107,6 @@
           "description": "The total number of snapshots in the storage cluster",
           "type": "integer"
         },
-        "storage_health": {
-          "description": "The overall storage cluster health",
-          "example": "Optimal",
-          "type": "string"
-        },
         "upstream_ip": {
           "description": "The cluster upstream IP address in CIDR format",
           "example": "10.0.0.254/24",
@@ -456,6 +493,48 @@
           "description": "The number of running domains (VMs)",
           "type": "integer"
         },
+        "health": {
+          "description": "The overall health (%) of the node",
+          "example": 100,
+          "type": "integer"
+        },
+        "health_details": {
+          "description": "A list of health plugin results",
+          "items": {
+            "properties": {
+              "health_delta": {
+                "description": "The health delta (negatively applied to the health percentage) of the plugin's current state",
+                "example": 10,
+                "type": "integer"
+              },
+              "last_run": {
+                "description": "The UNIX timestamp (s) of the last plugin run",
+                "example": 1676786078,
+                "type": "integer"
+              },
+              "message": {
+                "description": "The output message of the plugin",
+                "example": "bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps",
+                "type": "string"
+              },
+              "name": {
+                "description": "The name of the health plugin",
+                "example": "nics",
+                "type": "string"
+              }
+            },
+            "type": "object"
+          },
+          "type": "array"
+        },
+        "health_plugins": {
+          "description": "A list of health plugin names currently loaded on the node",
+          "items": {
+            "example": "nics",
+            "type": "string"
+          },
+          "type": "array"
+        },
         "kernel": {
           "desription": "The running kernel version from uname",
           "type": "string"
@@ -6177,7 +6256,7 @@
           "description": "The selector used to determine candidate nodes during migration; see 'target_selector' in the node daemon configuration reference",
           "enum": [
             "mem",
-            "memfree",
+            "memprov",
             "vcpus",
             "load",
             "vms",
@@ -6336,7 +6415,7 @@
           "description": "The selector used to determine candidate nodes during migration; see 'target_selector' in the node daemon configuration reference",
           "enum": [
             "mem",
-            "memfree",
+            "memprov",
             "vcpus",
             "load",
             "vms",
@@ -6597,7 +6676,7 @@
           "description": "The selector used to determine candidate nodes during migration; see 'target_selector' in the node daemon configuration reference",
           "enum": [
             "mem",
-            "memfree",
+            "memprov",
             "vcpus",
             "load",
             "vms",
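For reference, a small hypothetical consumer of the revised ClusterStatus document sketched above; it shells out to `pvc --quiet status --format json`, the same call the CheckMK agent plugin below uses. None of this code is part of PVC itself; it only exercises the `cluster_health`, `node_health`, and `maintenance` fields documented in the schema:

```python
# Illustrative consumer of the new cluster status JSON (not part of PVC).
import json
import subprocess


def summarize_status(raw: str) -> None:
    status = json.loads(raw)
    print(
        f"cluster health: {status['cluster_health']['health']}% "
        f"(maintenance: {status['maintenance']})"
    )
    # node_health is keyed by node name; list any node below 100%
    for node, detail in status.get("node_health", {}).items():
        if detail["health"] < 100:
            msgs = ", ".join(detail.get("messages", []))
            print(f"  {node}: {detail['health']}% - {msgs}")


if __name__ == "__main__":
    out = subprocess.run(
        ["pvc", "--quiet", "status", "--format", "json"],
        capture_output=True, text=True, check=True,
    ).stdout
    summarize_status(out)
```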
@@ -2,23 +2,34 @@

 This directory contains several monitoring resources that can be used with various monitoring systems to track and alert on a PVC cluster system.

-### Munin
+## Munin

-The included munin plugin can be activated by linking to it from `/etc/munin/plugins/pvc`. By default, this plugin triggers a CRITICAL state when either the PVC or Storage cluster becomes Degraded, and is otherwise OK. The overall health is graphed numerically (Optimal is 0, Maintenance is 1, Degraded is 2) so that the cluster health can be tracked over time.
-
-When using this plugin, it might be useful to adjust the thresholds with a plugin configuration. For instance, one could adjust the Degraded value from CRITICAL to WARNING by adjusting the critical threshold to a value higher than 1.99 (e.g. 3, 10, etc.) so that only the WARNING threshold will be hit. Alternatively one could instead make Maintenance mode trigger a WARNING by lowering the threshold to 0.99.
-
-Example plugin configuration:
-
-```
-[pvc]
-# Make cluster warn on maintenance
-env.pvc_cluster_warning 0.99
-# Disable critical threshold (>2)
-env.pvc_cluster_critical 3
-# Make storage warn on maintenance, crit on degraded (latter is default)
-env.pvc_storage_warning 0.99
-env.pvc_storage_critical 1.99
-```
-
-### Check_MK
+The included Munin plugins can be activated by linking to them from `/etc/munin/plugins/`. Two plugins are provided:
+
+* `pvc`: Checks the PVC cluster and node health, as well as their status (OK/Warning/Critical, based on maintenance status), providing 4 graphs.
+
+* `ceph_utilization`: Checks the Ceph cluster statistics, providing multiple graphs. Note that this plugin is independent of PVC itself, and makes local calls to various Ceph commands itself.
+
+The `pvc` plugin provides no configuration; the status is hardcoded such that <=90% health is warning, <=50% health is critical, and maintenance state forces OK. The alerting is provided by two separate graphs from the health graph so that actual health state is logged regardless of alerting.
+
+The `ceph_utilization` plugin provides no configuration; only the cluster utilization graph alerts such that >80% used is warning and >90% used is critical. Ceph itself begins warning above 80% as well.
+
+## CheckMK
+
+The included CheckMK plugin is divided into two parts: the agent plugin, and the monitoring server plugin. This monitoring server plugin requires CheckMK version 2.0 or higher. The two parts can be installed as follows:
+
+* `pvc`: Place this file in the `/usr/lib/check_mk_agent/plugins/` directory on each node.
+
+* `pvc.py`: Place this file in the `~/local/lib/python3/cmk/base/plugins/agent_based/` directory on the CheckMK monitoring host for each monitoring site.
+
+The plugin provides no configuration: the status is hardcoded such that <=90% health is warning, <=50% health is critical, and maintenance state forces OK.
+
+With both the agent and server plugins installed, you can then run `cmk -II <node>` (or use WATO) to inventory each node, which should produce two new checks:
+
+* `PVC Cluster`: Provides the cluster-wide health. Note that this will be identical for all nodes in the cluster (i.e. if the cluster health drops, all nodes in the cluster will alert this check).
+
+* `PVC Node <shortname>`: Provides the per-node health.
+
+The "Summary" text, shown in the check lists, will be simplistic, only showing the current health percentage.
+
+The "Details" text, found in the specific check details, will show the full list of problem(s) the check finds, as shown by `pvc status` itself.
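The hardcoded alerting policy described above for both the Munin and CheckMK plugins reduces to a simple mapping. A hedged Python sketch for illustration only (the real plugins below implement this in Bash and in the CheckMK agent-based API):

```python
# Sketch of the alerting policy described in this README: <=90% health warns,
# <=50% health is critical, and maintenance mode forces an OK state.
# The string labels are illustrative, not the plugins' actual return values.
def alert_state(health_pct: int, maintenance: bool) -> str:
    if maintenance:
        return "OK"
    if health_pct <= 50:
        return "CRITICAL"
    if health_pct <= 90:
        return "WARNING"
    return "OK"


# e.g. alert_state(85, maintenance=False) -> "WARNING"
#      alert_state(40, maintenance=True)  -> "OK"
```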
6 node-daemon/monitoring/checkmk/pvc Executable file
@@ -0,0 +1,6 @@
#!/bin/bash

# PVC cluster status check for Check_MK (agent-side)

echo "<<<pvc>>>"
pvc --quiet status --format json
95 node-daemon/monitoring/checkmk/pvc.py Normal file
@@ -0,0 +1,95 @@
#!/usr/bin/env python3
#
# Check_MK PVC plugin
#
# Copyright 2017-2021, Joshua Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from .agent_based_api.v1 import *
from cmk.base.check_api import host_name
from time import time
from json import loads


def discover_pvc(section):
    my_node = host_name().split(".")[0]
    yield Service(item=f"PVC Node {my_node}")
    yield Service(item="PVC Cluster")


def check_pvc(item, params, section):
    state = State.OK
    summary = "Stuff"
    details = None
    data = loads(" ".join(section[0]))
    my_node = host_name().split(".")[0]

    maintenance_map = {
        "true": "on",
        "false": "off",
    }
    maintenance = maintenance_map[data["maintenance"]]

    # Node check
    if item == f"PVC Node {my_node}":
        my_node = host_name().split(".")[0]
        node_health = data["node_health"][my_node]["health"]
        node_messages = data["node_health"][my_node]["messages"]

        summary = f"Node health is {node_health}% (maintenance {maintenance})"

        if len(node_messages) > 0:
            details = ", ".join(node_messages)

        if node_health <= 50 and maintenance == "off":
            state = State.CRIT
        elif node_health <= 90 and maintenance == "off":
            state = State.WARN
        else:
            state = State.OK

        yield Metric(name="node-health", value=node_health)

    # Cluster check
    elif item == "PVC Cluster":
        cluster_health = data["cluster_health"]["health"]
        cluster_messages = data["cluster_health"]["messages"]

        summary = f"Cluster health is {cluster_health}% (maintenance {maintenance})"

        if len(cluster_messages) > 0:
            details = ", ".join(cluster_messages)

        if cluster_health <= 50 and maintenance == "off":
            state = State.CRIT
        elif cluster_health <= 90 and maintenance == "off":
            state = State.WARN
        else:
            state = State.OK

        yield Metric(name="cluster-health", value=cluster_health)

    yield Result(state=state, summary=summary, details=details)
    return


register.check_plugin(
    name="pvc",
    service_name="%s",
    check_ruleset_name="pvc",
    discovery_function=discover_pvc,
    check_function=check_pvc,
    check_default_parameters={},
)
@@ -7,23 +7,6 @@

 pvc - Plugin to monitor a PVC cluster.

-=head1 CONFIGURATION
-
-Note that due to how Munin thresholds work, these values must always be slightly less than 1 or 2 respectively,
-or the alerts will never be triggered.
-
-Defaults (no config required):
-
- [pvc]
- env.warning 1.99
- env.critical 1.99
-
-Make degraded cluster WARN only (max value is 2, so 3 effectively disables):
-
- [pvc]
- env.pvc_cluster_warning 1.99
- env.pvc_cluster_critical 3
-
 =head1 AUTHOR

 Joshua Boniface <joshua@boniface.me>
@@ -45,24 +28,17 @@ GPLv3

 . "$MUNIN_LIBDIR/plugins/plugin.sh"

-warning=1.99
-critical=1.99
+is_multigraph
+
+warning=1
+critical=2

 export PVC_CLIENT_DIR="/run/shm/munin-pvc"
 PVC_CMD="/usr/bin/pvc --quiet --cluster local status --format json-pretty"
 JQ_CMD="/usr/bin/jq"

 output_usage() {
-    echo "This plugin outputs numerical values based on the health of the PVC cluster."
-    echo
-    echo "There are separate outputs for both the PVC cluster itself as well as the Ceph storage cluster."
-    echo "In normal operation, i.e. when both clusters are in 'Optimal' state, the plugin returns 0 for"
-    echo "each cluster. When the cluster is placed into 'Maintenance' mode,the plugin returns 1 for each"
-    echo "cluster, and goes into WARN state (limit 0.99); this can be adjusted by overriding the WARNING"
-    echo "threshold of the plugin to something other than 0.99 - note that due to Munin's alerting design,"
-    echo "the warning value must always be very slightly below the whole number. When either cluster"
-    echo "element becomes 'Degraded', the plugin returns 2 for the relevant cluster, which is treated as a"
-    echo "critical. Like the WARNING threshold, this can be overridden, and with the same caveat about limit."
+    echo "This plugin outputs information about a PVC cluster and node"
     exit 0
 }

@@ -84,72 +60,102 @@ output_autoconf() {
 }

 output_config() {
-    echo 'graph_title PVC Clusters'
+    echo 'multigraph pvc_cluster_health'
+    echo 'graph_title PVC Cluster Health'
     echo 'graph_args --base 1000'
-    echo 'graph_vlabel Count'
+    echo 'graph_vlabel Health%'
     echo 'graph_category pvc'
-    echo 'graph_period second'
-    echo 'graph_info This graph shows the nodes in the PVC cluster.'
+    echo 'graph_info Health of the PVC cluster'

-    echo 'pvc_cluster.label Cluster Degradation'
-    echo 'pvc_cluster.type GAUGE'
-    echo 'pvc_cluster.max 2'
-    echo 'pvc_cluster.info Whether the PVC cluster is in a degraded state.'
-    print_warning pvc_cluster
-    print_critical pvc_cluster
+    echo 'pvc_cluster_health.label Cluster Health'
+    echo 'pvc_cluster_health.type GAUGE'
+    echo 'pvc_cluster_health.max 100'
+    echo 'pvc_cluster_health.min 0'
+    echo 'pvc_cluster_health.info Health of the PVC cluster in %'

-    echo 'pvc_storage.label Storage Degradation'
-    echo 'pvc_storage.type GAUGE'
-    echo 'pvc_storage.max 2'
-    echo 'pvc_storage.info Whether the storage cluster is in a degraded state.'
-    print_warning pvc_storage
-    print_critical pvc_storage
+    echo 'multigraph pvc_cluster_alert'
+    echo 'graph_title PVC Cluster Alerting'
+    echo 'graph_args --base 1000'
+    echo 'graph_vlabel State'
+    echo 'graph_category pvc'
+    echo 'graph_info Alerting state of the PVC cluster health'
+
+    echo 'pvc_cluster_alert.label Cluster Health State'
+    echo 'pvc_cluster_alert.type GAUGE'
+    echo 'pvc_cluster_alert.max 2'
+    echo 'pvc_cluster_alert.min 0'
+    echo 'pvc_cluster_alert.info Alerting state of the PVC cluster health'
+    print_warning pvc_cluster_alert
+    print_critical pvc_cluster_alert
+
+    echo 'multigraph pvc_node_health'
+    echo 'graph_title PVC Node Health'
+    echo 'graph_args --base 1000'
+    echo 'graph_vlabel Health%'
+    echo 'graph_category pvc'
+    echo 'graph_info Health of the PVC node'
+
+    echo 'pvc_node_health.label Node Health'
+    echo 'pvc_node_health.type GAUGE'
+    echo 'pvc_node_health.max 100'
+    echo 'pvc_node_health.min 0'
+    echo 'pvc_node_health.info Health of the PVC node in %'
+
+    echo 'multigraph pvc_node_alert'
+    echo 'graph_title PVC Node Alerting'
+    echo 'graph_args --base 1000'
+    echo 'graph_vlabel State'
+    echo 'graph_category pvc'
+    echo 'graph_info Alerting state of the PVC node health'
+
+    echo 'pvc_node_alert.label Node Health State'
+    echo 'pvc_node_alert.type GAUGE'
+    echo 'pvc_node_alert.max 2'
+    echo 'pvc_node_alert.min 0'
+    echo 'pvc_node_alert.info Alerting state of the PVC node health'
+    print_warning pvc_node_alert
+    print_critical pvc_node_alert

     exit 0
 }

 output_values() {
     PVC_OUTPUT="$( $PVC_CMD )"
+    HOST="$( hostname --short )"

-    cluster_health="$( $JQ_CMD '.health' <<<"${PVC_OUTPUT}" | tr -d '"' )"
-    cluster_failed_reason="$( $JQ_CMD -r '.health_msg | @csv' <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )"
-    case $cluster_health in
-        "Optimal")
-            cluster_value="0"
-            ;;
-        "Maintenance")
-            cluster_value="1"
-            ;;
-        "Degraded")
-            cluster_value="2"
-    esac
-
-    storage_health="$( $JQ_CMD '.storage_health' <<<"${PVC_OUTPUT}" | tr -d '"' )"
-    storage_failed_reason="$( $JQ_CMD -r '.storage_health_msg | @csv' <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )"
-    case $storage_health in
-        "Optimal")
-            storage_value="0"
-            ;;
-        "Maintenance")
-            storage_value="1"
-            ;;
-        "Degraded")
-            storage_value="2"
-    esac
-
-    echo "pvc_cluster.value $cluster_value"
-    if [[ $cluster_value -eq 1 ]]; then
-        echo "pvc_cluster.extinfo Cluster in maintenance mode"
-    elif [[ $cluster_value -eq 2 ]]; then
-        echo "pvc_cluster.extinfo ${cluster_failed_reason}"
-    fi
-    echo "pvc_storage.value $storage_value"
-    if [[ $storage_value -eq 1 ]]; then
-        echo "pvc_storage.extinfo Cluster in maintenance mode"
-    elif [[ $storage_value -eq 2 ]]; then
-        echo "pvc_storage.extinfo ${storage_failed_reason}"
-    fi
+    is_maintenance="$( $JQ_CMD ".maintenance" <<<"${PVC_OUTPUT}" | tr -d '"' )"
+
+    cluster_health="$( $JQ_CMD ".cluster_health.health" <<<"${PVC_OUTPUT}" | tr -d '"' )"
+    cluster_health_messages="$( $JQ_CMD -r ".cluster_health.messages | @csv" <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )"
+    echo 'multigraph pvc_cluster_health'
+    echo "pvc_cluster_health.value ${cluster_health}"
+    echo "pvc_cluster_health.extinfo ${cluster_health_messages}"
+
+    if [[ ${cluster_health} -le 50 && ${is_maintenance} == "false" ]]; then
+        cluster_health_alert=2
+    elif [[ ${cluster_health} -le 90 && ${is_maintenance} == "false" ]]; then
+        cluster_health_alert=1
+    else
+        cluster_health_alert=0
+    fi
+    echo 'multigraph pvc_cluster_alert'
+    echo "pvc_cluster_alert.value ${cluster_health_alert}"
+
+    node_health="$( $JQ_CMD ".node_health.${HOST}.health" <<<"${PVC_OUTPUT}" | tr -d '"' )"
+    node_health_messages="$( $JQ_CMD -r ".node_health.${HOST}.messages | @csv" <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )"
+    echo 'multigraph pvc_node_health'
+    echo "pvc_node_health.value ${node_health}"
+    echo "pvc_node_health.extinfo ${node_health_messages}"
+
+    if [[ ${node_health} -le 50 && ${is_maintenance} != "true" ]]; then
+        node_health_alert=2
+    elif [[ ${node_health} -le 90 && ${is_maintenance} != "true" ]]; then
+        node_health_alert=1
+    else
+        node_health_alert=0
+    fi
+    echo 'multigraph pvc_node_alert'
+    echo "pvc_node_alert.value ${node_health_alert}"
 }

 case $# in
167 node-daemon/plugins/disk Normal file
@@ -0,0 +1,167 @@
#!/usr/bin/env python3

# disk.py - PVC Monitoring example plugin for disk (system + OSD)
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the system and OSD disks for errors and faults and return
# a health delta corresponding to severity.

# This script can thus be used as an example or reference implementation of a
# PVC monitoring pluginscript and expanded upon as required.

# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.

# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.


# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin


# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "disk"


# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.

        If you wish for the plugin to not load in certain conditions, do any checks here
        and return a non-None failure message to indicate the error.
        """

        from daemon_lib.common import run_os_command
        from json import loads

        _, _all_disks, _ = run_os_command("lsblk --json --paths --include 8,259")
        try:
            all_disks = loads(_all_disks)
        except Exception as e:
            return f"Error loading lsblk JSON: {e}"

        disk_details = list()

        def get_smartinfo(disk, extra_opt=""):
            _, _smart_info, _ = run_os_command(f"smartctl --info --json {extra_opt} {disk}")
            try:
                smart_info = loads(_smart_info)
            except Exception as e:
                return None

            return smart_info

        for disk in [disk["name"] for disk in all_disks['blockdevices']]:
            extra_opt = ""
            smart_info = get_smartinfo(disk)
            if smart_info is None or smart_info["smartctl"]["exit_status"] > 1:
                continue
            elif smart_info["smartctl"]["exit_status"] == 1:
                if "requires option" in smart_info["smartctl"]["messages"][0]["string"]:
                    extra_opt = smart_info["smartctl"]["messages"][0]["string"].split("'")[1].replace('N','0')
                    smart_info = get_smartinfo(disk, extra_opt)
                    if smart_info is None or smart_info["smartctl"]["exit_status"] > 0:
                        continue
                else:
                    continue

            disk_type = smart_info["device"]["type"]

            disk_details.append((disk, extra_opt, disk_type))

        self.disk_details = disk_details


    def run(self):
        """
        run(): Perform the check actions and return a PluginResult object
        """

        # Re-run setup each time to ensure the disk details are current
        self.setup()

        # Run any imports first
        from daemon_lib.common import run_os_command
        from json import loads

        health_delta = 0
        messages = list()

        for _disk in self.disk_details:
            disk = _disk[0]
            extra_opt = _disk[1]
            disk_type = _disk[2]

            _, _smart_info, _ = run_os_command(f"smartctl --all --json {extra_opt} {disk}")
            try:
                smart_info = loads(_smart_info)
            except Exception as e:
                health_delta += 10
                messages.append(f"{disk} failed to load SMART data")
                continue

            if disk_type == 'nvme':
                for attribute in smart_info['nvme_smart_health_information_log'].items():
                    if attribute[0] == "critical_warning" and attribute[1] > 0:
                        health_delta += 10
                        messages.append(f"{disk} critical warning value {attribute[1]}")
                    if attribute[0] == "media_errors" and attribute[1] > 0:
                        health_delta += 10
                        messages.append(f"{disk} media errors value {attribute[1]}")
                    if attribute[0] == "percentage_used" and attribute[1] > 90:
                        health_delta += 10
                        messages.append(f"{disk} percentage used value {attribute[1]}%")
            else:
                for attribute in smart_info['ata_smart_attributes']['table']:
                    if attribute["when_failed"]:
                        health_delta += 10
                        messages.append(f"{disk} attribute {attribute['name']} value {attribute['raw']['value']}")

        if len(messages) < 1:
            messages.append(f"All {len(self.disk_details)} checked disks report OK: {', '.join([disk[0] for disk in self.disk_details])}")

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(', '.join(messages))

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly.
        """

        pass
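The comment block at the top of this example describes the contract every plugin must satisfy (a `PLUGIN_NAME` matching the file name, and a `MonitoringPluginScript` class extending `MonitoringPlugin` with `setup()`, `run()`, and `cleanup()`). Distilled into a minimal, hypothetical skeleton, where the name `example` and the trivial check are placeholders rather than anything shipped with PVC:

```python
#!/usr/bin/env python3
# Minimal skeleton of a PVC monitoring plugin, following the contract shown in
# the example plugins in this directory. "example" is a placeholder name and
# the check itself is illustrative only.

# This import is always required; MonitoringPlugin provides self.plugin_result
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin

# Must be identical to the file name
PLUGIN_NAME = "example"


class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        # Optional startup checks; return a non-None message to refuse to load
        pass

    def run(self):
        # Perform the check, then populate and return the PluginResult object
        health_delta = 0
        message = "example check OK"
        self.plugin_result.set_health_delta(health_delta)
        self.plugin_result.set_message(message)
        return self.plugin_result

    def cleanup(self):
        # Optional teardown during node daemon termination
        pass
```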
160 node-daemon/plugins/dpkg Normal file
@@ -0,0 +1,160 @@
#!/usr/bin/env python3

# dpkg.py - PVC Monitoring example plugin for dpkg status
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the system dpkg status is as expected, with no invalid
# packages or obsolete configuration files, and will return a 1 health delta for each
# flaw in invalid packages, upgradable packages, and obsolete config files.

# This script can thus be used as an example or reference implementation of a
# PVC monitoring pluginscript and expanded upon as required.

# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.

# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.


# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin


# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "dpkg"


# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.

        If you wish for the plugin to not load in certain conditions, do any checks here
        and return a non-None failure message to indicate the error.
        """

        pass

    def run(self):
        """
        run(): Perform the check actions and return a PluginResult object
        """

        # Run any imports first
        from re import match
        import daemon_lib.common as pvc_common

        # Get Debian version
        with open('/etc/debian_version', 'r') as fh:
            debian_version = fh.read().strip()

        # Get a list of dpkg packages for analysis
        retcode, stdout, stderr = pvc_common.run_os_command("/usr/bin/dpkg --list")

        # Get a list of installed packages and states
        packages = list()
        for dpkg_line in stdout.split('\n'):
            if match('^[a-z][a-z] ', dpkg_line):
                line_split = dpkg_line.split()
                package_state = line_split[0]
                package_name = line_split[1]
                packages.append((package_name, package_state))

        count_ok = 0
        count_inconsistent = 0
        list_inconsistent = list()

        for package in packages:
            if package[1] == "ii":
                count_ok += 1
            else:
                count_inconsistent += 1
                list_inconsistent.append(package[0])

        # Get upgradable packages
        retcode, stdout, stderr = pvc_common.run_os_command("/usr/bin/apt list --upgradable")

        list_upgradable = list()
        for apt_line in stdout.split('\n'):
            if match('^[a-z][a-z] ', apt_line):
                line_split = apt_line.split('/')
                package_name = line_split[0]
                list_upgradable.append(package_name)

        count_upgradable = len(list_upgradable)

        # Get obsolete config files (dpkg-* or ucf-* under /etc)
        retcode, stdout, stderr = pvc_common.run_os_command("/usr/bin/find /etc -type f -a \( -name '*.dpkg-*' -o -name '*.ucf-*' \)")

        obsolete_conffiles = list()
        for conffile_line in stdout.split('\n'):
            if conffile_line:
                obsolete_conffiles.append(conffile_line)

        count_obsolete_conffiles = len(obsolete_conffiles)

        # Set health_delta based on the results
        health_delta = 0
        if count_inconsistent > 0:
            health_delta += 1
        if count_upgradable > 0:
            health_delta += 1
        if count_obsolete_conffiles > 0:
            health_delta += 1

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Craft the message
        message = f"Debian {debian_version}; Obsolete conffiles: {count_obsolete_conffiles}; Packages inconsistent: {count_inconsistent}, upgradable: {count_upgradable}"

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(message)

        # Set the detailed data in our local PluginResult object
        detailed_data = {
            "debian_version": debian_version,
            "obsolete_conffiles": obsolete_conffiles,
            "inconsistent_packages": list_inconsistent,
            "upgradable_packages": list_upgradable,
        }
        self.plugin_result.set_data(detailed_data)

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly.
        """

        pass
106 node-daemon/plugins/edac Normal file
@@ -0,0 +1,106 @@
#!/usr/bin/env python3

# edac.py - PVC Monitoring example plugin for EDAC
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the system's EDAC registers and report any failures.

# This script can thus be used as an example or reference implementation of a
# PVC monitoring pluginscript and expanded upon as required.

# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.

# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.


# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin


# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "edac"


# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.

        If you wish for the plugin to not load in certain conditions, do any checks here
        and return a non-None failure message to indicate the error.
        """

        pass

    def run(self):
        """
        run(): Perform the check actions and return a PluginResult object
        """

        # Run any imports first
        import daemon_lib.common as common
        from re import match, search

        # Get edac-util output
        retcode, stdout, stderr = common.run_os_command('/usr/bin/edac-util')

        # If there's no errors, we're OK
        if match(r'^edac-util: No errors to report.', stdout):
            health_delta = 0
            message = "EDAC reports no errors"
        else:
            health_delta = 0
            message = "EDAC reports errors: "
            errors = list()
            for line in stdout.split('\n'):
                if match(r'^mc[0-9]: csrow', line):
                    if 'Uncorrected' in line:
                        health_delta = 50
                    errors.append(' '.join(line.split()[2:]))
            message += ', '.join(errors)

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(message)

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly.
        """

        pass
108 node-daemon/plugins/load Normal file
@@ -0,0 +1,108 @@
#!/usr/bin/env python3

# load.py - PVC Monitoring example plugin for load
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the system load against the total number of CPU cores,
# and return a 10 health delta (100 -> 90) if the load average is > 1/2 that number.

# This script can thus be used as an example or reference implementation of a
# PVC monitoring pluginscript and expanded upon as required.

# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.

# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.


# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin


# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "load"


# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.

        If you wish for the plugin to not load in certain conditions, do any checks here
        and return a non-None failure message to indicate the error.
        """

        pass

    def run(self):
        """
        run(): Perform the check actions and return a PluginResult object
        """

        # Run any imports first
        from os import getloadavg
        from psutil import cpu_count

        # Get the current 1-minute system load average
        load_average = getloadavg()[0]

        # Get the number of CPU cores
        cpu_cores = cpu_count()

        # Check that the load average is greater or equal to the cpu count
        if load_average > float(cpu_cores):
            # Set the health delta to 10 (subtract 10 from the total of 100)
            health_delta = 10
            # Craft a message that can be used by the clients
            message = f"Current load is {load_average} out of {cpu_cores} CPU cores"

        else:
            # Set the health delta to 0 (no change)
            health_delta = 0
            # Craft a message that can be used by the clients
            message = f"Current load is {load_average} out of {cpu_cores} CPU cores"

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(message)

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly.
        """

        pass
200 node-daemon/plugins/nics Normal file
@@ -0,0 +1,200 @@
#!/usr/bin/env python3

# nics.py - PVC Monitoring example plugin for NIC interfaces
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the network interfaces of the host, specifically for speed
# and 802.3ad status (if applicable).

# This script can thus be used as an example or reference implementation of a
# PVC monitoring pluginscript and expanded upon as required.

# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.

# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.


# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin


# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "nics"

# Set a minimum link speed variable used below
# For PVC at least 10 Gbps is required for proper operation of a cluster
MINIMUM_LINKSPEED = 10000


# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.

        If you wish for the plugin to not load in certain conditions, do any checks here
        and return a non-None failure message to indicate the error.
        """

        pass

    def run(self):
        """
        run(): Perform the check actions and return a PluginResult object
        """

        # Run any imports first
        import daemon_lib.common as common
        from re import match, search, findall

        messages = list()
        health_delta = 0

        # Get a list of the various underlying devices
        _core_nics = set()

        for dev in [
            self.config['bridge_dev'],
            self.config['upstream_dev'],
            self.config['cluster_dev'],
            self.config['storage_dev'],
        ]:
            with open(f'/sys/class/net/{dev}/uevent', 'r') as uevent:
                _devtype = uevent.readlines()[0].split('=')[-1].strip()

            if _devtype == 'vlan':
                with open(f"/proc/net/vlan/{dev}") as devfh:
                    vlan_info = devfh.read().split('\n')
                for line in vlan_info:
                    if match(r'^Device:', line):
                        dev = line.split()[-1]

            _core_nics.add(dev)

        core_nics = sorted(list(_core_nics))

        for dev in core_nics:
            with open(f'/sys/class/net/{dev}/uevent', 'r') as uevent:
                _devtype = uevent.readlines()[0].split('=')[-1].strip()

            if _devtype == "bond":
                syspath = f"/proc/net/bonding/{dev}"

                with open(syspath) as devfh:
                    bonding_stats = devfh.read()

                _, _mode, _info, *_slaves = bonding_stats.split('\n\n')

                slave_interfaces = list()
                for slavedev in _slaves:
                    lines = slavedev.split('\n')
                    for line in lines:
                        if match(r'^Slave Interface:', line):
                            interface_name = line.split()[-1]
                        if match(r'^MII Status:', line):
                            interface_status = line.split()[-1]
                        if match(r'^Speed:', line):
                            try:
                                interface_speed_mbps = int(line.split()[-2])
                            except Exception:
                                interface_speed_mbps = 0
                        if match(r'^Duplex:', line):
                            interface_duplex = line.split()[-1]
                    slave_interfaces.append((interface_name, interface_status, interface_speed_mbps, interface_duplex))

                # Ensure at least 2 slave interfaces are up
                slave_interface_up_count = 0
                for slave_interface in slave_interfaces:
                    if slave_interface[1] == 'up':
                        slave_interface_up_count += 1
                if slave_interface_up_count < 2:
                    messages.append(f"{dev} DEGRADED with {slave_interface_up_count} active slaves")
                    health_delta += 10
                else:
                    messages.append(f"{dev} OK with {slave_interface_up_count} active slaves")

                # Get ethtool supported speeds for slave interfaces
                supported_link_speeds = set()
                for slave_interface in slave_interfaces:
                    slave_dev = slave_interface[0]
                    _, ethtool_stdout, _ = common.run_os_command(f"ethtool {slave_dev}")
                    in_modes = False
                    for line in ethtool_stdout.split('\n'):
                        if search('Supported link modes:', line):
                            in_modes = True
                        if search('Supported pause frame use:', line):
                            in_modes = False
                            break
                        if in_modes:
                            speed = int(findall(r'\d+', line.split()[-1])[0])
                            supported_link_speeds.add(speed)
            else:
                # Get ethtool supported speeds for interface
                supported_link_speeds = set()
                _, ethtool_stdout, _ = common.run_os_command(f"ethtool {dev}")
                in_modes = False
                for line in ethtool_stdout.split('\n'):
                    if search('Supported link modes:', line):
                        in_modes = True
                    if search('Supported pause frame use:', line):
                        in_modes = False
                        break
                    if in_modes:
                        speed = int(line.split()[-1].replace('baseT', '').split('/')[0])
                        supported_link_speeds.add(speed)

            max_supported_link_speed = sorted(list(supported_link_speeds))[-1]

            # Ensure interface is running at MINIMUM_LINKSPEED
            with open(f"/sys/class/net/{dev}/speed") as devfh:
                dev_speed = int(devfh.read())
            if dev_speed < max_supported_link_speed:
                messages.append(f"{dev} DEGRADED at {dev_speed} Mbps")
                health_delta += 10
            else:
                messages.append(f"{dev} OK at {dev_speed} Mbps")

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
|
||||||
|
self.plugin_result.set_message(', '.join(messages))
|
||||||
|
|
||||||
|
# Return our local PluginResult object
|
||||||
|
return self.plugin_result
|
||||||
|
|
||||||
|
def cleanup(self):
|
||||||
|
"""
|
||||||
|
cleanup(): Perform special cleanup steps during node daemon termination
|
||||||
|
|
||||||
|
This step is optional and should be used sparingly.
|
||||||
|
"""
|
||||||
|
|
||||||
|
pass
|
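
For quick reference, the required structure described in the comments above reduces to a skeleton like the following; this is a minimal sketch, and the "example" name and no-op check are placeholders for illustration, not part of the PVC source:

# Hypothetical minimal skeleton distilled from the nics example above; the
# "example" name and the no-op check are placeholders, not part of the PVC source.
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin

# Must be identical to the plugin file name
PLUGIN_NAME = "example"


class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        # Return a non-None message here to prevent this plugin from loading
        pass

    def run(self):
        # Perform the check, then record its outcome in the local PluginResult
        self.plugin_result.set_health_delta(0)
        self.plugin_result.set_message("example check OK")
        return self.plugin_result

    def cleanup(self):
        # Optional teardown during node daemon shutdown
        pass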
@@ -128,6 +128,8 @@ pvc:
     configuration:
       # directories: PVC system directories
       directories:
+        # plugin_directory: Directory containing node monitoring plugins
+        plugin_directory: "/usr/share/pvc/plugins"
         # dynamic_directory: Temporary in-memory directory for active configurations
         dynamic_directory: "/run/pvc"
         # log_directory: Logging directory
@@ -150,8 +152,8 @@ pvc:
         log_keepalives: True
         # log_keepalive_cluster_details: Enable or disable node status logging during keepalive
         log_keepalive_cluster_details: True
-        # log_keepalive_storage_details: Enable or disable node storage logging during keepalive
-        log_keepalive_storage_details: True
+        # log_keepalive_plugin_details: Enable or disable node health plugin logging during keepalive
+        log_keepalive_plugin_details: True
         # console_log_lines: Number of console log lines to store in Zookeeper per VM
         console_log_lines: 1000
         # node_log_lines: Number of node log lines to store in Zookeeper per node
@@ -27,6 +27,7 @@ import pvcnoded.util.services
 import pvcnoded.util.libvirt
 import pvcnoded.util.zookeeper
 
+import pvcnoded.objects.MonitoringInstance as MonitoringInstance
 import pvcnoded.objects.DNSAggregatorInstance as DNSAggregatorInstance
 import pvcnoded.objects.MetadataAPIInstance as MetadataAPIInstance
 import pvcnoded.objects.VMInstance as VMInstance
@@ -58,6 +59,7 @@ version = "0.9.61"
 
 def entrypoint():
     keepalive_timer = None
+    monitoring_instance = None
 
     # Get our configuration
     config = pvcnoded.util.config.get_configuration()
@@ -204,7 +206,7 @@ def entrypoint():
 
     # Define a cleanup function
     def cleanup(failure=False):
-        nonlocal logger, zkhandler, keepalive_timer, d_domain
+        nonlocal logger, zkhandler, keepalive_timer, d_domain, monitoring_instance
 
         logger.out("Terminating pvcnoded and cleaning up", state="s")
 
@@ -253,6 +255,13 @@ def entrypoint():
         except Exception:
             pass
 
+        # Clean up any monitoring plugins that have cleanup
+        try:
+            logger.out("Performing monitoring plugin cleanup", state="s")
+            monitoring_instance.run_cleanups()
+        except Exception:
+            pass
+
         # Set stop state in Zookeeper
         zkhandler.write([(("node.state.daemon", config["node_hostname"]), "stop")])
 
@@ -1015,9 +1024,14 @@ def entrypoint():
         state="i",
     )
 
+    # Set up the node monitoring instance
+    monitoring_instance = MonitoringInstance.MonitoringInstance(
+        zkhandler, config, logger, this_node
+    )
+
     # Start keepalived thread
     keepalive_timer = pvcnoded.util.keepalive.start_keepalive_timer(
-        logger, config, zkhandler, this_node
+        logger, config, zkhandler, this_node, monitoring_instance
     )
 
     # Tick loop; does nothing since everything is async
node-daemon/pvcnoded/objects/MonitoringInstance.py (new file, 412 lines)
@@ -0,0 +1,412 @@
#!/usr/bin/env python3

# MonitoringInstance.py - Class implementing a PVC monitoring instance
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

import concurrent.futures
import time
import importlib.util

from os import walk
from datetime import datetime
from json import dumps


class PluginError(Exception):
    """
    An exception that results from a plugin failing setup
    """

    pass


class PluginResult(object):
    def __init__(self, zkhandler, config, logger, this_node, plugin_name):
        self.zkhandler = zkhandler
        self.config = config
        self.logger = logger
        self.this_node = this_node
        self.plugin_name = plugin_name
        self.current_time = int(time.time())
        self.health_delta = 0
        self.message = None
        self.data = {}
        self.runtime = "0.00"

    def set_health_delta(self, new_delta):
        self.health_delta = new_delta

    def set_message(self, new_message):
        self.message = new_message

    def set_data(self, new_data):
        self.data = new_data

    def set_runtime(self, new_runtime):
        self.runtime = new_runtime

    def to_zookeeper(self):
        self.zkhandler.write(
            [
                (
                    (
                        "node.monitoring.data",
                        self.this_node.name,
                        "monitoring_plugin.name",
                        self.plugin_name,
                    ),
                    self.plugin_name,
                ),
                (
                    (
                        "node.monitoring.data",
                        self.this_node.name,
                        "monitoring_plugin.last_run",
                        self.plugin_name,
                    ),
                    self.current_time,
                ),
                (
                    (
                        "node.monitoring.data",
                        self.this_node.name,
                        "monitoring_plugin.health_delta",
                        self.plugin_name,
                    ),
                    self.health_delta,
                ),
                (
                    (
                        "node.monitoring.data",
                        self.this_node.name,
                        "monitoring_plugin.message",
                        self.plugin_name,
                    ),
                    self.message,
                ),
                (
                    (
                        "node.monitoring.data",
                        self.this_node.name,
                        "monitoring_plugin.data",
                        self.plugin_name,
                    ),
                    dumps(self.data),
                ),
                (
                    (
                        "node.monitoring.data",
                        self.this_node.name,
                        "monitoring_plugin.runtime",
                        self.plugin_name,
                    ),
                    self.runtime,
                ),
            ]
        )


class MonitoringPlugin(object):
    def __init__(self, zkhandler, config, logger, this_node, plugin_name):
        self.zkhandler = zkhandler
        self.config = config
        self.logger = logger
        self.this_node = this_node
        self.plugin_name = plugin_name

        self.plugin_result = PluginResult(
            self.zkhandler,
            self.config,
            self.logger,
            self.this_node,
            self.plugin_name,
        )

    def __str__(self):
        return self.plugin_name

    #
    # Helper functions; exposed to child MonitoringPluginScript instances
    #
    def log(self, message, state="d"):
        """
        Log a message to the PVC logger instance using the plugin name as a prefix
        Takes "state" values as defined by the PVC logger instance, defaulting to debug:
            "d": debug
            "i": informational
            "t": tick/keepalive
            "w": warning
            "e": error
        """
        if state == "d" and not self.config["debug"]:
            return

        self.logger.out(message, state=state, prefix=self.plugin_name)

    #
    # Primary class functions; implemented by the individual plugins
    #
    def setup(self):
        """
        setup(): Perform setup of the plugin; run once during daemon startup

        This step is optional and should be used sparingly.

        If you wish for the plugin to not load in certain conditions, do any checks here
        and return a non-None failure message to indicate the error.
        """
        pass

    def run(self):
        """
        run(): Run the plugin, returning a PluginResult object
        """
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Clean up after the plugin; run once during daemon shutdown

        OPTIONAL
        """
        pass


class MonitoringInstance(object):
    def __init__(self, zkhandler, config, logger, this_node):
        self.zkhandler = zkhandler
        self.config = config
        self.logger = logger
        self.this_node = this_node

        # Get a list of plugins from the plugin_directory
        plugin_files = next(walk(self.config["plugin_directory"]), (None, None, []))[
            2
        ]  # [] if no file

        self.all_plugins = list()
        self.all_plugin_names = list()

        successful_plugins = 0

        # Load each plugin file into the all_plugins list
        for plugin_file in sorted(plugin_files):
            try:
                self.logger.out(
                    f"Loading monitoring plugin from {self.config['plugin_directory']}/{plugin_file}",
                    state="i",
                )
                loader = importlib.machinery.SourceFileLoader(
                    "plugin_script", f"{self.config['plugin_directory']}/{plugin_file}"
                )
                spec = importlib.util.spec_from_loader(loader.name, loader)
                plugin_script = importlib.util.module_from_spec(spec)
                spec.loader.exec_module(plugin_script)

                plugin = plugin_script.MonitoringPluginScript(
                    self.zkhandler,
                    self.config,
                    self.logger,
                    self.this_node,
                    plugin_script.PLUGIN_NAME,
                )

                failed_setup = plugin.setup()
                if failed_setup is not None:
                    raise PluginError(f"{failed_setup}")

                # Create plugin key
                self.zkhandler.write(
                    [
                        (
                            (
                                "node.monitoring.data",
                                self.this_node.name,
                                "monitoring_plugin.name",
                                plugin.plugin_name,
                            ),
                            plugin.plugin_name,
                        ),
                        (
                            (
                                "node.monitoring.data",
                                self.this_node.name,
                                "monitoring_plugin.last_run",
                                plugin.plugin_name,
                            ),
                            "0",
                        ),
                        (
                            (
                                "node.monitoring.data",
                                self.this_node.name,
                                "monitoring_plugin.health_delta",
                                plugin.plugin_name,
                            ),
                            "0",
                        ),
                        (
                            (
                                "node.monitoring.data",
                                self.this_node.name,
                                "monitoring_plugin.message",
                                plugin.plugin_name,
                            ),
                            "Initializing",
                        ),
                        (
                            (
                                "node.monitoring.data",
                                self.this_node.name,
                                "monitoring_plugin.data",
                                plugin.plugin_name,
                            ),
                            dumps({}),
                        ),
                        (
                            (
                                "node.monitoring.data",
                                self.this_node.name,
                                "monitoring_plugin.runtime",
                                plugin.plugin_name,
                            ),
                            "0.00",
                        ),
                    ]
                )

                self.all_plugins.append(plugin)
                self.all_plugin_names.append(plugin.plugin_name)
                successful_plugins += 1

                self.logger.out(
                    f"Successfully loaded monitoring plugin '{plugin.plugin_name}'",
                    state="o",
                )
            except Exception as e:
                self.logger.out(
                    f"Failed to load monitoring plugin: {e}",
                    state="w",
                )

        self.zkhandler.write(
            [
                (
                    ("node.monitoring.plugins", self.this_node.name),
                    " ".join(self.all_plugin_names),
                ),
            ]
        )

        if successful_plugins < 1:
            return

        # Clean up any old plugin data for which a plugin file no longer exists
        for plugin_key in self.zkhandler.children(
            ("node.monitoring.data", self.this_node.name)
        ):
            if plugin_key not in self.all_plugin_names:
                self.zkhandler.delete(
                    (
                        "node.monitoring.data",
                        self.this_node.name,
                        "monitoring_plugin",
                        plugin_key,
                    )
                )

    def run_plugin(self, plugin):
        time_start = datetime.now()
        result = plugin.run()
        time_end = datetime.now()
        time_delta = time_end - time_start
        runtime = "{:0.02f}".format(time_delta.total_seconds())
        result.set_runtime(runtime)
        result.to_zookeeper()
        return result

    def run_plugins(self):
        total_health = 100
        if self.config["log_keepalive_plugin_details"]:
            self.logger.out(
                f"Running monitoring plugins: {', '.join([x.plugin_name for x in self.all_plugins])}",
                state="t",
            )
        plugin_results = list()
        with concurrent.futures.ThreadPoolExecutor(max_workers=99) as executor:
            to_future_plugin_results = {
                executor.submit(self.run_plugin, plugin): plugin
                for plugin in self.all_plugins
            }
            for future in concurrent.futures.as_completed(to_future_plugin_results):
                plugin_results.append(future.result())

        for result in sorted(plugin_results, key=lambda x: x.plugin_name):
            if self.config["log_keepalive_plugin_details"]:
                self.logger.out(
                    result.message,
                    state="t",
                    prefix=f"{result.plugin_name} ({result.runtime}s)",
                )
            if result is not None:
                total_health -= result.health_delta

        if total_health < 0:
            total_health = 0

        if total_health > 90:
            health_colour = self.logger.fmt_green
        elif total_health > 50:
            health_colour = self.logger.fmt_yellow
        else:
            health_colour = self.logger.fmt_red

        self.zkhandler.write(
            [
                (
                    ("node.monitoring.health", self.this_node.name),
                    total_health,
                ),
            ]
        )
        self.logger.out(
            f"Node health: {health_colour}{total_health}%{self.logger.fmt_end}",
            state="t",
        )

    def run_cleanup(self, plugin):
        return plugin.cleanup()

    def run_cleanups(self):
        with concurrent.futures.ThreadPoolExecutor(max_workers=99) as executor:
            to_future_plugin_results = {
                executor.submit(self.run_cleanup, plugin): plugin
                for plugin in self.all_plugins
            }
            for future in concurrent.futures.as_completed(to_future_plugin_results):
                # This doesn't do anything, just lets us wait for them all to complete
                pass
        # Set the node health to None as no previous checks are now valid
        self.zkhandler.write(
            [
                (
                    ("node.monitoring.health", self.this_node.name),
                    None,
                ),
            ]
        )
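
Note how run_plugins() aggregates health: each plugin's health_delta is subtracted from a baseline of 100 and the total is clamped at zero, so, for example, two plugins reporting deltas of 10 and 25 would leave the node at 65% health. The sketch below is a hypothetical plugin illustrating set_data(), which the nics example above does not exercise; the "load" name and the load threshold are assumptions for illustration, not part of the PVC source:

# Hypothetical "load" plugin; illustrates set_data(), whose dict is serialized
# with json.dumps() into monitoring_plugin.data by PluginResult.to_zookeeper().
from os import getloadavg

from pvcnoded.objects.MonitoringInstance import MonitoringPlugin

PLUGIN_NAME = "load"


class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        pass

    def run(self):
        load1, load5, load15 = getloadavg()

        # Assumed threshold, for illustration only
        if load1 > 10:
            self.plugin_result.set_health_delta(25)
            self.plugin_result.set_message(f"1-minute load is high at {load1}")
        else:
            self.plugin_result.set_health_delta(0)
            self.plugin_result.set_message(f"1-minute load is {load1}")

        self.plugin_result.set_data({"load1": load1, "load5": load5, "load15": load15})
        return self.plugin_result

    def cleanup(self):
        pass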
@@ -180,6 +180,9 @@ def get_configuration():
         raise MalformedConfigurationError(e)
 
     config_directories = {
+        "plugin_directory": o_directories.get(
+            "plugin_directory", "/usr/share/pvc/plugins"
+        ),
         "dynamic_directory": o_directories.get("dynamic_directory", None),
         "log_directory": o_directories.get("log_directory", None),
         "console_log_directory": o_directories.get("console_log_directory", None),
@@ -225,8 +228,8 @@ def get_configuration():
         "log_keepalive_cluster_details": o_logging.get(
             "log_keepalive_cluster_details", False
         ),
-        "log_keepalive_storage_details": o_logging.get(
-            "log_keepalive_storage_details", False
+        "log_keepalive_plugin_details": o_logging.get(
+            "log_keepalive_plugin_details", False
         ),
         "console_log_lines": o_logging.get("console_log_lines", False),
         "node_log_lines": o_logging.get("node_log_lines", False),
@@ -51,7 +51,7 @@ libvirt_vm_states = {
 }
 
 
-def start_keepalive_timer(logger, config, zkhandler, this_node):
+def start_keepalive_timer(logger, config, zkhandler, this_node, monitoring_instance):
     keepalive_interval = config["keepalive_interval"]
     logger.out(
         f"Starting keepalive timer ({keepalive_interval} second interval)", state="s"
@@ -59,7 +59,7 @@ def start_keepalive_timer(logger, config, zkhandler, this_node):
     keepalive_timer = BackgroundScheduler()
     keepalive_timer.add_job(
         node_keepalive,
-        args=(logger, config, zkhandler, this_node),
+        args=(logger, config, zkhandler, this_node, monitoring_instance),
         trigger="interval",
         seconds=keepalive_interval,
     )
@@ -97,34 +97,12 @@ def collect_ceph_stats(logger, config, zkhandler, this_node, queue):
         logger.out("Failed to open connection to Ceph cluster: {}".format(e), state="e")
         return
 
-    if debug:
-        logger.out("Getting health stats from monitor", state="d", prefix="ceph-thread")
-
-    # Get Ceph cluster health for local status output
-    command = {"prefix": "health", "format": "json"}
-    try:
-        health_status = json.loads(
-            ceph_conn.mon_command(json.dumps(command), b"", timeout=1)[1]
-        )
-        ceph_health = health_status["status"]
-    except Exception as e:
-        logger.out("Failed to obtain Ceph health data: {}".format(e), state="e")
-        ceph_health = "HEALTH_UNKN"
-
-    if ceph_health in ["HEALTH_OK"]:
-        ceph_health_colour = logger.fmt_green
-    elif ceph_health in ["HEALTH_UNKN"]:
-        ceph_health_colour = logger.fmt_cyan
-    elif ceph_health in ["HEALTH_WARN"]:
-        ceph_health_colour = logger.fmt_yellow
-    else:
-        ceph_health_colour = logger.fmt_red
-
     # Primary-only functions
     if this_node.router_state == "primary":
+        # Get Ceph status information (pretty)
         if debug:
             logger.out(
-                "Set ceph health information in zookeeper (primary only)",
+                "Set Ceph status information in zookeeper (primary only)",
                 state="d",
                 prefix="ceph-thread",
             )
@@ -138,9 +116,27 @@ def collect_ceph_stats(logger, config, zkhandler, this_node, queue):
         except Exception as e:
             logger.out("Failed to set Ceph status data: {}".format(e), state="e")
 
+        # Get Ceph health information (JSON)
         if debug:
             logger.out(
-                "Set ceph rados df information in zookeeper (primary only)",
+                "Set Ceph health information in zookeeper (primary only)",
+                state="d",
+                prefix="ceph-thread",
+            )
+
+        command = {"prefix": "health", "format": "json"}
+        ceph_health = ceph_conn.mon_command(json.dumps(command), b"", timeout=1)[
+            1
+        ].decode("ascii")
+        try:
+            zkhandler.write([("base.storage.health", str(ceph_health))])
+        except Exception as e:
+            logger.out("Failed to set Ceph health data: {}".format(e), state="e")
+
+        # Get Ceph df information (pretty)
+        if debug:
+            logger.out(
+                "Set Ceph rados df information in zookeeper (primary only)",
                 state="d",
                 prefix="ceph-thread",
             )
@@ -408,8 +404,6 @@ def collect_ceph_stats(logger, config, zkhandler, this_node, queue):
 
     ceph_conn.shutdown()
 
-    queue.put(ceph_health_colour)
-    queue.put(ceph_health)
     queue.put(osds_this_node)
 
     if debug:
@@ -648,7 +642,7 @@ def collect_vm_stats(logger, config, zkhandler, this_node, queue):
 
 
 # Keepalive update function
-def node_keepalive(logger, config, zkhandler, this_node):
+def node_keepalive(logger, config, zkhandler, this_node, monitoring_instance):
     debug = config["debug"]
     if debug:
         logger.out("Keepalive starting", state="d", prefix="main-thread")
@@ -777,16 +771,14 @@ def node_keepalive(logger, config, zkhandler, this_node):
 
     if config["enable_storage"]:
         try:
-            ceph_health_colour = ceph_thread_queue.get(
-                timeout=config["keepalive_interval"]
+            osds_this_node = ceph_thread_queue.get(
+                timeout=(config["keepalive_interval"] - 1)
             )
-            ceph_health = ceph_thread_queue.get(timeout=config["keepalive_interval"])
-            osds_this_node = ceph_thread_queue.get(timeout=config["keepalive_interval"])
         except Exception:
             logger.out("Ceph stats queue get exceeded timeout, continuing", state="w")
-            ceph_health_colour = logger.fmt_cyan
-            ceph_health = "UNKNOWN"
             osds_this_node = "?"
+    else:
+        osds_this_node = "0"
 
     # Set our information in zookeeper
     keepalive_time = int(time.time())
@@ -839,8 +831,8 @@ def node_keepalive(logger, config, zkhandler, this_node):
     if config["log_keepalive_cluster_details"]:
         logger.out(
             "{bold}Maintenance:{nofmt} {maint} "
-            "{bold}Active VMs:{nofmt} {domcount} "
-            "{bold}Networks:{nofmt} {netcount} "
+            "{bold}Node VMs:{nofmt} {domcount} "
+            "{bold}Node OSDs:{nofmt} {osdcount} "
             "{bold}Load:{nofmt} {load} "
             "{bold}Memory [MiB]: VMs:{nofmt} {allocmem} "
             "{bold}Used:{nofmt} {usedmem} "
@@ -849,7 +841,7 @@ def node_keepalive(logger, config, zkhandler, this_node):
             nofmt=logger.fmt_end,
             maint=this_node.maintenance,
             domcount=this_node.domains_count,
-            netcount=len(zkhandler.children("base.network")),
+            osdcount=osds_this_node,
             load=this_node.cpuload,
             freemem=this_node.memfree,
             usedmem=this_node.memused,
@@ -857,22 +849,6 @@ def node_keepalive(logger, config, zkhandler, this_node):
             ),
             state="t",
         )
-    if config["enable_storage"] and config["log_keepalive_storage_details"]:
-        logger.out(
-            "{bold}Ceph cluster status:{nofmt} {health_colour}{health}{nofmt} "
-            "{bold}Total OSDs:{nofmt} {total_osds} "
-            "{bold}Node OSDs:{nofmt} {node_osds} "
-            "{bold}Pools:{nofmt} {total_pools} ".format(
-                bold=logger.fmt_bold,
-                health_colour=ceph_health_colour,
-                nofmt=logger.fmt_end,
-                health=ceph_health,
-                total_osds=len(zkhandler.children("base.osd")),
-                node_osds=osds_this_node,
-                total_pools=len(zkhandler.children("base.pool")),
-            ),
-            state="t",
-        )
 
     # Look for dead nodes and fence them
     if not this_node.maintenance:
@@ -918,5 +894,7 @@ def node_keepalive(logger, config, zkhandler, this_node):
                     [(("node.state.daemon", node_name), "dead")]
                 )
 
+    monitoring_instance.run_plugins()
+
     if debug:
         logger.out("Keepalive finished", state="d", prefix="main-thread")