
Commit 5cd53d6

bugfix: refactor alerts to accommodate single-node clusters
For the sake of brevity, let Q = kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"} (allocatable) and QQ = namespace_cpu:kube_pod_container_resource_requests:sum{} (requested). Both quota alerts relevant here then take the form: sum(QQ) by (cluster) - (sum(Q) by (cluster) - max(Q) by (cluster)) > 0 and (sum(Q) by (cluster) - max(Q) by (cluster)) > 0. In a single-node cluster, sum(Q) by (cluster) = max(Q) by (cluster), so the expression reduces to sum(QQ) by (cluster) > 0, i.e. the alert fires if *any* resource requests exist.

To address this, drop the max(Q) by (cluster) buffer assumed for non-SNO clusters from the SNO case, reducing the expression to sum(QQ) by (cluster) - sum(Q) by (cluster) > 0 (total requested minus total allocatable must be positive to trigger the alert). With only a single node, a buffer of that sort does not make sense; see the worked expressions below.

Signed-off-by: Pranshu Srivastava <rexagod@gmail.com>
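Restated as worked expressions, using the Q/QQ shorthand defined above (cluster stands for the configured cluster label):

    # General (multi-node) form: requests must fit into total allocatable
    # minus the largest node, i.e. the cluster tolerates losing its biggest node.
    sum(QQ) by (cluster) - (sum(Q) by (cluster) - max(Q) by (cluster)) > 0
    and
    (sum(Q) by (cluster) - max(Q) by (cluster)) > 0

    # Single-node cluster: sum(Q) by (cluster) = max(Q) by (cluster),
    # so the form above degenerates to
    sum(QQ) by (cluster) > 0
    # which fires as soon as any resource requests exist at all.

    # Fix applied here: on single-node clusters, compare the totals directly.
    sum(QQ) by (cluster) - sum(Q) by (cluster) > 0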
1 parent: 35aebca

1 file changed (+47, -16)
alerts/resource_alerts.libsonnet
@@ -34,18 +34,34 @@
         } +
         if $._config.showMultiCluster then {
           expr: |||
-            sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
+            (count(kube_node_info) == 1
             and
-            (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
+            sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
+            sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) > 0)
+            or
+            (sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
+            (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) -
+            max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
+            and
+            (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) -
+            max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0)
           ||| % $._config,
           annotations+: {
             description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config,
           },
         } else {
           expr: |||
-            sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
+            (count(kube_node_info) == 1
+            and
+            sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
+            sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) > 0)
+            or
+            (sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
+            (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) -
+            max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
             and
-            (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
+            (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) -
+            max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0)
           ||| % $._config,
           annotations+: {
             description: 'Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config,
@@ -63,24 +79,39 @@
         } +
         if $._config.showMultiCluster then {
           expr: |||
-            sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
+            (count(kube_node_info) == 1
             and
-            (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
+            sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
+            sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) > 0)
+            or
+            (sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) -
+            (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) -
+            max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
+            and
+            (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) -
+            max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0)
           ||| % $._config,
           annotations+: {
             description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.' % $._config,
           },
-        } else
-          {
-          expr: |||
-            sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
-            and
-            (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
-          ||| % $._config,
-          annotations+: {
-            description: 'Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.',
-          },
+        } else {
+          expr: |||
+            (count(kube_node_info) == 1
+            and
+            sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
+            sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) > 0)
+            or
+            (sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) -
+            (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) -
+            max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
+            and
+            (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) -
+            max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0)
+          ||| % $._config,
+          annotations+: {
+            description: 'Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.',
           },
+        },
       {
         alert: 'KubeCPUQuotaOvercommit',
         labels: {
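The resulting rule shape, reduced to the Q/QQ shorthand from the commit message (a schematic sketch of the expressions above, not the literal rule text):

    (count(kube_node_info) == 1       # single-node guard
    and
    sum(QQ) - sum(Q) > 0)             # no buffer: totals compared directly
    or
    (sum(QQ) - (sum(Q) - max(Q)) > 0  # multi-node: keep the largest-node buffer
    and
    (sum(Q) - max(Q)) > 0)            # and require that buffer to be positive

PromQL's and/or set operators select the branch: the left-hand operand of or only produces samples when count(kube_node_info) == 1 returns a result, so clusters with more than one node fall through to the original buffered expression.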
