Improve alert aggregations for multiple clusters
hamishforbes committed Jun 24, 2021
1 parent e0dc356 commit ff2b9c3
Showing 6 changed files with 46 additions and 32 deletions.
10 changes: 5 additions & 5 deletions alerts/apps_alerts.libsonnet
@@ -32,11 +32,11 @@
       // label exists for 2 values. This avoids "many-to-many matching
       // not allowed" errors when joining with kube_pod_status_phase.
       expr: |||
-        sum by (namespace, pod) (
-          max by(namespace, pod) (
+        sum by (namespace, pod, %(clusterGroupLabelsStr)s) (
+          max by(namespace, pod, %(clusterGroupLabelsStr)s) (
             kube_pod_status_phase{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, phase=~"Pending|Unknown"}
-          ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
-            1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+          ) * on(namespace, pod, %(clusterGroupLabelsStr)s) group_left(owner_kind) topk by(namespace, pod, %(clusterGroupLabelsStr)s) (
+            1, max by(namespace, pod, owner_kind, %(clusterGroupLabelsStr)s) (kube_pod_owner{owner_kind!="Job"})
           )
         ) > 0
       ||| % $._config,
@@ -194,7 +194,7 @@
       },
       {
         expr: |||
-          sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}) > 0
+          sum by (namespace, pod, container, %(clusterGroupLabelsStr)s) (kube_pod_container_status_waiting_reason{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}) > 0
         ||| % $._config,
         labels: {
           severity: 'warning',
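
Illustration (not part of the commit): with showMultiCluster: true and the default clusterLabel, %(clusterGroupLabelsStr)s renders to cluster, so the new KubePodNotReady expression above becomes (namespace and metric selectors elided for brevity):

    sum by (namespace, pod, cluster) (
      max by(namespace, pod, cluster) (
        kube_pod_status_phase{phase=~"Pending|Unknown"}
      ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
        1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
      )
    ) > 0

Carrying cluster through both the aggregations and the on(...) matching is what keeps identically named pods in different clusters from colliding in the join.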
17 changes: 12 additions & 5 deletions alerts/kube_apiserver.libsonnet
@@ -18,14 +18,16 @@ local utils = import 'utils.libsonnet';
       {
         alert: 'KubeAPIErrorBudgetBurn',
         expr: |||
-          sum(apiserver_request:burnrate%s) > (%.2f * %.5f)
+          sum(apiserver_request:burnrate%s) by (%s) > (%.2f * %.5f)
           and
-          sum(apiserver_request:burnrate%s) > (%.2f * %.5f)
+          sum(apiserver_request:burnrate%s) by (%s) > (%.2f * %.5f)
         ||| % [
           w.long,
+          $._config.clusterGroupLabelsStr,
           w.factor,
           (1 - $._config.SLOs.apiserver.target),
           w.short,
+          $._config.clusterGroupLabelsStr,
           w.factor,
           (1 - $._config.SLOs.apiserver.target),
         ],
@@ -75,7 +77,7 @@ local utils = import 'utils.libsonnet';
       {
         alert: 'AggregatedAPIErrors',
         expr: |||
-          sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+          sum by(name, namespace, %(clusterGroupLabelsStr)s)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
         ||| % $._config,
         labels: {
           severity: 'warning',
@@ -88,7 +90,7 @@
       {
         alert: 'AggregatedAPIDown',
         expr: |||
-          (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+          (1 - max by(name, namespace, %(clusterGroupLabelsStr)s)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
         ||| % $._config,
         'for': '5m',
         labels: {
@@ -106,7 +108,12 @@
       {
         alert: 'KubeAPITerminatedRequests',
         expr: |||
-          sum(rate(apiserver_request_terminations_total{%(kubeApiserverSelector)s}[10m])) / ( sum(rate(apiserver_request_total{%(kubeApiserverSelector)s}[10m])) + sum(rate(apiserver_request_terminations_total{%(kubeApiserverSelector)s}[10m])) ) > 0.20
+          sum(rate(apiserver_request_terminations_total{%(kubeApiserverSelector)s}[10m])) by (%(clusterGroupLabelsStr)s)
+          /
+          (
+            sum(rate(apiserver_request_total{%(kubeApiserverSelector)s}[10m])) by (%(clusterGroupLabelsStr)s)
+            + sum(rate(apiserver_request_terminations_total{%(kubeApiserverSelector)s}[10m])) by (%(clusterGroupLabelsStr)s)
+          ) > 0.20
         ||| % $._config,
         labels: {
           severity: 'warning',
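
Illustration (not part of the commit): assuming the mixin's default 99% apiserver availability target and the long=1h/short=5m window pair with burn-rate factor 14.4, the first KubeAPIErrorBudgetBurn rule now renders per cluster as:

    sum(apiserver_request:burnrate1h) by (cluster) > (14.40 * 0.01000)
    and
    sum(apiserver_request:burnrate5m) by (cluster) > (14.40 * 0.01000)

Grouping by the cluster label lets a single cluster burning through its error budget fire the alert even when the fleet-wide sum looks healthy.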
11 changes: 6 additions & 5 deletions alerts/kubelet.libsonnet
@@ -52,11 +52,12 @@
       // Some node has a capacity of 1 like AWS's Fargate and only exists while a pod is running on it.
       // We have to ignore this special node in the KubeletTooManyPods alert.
       expr: |||
-        count by(node) (
-          (kube_pod_status_phase{%(kubeStateMetricsSelector)s,phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{%(kubeStateMetricsSelector)s})
+        count by(node, %(clusterGroupLabelsStr)s) (
+          (kube_pod_status_phase{%(kubeStateMetricsSelector)s,phase="Running"} == 1)
+          * on(instance,pod,namespace,%(clusterGroupLabelsStr)s) group_left(node) topk by(instance,pod,namespace,%(clusterGroupLabelsStr)s) (1, kube_pod_info{%(kubeStateMetricsSelector)s})
         )
         /
-        max by(node) (
+        max by(node,%(clusterGroupLabelsStr)s) (
           kube_node_status_capacity{%(kubeStateMetricsSelector)s,resource="pods"} != 1
         ) > 0.95
       ||| % $._config,
@@ -72,7 +73,7 @@
       {
         alert: 'KubeNodeReadinessFlapping',
         expr: |||
-          sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
+          sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node,%(clusterGroupLabelsStr)s) > 2
         ||| % $._config,
         'for': '15m',
         labels: {
@@ -100,7 +101,7 @@
       {
         alert: 'KubeletPodStartUpLatencyHigh',
         expr: |||
-          histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{%(kubeletSelector)s}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{%(kubeletSelector)s} > 60
+          histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{%(kubeletSelector)s}[5m])) by (instance, le)) * on(instance) group_left(node,%(clusterGroupLabelsStr)s) kubelet_node_name{%(kubeletSelector)s} > 60
         ||| % $._config,
         'for': '15m',
         labels: {
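
Note: KubeletTooManyPods previously hardcoded cluster in its on()/topk() clauses; it now takes the label from the configurable list like the other alerts. In KubeletPodStartUpLatencyHigh the histogram side is aggregated by (instance, le) only, so the cluster label is instead copied over from kubelet_node_name via group_left(node,%(clusterGroupLabelsStr)s).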
26 changes: 13 additions & 13 deletions alerts/resource_alerts.libsonnet
@@ -25,11 +25,11 @@
       {
         alert: 'KubeCPUOvercommit',
         expr: |||
-          sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s})
+          sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterGroupLabelsStr)s)
           /
-          sum(kube_node_status_allocatable{resource="cpu"})
+          sum(kube_node_status_allocatable{resource="cpu"}) by (%(clusterGroupLabelsStr)s)
           >
-          ((count(kube_node_status_allocatable{resource="cpu"}) > 1) - 1) / count(kube_node_status_allocatable{resource="cpu"})
+          ((count(kube_node_status_allocatable{resource="cpu"}) by (%(clusterGroupLabelsStr)s) > 1) - 1) / count(kube_node_status_allocatable{resource="cpu"}) by (%(clusterGroupLabelsStr)s)
         ||| % $._config,
         labels: {
           severity: 'warning',
@@ -43,13 +43,13 @@
       {
         alert: 'KubeMemoryOvercommit',
         expr: |||
-          sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s})
+          sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterGroupLabelsStr)s)
           /
-          sum(kube_node_status_allocatable{resource="memory"})
+          sum(kube_node_status_allocatable{resource="memory"}) by (%(clusterGroupLabelsStr)s)
           >
-          ((count(kube_node_status_allocatable{resource="memory"}) > 1) - 1)
+          ((count(kube_node_status_allocatable{resource="memory"}) by (%(clusterGroupLabelsStr)s) > 1) - 1)
           /
-          count(kube_node_status_allocatable{resource="memory"})
+          count(kube_node_status_allocatable{resource="memory"}) by (%(clusterGroupLabelsStr)s)
         ||| % $._config,
         labels: {
           severity: 'warning',
@@ -63,9 +63,9 @@
       {
         alert: 'KubeCPUQuotaOvercommit',
         expr: |||
-          sum(kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource="cpu"})
+          sum(kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource="cpu"}) by (%(clusterGroupLabelsStr)s)
           /
-          sum(kube_node_status_allocatable{resource="cpu"})
+          sum(kube_node_status_allocatable{resource="cpu"}) by (%(clusterGroupLabelsStr)s)
           > %(namespaceOvercommitFactor)s
         ||| % $._config,
         labels: {
@@ -80,9 +80,9 @@
       {
         alert: 'KubeMemoryQuotaOvercommit',
         expr: |||
-          sum(kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource="memory"})
+          sum(kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource="memory"}) by (%(clusterGroupLabelsStr)s)
           /
-          sum(kube_node_status_allocatable{resource="memory",%(kubeStateMetricsSelector)s})
+          sum(kube_node_status_allocatable{resource="memory",%(kubeStateMetricsSelector)s}) by (%(clusterGroupLabelsStr)s)
           > %(namespaceOvercommitFactor)s
         ||| % $._config,
         labels: {
@@ -148,9 +148,9 @@
       {
         alert: 'CPUThrottlingHigh',
         expr: |||
-          sum(increase(container_cpu_cfs_throttled_periods_total{container!="", %(cpuThrottlingSelector)s}[5m])) by (container, pod, namespace)
+          sum(increase(container_cpu_cfs_throttled_periods_total{container!="", %(cpuThrottlingSelector)s}[5m])) by (container, pod, namespace, %(clusterGroupLabelsStr)s)
           /
-          sum(increase(container_cpu_cfs_periods_total{%(cpuThrottlingSelector)s}[5m])) by (container, pod, namespace)
+          sum(increase(container_cpu_cfs_periods_total{%(cpuThrottlingSelector)s}[5m])) by (container, pod, namespace, %(clusterGroupLabelsStr)s)
           > ( %(cpuThrottlingPercent)s / 100 )
         ||| % $._config,
         'for': '15m',
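
Worked example (not part of the commit): for a cluster with 5 schedulable nodes the overcommit threshold renders to (5 - 1) / 5 = 0.8, so KubeCPUOvercommit fires once CPU requests exceed 80% of allocatable, the point at which the workload no longer fits after losing one node. Because every term is now grouped by the cluster labels, each cluster is measured against its own node count rather than a fleet-wide total, and the count(...) > 1 guard keeps single-node clusters out of the comparison.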
12 changes: 8 additions & 4 deletions alerts/system_alerts.libsonnet
@@ -11,7 +11,11 @@
       {
         alert: 'KubeVersionMismatch',
         expr: |||
-          count(count by (git_version) (label_replace(kubernetes_build_info{%(notKubeDnsCoreDnsSelector)s},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+          count(
+            count by (git_version, %(clusterGroupLabelsStr)s) (
+              label_replace(kubernetes_build_info{%(notKubeDnsCoreDnsSelector)s},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*")
+            )
+          ) by (%(clusterGroupLabelsStr)s) > 1
         ||| % $._config,
         'for': '15m',
         labels: {
@@ -28,11 +32,11 @@
       // this is normal and an expected error, therefore it should be
       // ignored in this alert.
       expr: |||
-        (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
+        (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job, %(clusterGroupLabelsStr)s)
         /
-        sum(rate(rest_client_requests_total[5m])) by (instance, job))
+        sum(rate(rest_client_requests_total[5m])) by (instance, job, %(clusterGroupLabelsStr)s))
         > 0.01
-      |||,
+      ||| % $._config,
       'for': '15m',
       labels: {
         severity: 'warning',
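
Note: the closing delimiter of the client-errors expression changes from |||, to ||| % $._config because the query now contains a %(clusterGroupLabelsStr)s placeholder; without the format step the placeholder would be emitted verbatim into the rendered rule.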
2 changes: 2 additions & 0 deletions config.libsonnet
@@ -79,6 +79,8 @@
     // Opt-in to multiCluster dashboards by overriding this and the clusterLabel.
     showMultiCluster: false,
     clusterLabel: 'cluster',
+    clusterGroupLabels: if self.showMultiCluster then [self.clusterLabel] else [],
+    clusterGroupLabelsStr: std.join(',', self.clusterGroupLabels),

     namespaceLabel: 'namespace',
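
A minimal sketch (not from the repository) of how the two new config fields behave; jsonnet's % operator substitutes %(clusterGroupLabelsStr)s by key when its right-hand side is an object:

    local config = {
      showMultiCluster: true,
      clusterLabel: 'cluster',
      clusterGroupLabels: if self.showMultiCluster then [self.clusterLabel] else [],
      clusterGroupLabelsStr: std.join(',', self.clusterGroupLabels),
    };

    // Evaluates to "sum by (namespace, pod, cluster) (...)".
    // With the default showMultiCluster: false the label list is empty and the
    // result is "sum by (namespace, pod, ) (...)"; PromQL tolerates the trailing
    // comma in grouping-label lists, so single-cluster rules remain valid.
    'sum by (namespace, pod, %(clusterGroupLabelsStr)s) (...)' % config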
