diff --git a/alerts/apps_alerts.libsonnet b/alerts/apps_alerts.libsonnet index 6b845c601..44e4b121e 100644 --- a/alerts/apps_alerts.libsonnet +++ b/alerts/apps_alerts.libsonnet @@ -32,11 +32,11 @@ // label exists for 2 values. This avoids "many-to-many matching // not allowed" errors when joining with kube_pod_status_phase. expr: ||| - sum by (namespace, pod) ( - max by(namespace, pod) ( + sum by (namespace, pod, %(clusterGroupLabelsStr)s) ( + max by(namespace, pod, %(clusterGroupLabelsStr)s) ( kube_pod_status_phase{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, phase=~"Pending|Unknown"} - ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) ( - 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}) + ) * on(namespace, pod, %(clusterGroupLabelsStr)s) group_left(owner_kind) topk by(namespace, pod, %(clusterGroupLabelsStr)s) ( + 1, max by(namespace, pod, owner_kind, %(clusterGroupLabelsStr)s) (kube_pod_owner{owner_kind!="Job"}) ) ) > 0 ||| % $._config, @@ -194,7 +194,7 @@ }, { expr: ||| - sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}) > 0 + sum by (namespace, pod, container, %(clusterGroupLabelsStr)s) (kube_pod_container_status_waiting_reason{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}) > 0 ||| % $._config, labels: { severity: 'warning', diff --git a/alerts/kube_apiserver.libsonnet b/alerts/kube_apiserver.libsonnet index 9ec5a0e6a..81efb0227 100644 --- a/alerts/kube_apiserver.libsonnet +++ b/alerts/kube_apiserver.libsonnet @@ -18,14 +18,16 @@ local utils = import 'utils.libsonnet'; { alert: 'KubeAPIErrorBudgetBurn', expr: ||| - sum(apiserver_request:burnrate%s) > (%.2f * %.5f) + sum(apiserver_request:burnrate%s) by (%s) > (%.2f * %.5f) and - sum(apiserver_request:burnrate%s) > (%.2f * %.5f) + sum(apiserver_request:burnrate%s) by (%s) > (%.2f * %.5f) ||| % [ w.long, + $._config.clusterGroupLabelsStr, w.factor, (1 - 
$._config.SLOs.apiserver.target), w.short, + $._config.clusterGroupLabelsStr, w.factor, (1 - $._config.SLOs.apiserver.target), ], @@ -75,7 +77,7 @@ local utils = import 'utils.libsonnet'; { alert: 'AggregatedAPIErrors', expr: ||| - sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4 + sum by(name, namespace, %(clusterGroupLabelsStr)s)(increase(aggregator_unavailable_apiservice_total[10m])) > 4 ||| % $._config, labels: { severity: 'warning', @@ -88,7 +90,7 @@ local utils = import 'utils.libsonnet'; { alert: 'AggregatedAPIDown', expr: ||| - (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85 + (1 - max by(name, namespace, %(clusterGroupLabelsStr)s)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85 ||| % $._config, 'for': '5m', labels: { @@ -106,7 +108,12 @@ local utils = import 'utils.libsonnet'; { alert: 'KubeAPITerminatedRequests', expr: ||| - sum(rate(apiserver_request_terminations_total{%(kubeApiserverSelector)s}[10m])) / ( sum(rate(apiserver_request_total{%(kubeApiserverSelector)s}[10m])) + sum(rate(apiserver_request_terminations_total{%(kubeApiserverSelector)s}[10m])) ) > 0.20 + sum(rate(apiserver_request_terminations_total{%(kubeApiserverSelector)s}[10m])) by (%(clusterGroupLabelsStr)s) + / + ( + sum(rate(apiserver_request_total{%(kubeApiserverSelector)s}[10m])) by (%(clusterGroupLabelsStr)s) + + sum(rate(apiserver_request_terminations_total{%(kubeApiserverSelector)s}[10m])) by (%(clusterGroupLabelsStr)s) + ) > 0.20 ||| % $._config, labels: { severity: 'warning', diff --git a/alerts/kubelet.libsonnet b/alerts/kubelet.libsonnet index ec5e6b29e..ed8412901 100644 --- a/alerts/kubelet.libsonnet +++ b/alerts/kubelet.libsonnet @@ -52,11 +52,12 @@ // Some node has a capacity of 1 like AWS's Fargate and only exists while a pod is running on it. // We have to ignore this special node in the KubeletTooManyPods alert. 
expr: ||| - count by(node) ( - (kube_pod_status_phase{%(kubeStateMetricsSelector)s,phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{%(kubeStateMetricsSelector)s}) + count by(node, %(clusterGroupLabelsStr)s) ( + (kube_pod_status_phase{%(kubeStateMetricsSelector)s,phase="Running"} == 1) + * on(instance,pod,namespace,%(clusterGroupLabelsStr)s) group_left(node) topk by(instance,pod,namespace,%(clusterGroupLabelsStr)s) (1, kube_pod_info{%(kubeStateMetricsSelector)s}) ) / - max by(node) ( + max by(node,%(clusterGroupLabelsStr)s) ( kube_node_status_capacity{%(kubeStateMetricsSelector)s,resource="pods"} != 1 ) > 0.95 ||| % $._config, @@ -72,7 +73,7 @@ { alert: 'KubeNodeReadinessFlapping', expr: ||| - sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2 + sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node,%(clusterGroupLabelsStr)s) > 2 ||| % $._config, 'for': '15m', labels: { @@ -100,7 +101,7 @@ { alert: 'KubeletPodStartUpLatencyHigh', expr: ||| - histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{%(kubeletSelector)s}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{%(kubeletSelector)s} > 60 + histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{%(kubeletSelector)s}[5m])) by (instance, le, %(clusterGroupLabelsStr)s)) * on(instance, %(clusterGroupLabelsStr)s) group_left(node) kubelet_node_name{%(kubeletSelector)s} > 60 ||| % $._config, 'for': '15m', labels: { diff --git a/alerts/resource_alerts.libsonnet b/alerts/resource_alerts.libsonnet index 8a103ea60..97bbd8021 100644 --- a/alerts/resource_alerts.libsonnet +++ b/alerts/resource_alerts.libsonnet @@ -25,11 +25,11 @@ { alert: 'KubeCPUOvercommit', expr: ||| - sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) + 
sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterGroupLabelsStr)s) / - sum(kube_node_status_allocatable{resource="cpu"}) + sum(kube_node_status_allocatable{resource="cpu"}) by (%(clusterGroupLabelsStr)s) > - ((count(kube_node_status_allocatable{resource="cpu"}) > 1) - 1) / count(kube_node_status_allocatable{resource="cpu"}) + ((count(kube_node_status_allocatable{resource="cpu"}) by (%(clusterGroupLabelsStr)s) > 1) - 1) / count(kube_node_status_allocatable{resource="cpu"}) by (%(clusterGroupLabelsStr)s) ||| % $._config, labels: { severity: 'warning', @@ -43,13 +43,13 @@ { alert: 'KubeMemoryOvercommit', expr: ||| - sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) + sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterGroupLabelsStr)s) / - sum(kube_node_status_allocatable{resource="memory"}) + sum(kube_node_status_allocatable{resource="memory"}) by (%(clusterGroupLabelsStr)s) > - ((count(kube_node_status_allocatable{resource="memory"}) > 1) - 1) + ((count(kube_node_status_allocatable{resource="memory"}) by (%(clusterGroupLabelsStr)s) > 1) - 1) / - count(kube_node_status_allocatable{resource="memory"}) + count(kube_node_status_allocatable{resource="memory"}) by (%(clusterGroupLabelsStr)s) ||| % $._config, labels: { severity: 'warning', @@ -63,9 +63,9 @@ { alert: 'KubeCPUQuotaOvercommit', expr: ||| - sum(kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource="cpu"}) + sum(kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource="cpu"}) by (%(clusterGroupLabelsStr)s) / - sum(kube_node_status_allocatable{resource="cpu"}) + sum(kube_node_status_allocatable{resource="cpu"}) by (%(clusterGroupLabelsStr)s) > %(namespaceOvercommitFactor)s ||| % $._config, labels: { @@ -80,9 +80,9 @@ { alert: 
'KubeMemoryQuotaOvercommit', expr: ||| - sum(kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource="memory"}) + sum(kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource="memory"}) by (%(clusterGroupLabelsStr)s) / - sum(kube_node_status_allocatable{resource="memory",%(kubeStateMetricsSelector)s}) + sum(kube_node_status_allocatable{resource="memory",%(kubeStateMetricsSelector)s}) by (%(clusterGroupLabelsStr)s) > %(namespaceOvercommitFactor)s ||| % $._config, labels: { @@ -148,9 +148,9 @@ { alert: 'CPUThrottlingHigh', expr: ||| - sum(increase(container_cpu_cfs_throttled_periods_total{container!="", %(cpuThrottlingSelector)s}[5m])) by (container, pod, namespace) + sum(increase(container_cpu_cfs_throttled_periods_total{container!="", %(cpuThrottlingSelector)s}[5m])) by (container, pod, namespace, %(clusterGroupLabelsStr)s) / - sum(increase(container_cpu_cfs_periods_total{%(cpuThrottlingSelector)s}[5m])) by (container, pod, namespace) + sum(increase(container_cpu_cfs_periods_total{%(cpuThrottlingSelector)s}[5m])) by (container, pod, namespace, %(clusterGroupLabelsStr)s) > ( %(cpuThrottlingPercent)s / 100 ) ||| % $._config, 'for': '15m', diff --git a/alerts/system_alerts.libsonnet b/alerts/system_alerts.libsonnet index c62b34de2..9526d696f 100644 --- a/alerts/system_alerts.libsonnet +++ b/alerts/system_alerts.libsonnet @@ -11,7 +11,11 @@ { alert: 'KubeVersionMismatch', expr: ||| - count(count by (git_version) (label_replace(kubernetes_build_info{%(notKubeDnsCoreDnsSelector)s},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1 + count( + count by (git_version, %(clusterGroupLabelsStr)s) ( + label_replace(kubernetes_build_info{%(notKubeDnsCoreDnsSelector)s},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*") + ) + ) by (%(clusterGroupLabelsStr)s) > 1 ||| % $._config, 'for': '15m', labels: { @@ -28,11 +32,11 @@ // this is normal and an expected error, therefore it should 
be // ignored in this alert. expr: ||| - (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) + (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job, %(clusterGroupLabelsStr)s) / - sum(rate(rest_client_requests_total[5m])) by (instance, job)) + sum(rate(rest_client_requests_total[5m])) by (instance, job, %(clusterGroupLabelsStr)s)) > 0.01 - |||, + ||| % $._config, 'for': '15m', labels: { severity: 'warning', diff --git a/config.libsonnet b/config.libsonnet index b9fdccf48..1af51bac2 100644 --- a/config.libsonnet +++ b/config.libsonnet @@ -79,6 +79,8 @@ // Opt-in to multiCluster dashboards by overriding this and the clusterLabel. showMultiCluster: false, clusterLabel: 'cluster', + clusterGroupLabels: if self.showMultiCluster then [self.clusterLabel] else [], + clusterGroupLabelsStr: std.join(',', self.clusterGroupLabels), namespaceLabel: 'namespace',