From 5982ffd819ae0baebfc51fb4a1e3a286d9627080 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 19 Nov 2024 10:15:33 +0000 Subject: [PATCH] Update upstream alerts --- .../master/collector_prometheus_alerts.yaml | 42 +++----------- .../master/lokistack_prometheus_alerts.yaml | 18 ++++++ .../lokistack_prometheus_alerts.yaml | 18 ++++++ .../lokistack_prometheus_alerts.yaml | 18 ++++++ .../lokistack_prometheus_alerts.yaml | 18 ++++++ .../60_lokistack_alerts.yaml | 21 +++++++ .../60_lokistack_alerts.yaml | 21 +++++++ .../60_collector_alerts.yaml | 56 ++++--------------- .../60_lokistack_alerts.yaml | 21 +++++++ .../60_lokistack_alerts.yaml | 21 +++++++ 10 files changed, 173 insertions(+), 81 deletions(-) diff --git a/component/extracted_alerts/master/collector_prometheus_alerts.yaml b/component/extracted_alerts/master/collector_prometheus_alerts.yaml index 1942d35..68a555c 100644 --- a/component/extracted_alerts/master/collector_prometheus_alerts.yaml +++ b/component/extracted_alerts/master/collector_prometheus_alerts.yaml @@ -9,7 +9,7 @@ spec: rules: - alert: CollectorNodeDown annotations: - message: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m." + description: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m." summary: "Collector cannot be scraped" expr: | up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0 @@ -17,38 +17,10 @@ spec: labels: service: collector severity: critical - - alert: CollectorHighErrorRate - annotations: - message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component." - summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are high" - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.001 - for: 15m - labels: - service: collector - severity: critical - - alert: CollectorVeryHighErrorRate - annotations: - message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component." - summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are very high" - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.05 - for: 15m - labels: - service: collector - severity: critical - alert: ElasticsearchDeprecation annotations: - message: "In Red Hat OpenShift Logging Operator 6.0, support for the Red Hat Elasticsearch Operator has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to the Elasticsearch Operator, you can use the Loki Operator instead." - summary: "Detected Elasticsearch as the in-cluster storage, which has been removed in 6.0 release" + description: "In Red Hat OpenShift Logging Operator 6.0, support for the Red Hat Elasticsearch Operator has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to the Elasticsearch Operator, you can use the Loki Operator instead." + summary: "Detected Elasticsearch as the in-cluster storage, which has been removed in the 6.0 release" expr: | sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0 for: 5m @@ -58,8 +30,8 @@ spec: namespace: openshift-logging - alert: FluentdDeprecation annotations: - message: "In Red Hat OpenShift Logging Operator 6.0, support for Fluentd as a collector has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to Fluentd, you can use the Vector collector instead." - summary: "Detected Fluentd as the collector, which has been removed in a 6.0 release" + description: "In Red Hat OpenShift Logging Operator 6.0, support for Fluentd as a collector has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to Fluentd, you can use the Vector collector instead." + summary: "Detected Fluentd as the collector, which has been removed in the 6.0 release" expr: | sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0 for: 5m @@ -69,7 +41,7 @@ spec: namespace: openshift-logging - alert: KibanaDeprecation annotations: - message: "In Red Hat OpenShift Logging Operator 6.0, support for Kibana as a data visualization dashboard has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to Kibana, you can use the Grafana Dashboard instead." + description: "In Red Hat OpenShift Logging Operator 6.0, support for Kibana as a data visualization dashboard has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to Kibana, you can use the Grafana Dashboard instead." summary: "Detected Kibana as the log data visualization, which has been removed in the 6.0 release" expr: | sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0 @@ -80,7 +52,7 @@ spec: namespace: openshift-logging - alert: DiskBufferUsage annotations: - message: "Collectors potentially consuming too much node disk, {{ $value }}% " + description: "Collectors potentially consuming too much node disk, {{ $value }}% " summary: "Detected consuming too much node disk on $labels.hostname host" expr: | (label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink', buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)') diff --git a/component/extracted_alerts/master/lokistack_prometheus_alerts.yaml b/component/extracted_alerts/master/lokistack_prometheus_alerts.yaml index 15cc424..799c280 100644 --- a/component/extracted_alerts/master/lokistack_prometheus_alerts.yaml +++ b/component/extracted_alerts/master/lokistack_prometheus_alerts.yaml @@ -175,6 +175,24 @@ groups: for: 15m labels: severity: warning + - alert: LokiDiscardedSamplesWarning + annotations: + message: |- + Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion. + Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second. + summary: Loki is discarding samples during ingestion because they fail validation. + runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning" + expr: | + sum by(namespace, tenant, reason) ( + irate(loki_discarded_samples_total{ + reason!="rate_limited", + reason!="per_stream_rate_limit", + reason!="stream_limit"}[2m]) + ) + > 0 + for: 15m + labels: + severity: warning - alert: LokistackSchemaUpgradesRequired annotations: message: |- diff --git a/component/extracted_alerts/release-5.6/lokistack_prometheus_alerts.yaml b/component/extracted_alerts/release-5.6/lokistack_prometheus_alerts.yaml index f378c49..e0c49d6 100644 --- a/component/extracted_alerts/release-5.6/lokistack_prometheus_alerts.yaml +++ b/component/extracted_alerts/release-5.6/lokistack_prometheus_alerts.yaml @@ -175,3 +175,21 @@ groups: for: 15m labels: severity: warning + - alert: LokiDiscardedSamplesWarning + annotations: + message: |- + Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion. + Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second. + summary: Loki is discarding samples during ingestion because they fail validation. + runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning" + expr: | + sum by(namespace, tenant, reason) ( + irate(loki_discarded_samples_total{ + reason!="rate_limited", + reason!="per_stream_rate_limit", + reason!="stream_limit"}[2m]) + ) + > 0 + for: 15m + labels: + severity: warning diff --git a/component/extracted_alerts/release-5.8/lokistack_prometheus_alerts.yaml b/component/extracted_alerts/release-5.8/lokistack_prometheus_alerts.yaml index f378c49..e0c49d6 100644 --- a/component/extracted_alerts/release-5.8/lokistack_prometheus_alerts.yaml +++ b/component/extracted_alerts/release-5.8/lokistack_prometheus_alerts.yaml @@ -175,3 +175,21 @@ groups: for: 15m labels: severity: warning + - alert: LokiDiscardedSamplesWarning + annotations: + message: |- + Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion. + Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second. + summary: Loki is discarding samples during ingestion because they fail validation. + runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning" + expr: | + sum by(namespace, tenant, reason) ( + irate(loki_discarded_samples_total{ + reason!="rate_limited", + reason!="per_stream_rate_limit", + reason!="stream_limit"}[2m]) + ) + > 0 + for: 15m + labels: + severity: warning diff --git a/component/extracted_alerts/release-5.9/lokistack_prometheus_alerts.yaml b/component/extracted_alerts/release-5.9/lokistack_prometheus_alerts.yaml index 15cc424..799c280 100644 --- a/component/extracted_alerts/release-5.9/lokistack_prometheus_alerts.yaml +++ b/component/extracted_alerts/release-5.9/lokistack_prometheus_alerts.yaml @@ -175,6 +175,24 @@ groups: for: 15m labels: severity: warning + - alert: LokiDiscardedSamplesWarning + annotations: + message: |- + Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion. + Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second. + summary: Loki is discarding samples during ingestion because they fail validation. + runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning" + expr: | + sum by(namespace, tenant, reason) ( + irate(loki_discarded_samples_total{ + reason!="rate_limited", + reason!="per_stream_rate_limit", + reason!="stream_limit"}[2m]) + ) + > 0 + for: 15m + labels: + severity: warning - alert: LokistackSchemaUpgradesRequired annotations: message: |- diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml index 65a573e..4f6c7da 100644 --- a/tests/golden/defaults/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml +++ b/tests/golden/defaults/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml @@ -204,6 +204,27 @@ spec: severity: warning syn: 'true' syn_component: openshift4-logging + - alert: SYN_LokiDiscardedSamplesWarning + annotations: + message: |- + Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion. + Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second. + runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning' + summary: Loki is discarding samples during ingestion because they fail + validation. + expr: | + sum by(namespace, tenant, reason) ( + irate(loki_discarded_samples_total{ + reason!="rate_limited", + reason!="per_stream_rate_limit", + reason!="stream_limit"}[2m]) + ) + > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-logging - alert: SYN_LokistackSchemaUpgradesRequired annotations: message: |- diff --git a/tests/golden/legacy/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml b/tests/golden/legacy/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml index 65a573e..4f6c7da 100644 --- a/tests/golden/legacy/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml +++ b/tests/golden/legacy/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml @@ -204,6 +204,27 @@ spec: severity: warning syn: 'true' syn_component: openshift4-logging + - alert: SYN_LokiDiscardedSamplesWarning + annotations: + message: |- + Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion. + Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second. + runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning' + summary: Loki is discarding samples during ingestion because they fail + validation. + expr: | + sum by(namespace, tenant, reason) ( + irate(loki_discarded_samples_total{ + reason!="rate_limited", + reason!="per_stream_rate_limit", + reason!="stream_limit"}[2m]) + ) + > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-logging - alert: SYN_LokistackSchemaUpgradesRequired annotations: message: |- diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/60_collector_alerts.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/60_collector_alerts.yaml index 19adca5..ee443c8 100644 --- a/tests/golden/master/openshift4-logging/openshift4-logging/60_collector_alerts.yaml +++ b/tests/golden/master/openshift4-logging/openshift4-logging/60_collector_alerts.yaml @@ -12,7 +12,7 @@ spec: rules: - alert: SYN_CollectorNodeDown annotations: - message: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod + description: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m. summary: Collector cannot be scraped expr: | @@ -23,50 +23,14 @@ spec: severity: critical syn: 'true' syn_component: openshift4-logging - - alert: SYN_CollectorHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace - }}/{{ $labels.pod }} collector component.' - summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component - errors are high' - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.001 - for: 15m - labels: - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_CollectorVeryHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace - }}/{{ $labels.pod }} collector component.' - summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component - errors are very high' - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.05 - for: 15m - labels: - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - alert: SYN_ElasticsearchDeprecation annotations: - message: In Red Hat OpenShift Logging Operator 6.0, support for the Red - Hat Elasticsearch Operator has been removed. Bug fixes and support are - provided only through the end of the 5.9 lifecycle. As an alternative + description: In Red Hat OpenShift Logging Operator 6.0, support for the + Red Hat Elasticsearch Operator has been removed. Bug fixes and support + are provided only through the end of the 5.9 lifecycle. As an alternative to the Elasticsearch Operator, you can use the Loki Operator instead. summary: Detected Elasticsearch as the in-cluster storage, which has been - removed in 6.0 release + removed in the 6.0 release expr: | sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0 for: 5m @@ -78,12 +42,12 @@ spec: syn_component: openshift4-logging - alert: SYN_FluentdDeprecation annotations: - message: In Red Hat OpenShift Logging Operator 6.0, support for Fluentd + description: In Red Hat OpenShift Logging Operator 6.0, support for Fluentd as a collector has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to Fluentd, you can use the Vector collector instead. summary: Detected Fluentd as the collector, which has been removed in - a 6.0 release + the 6.0 release expr: | sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0 for: 5m @@ -95,7 +59,7 @@ spec: syn_component: openshift4-logging - alert: SYN_KibanaDeprecation annotations: - message: In Red Hat OpenShift Logging Operator 6.0, support for Kibana + description: In Red Hat OpenShift Logging Operator 6.0, support for Kibana as a data visualization dashboard has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to Kibana, you can use the Grafana Dashboard instead. @@ -112,8 +76,8 @@ spec: syn_component: openshift4-logging - alert: SYN_DiskBufferUsage annotations: - message: 'Collectors potentially consuming too much node disk, {{ $value - }}% ' + description: 'Collectors potentially consuming too much node disk, {{ + $value }}% ' summary: Detected consuming too much node disk on $labels.hostname host expr: "(label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink',\ \ buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)') \n/ on(instance)\ diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml index 65a573e..4f6c7da 100644 --- a/tests/golden/master/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml +++ b/tests/golden/master/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml @@ -204,6 +204,27 @@ spec: severity: warning syn: 'true' syn_component: openshift4-logging + - alert: SYN_LokiDiscardedSamplesWarning + annotations: + message: |- + Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion. + Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second. + runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning' + summary: Loki is discarding samples during ingestion because they fail + validation. + expr: | + sum by(namespace, tenant, reason) ( + irate(loki_discarded_samples_total{ + reason!="rate_limited", + reason!="per_stream_rate_limit", + reason!="stream_limit"}[2m]) + ) + > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-logging - alert: SYN_LokistackSchemaUpgradesRequired annotations: message: |- diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml index 65a573e..4f6c7da 100644 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml +++ b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml @@ -204,6 +204,27 @@ spec: severity: warning syn: 'true' syn_component: openshift4-logging + - alert: SYN_LokiDiscardedSamplesWarning + annotations: + message: |- + Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion. + Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second. + runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning' + summary: Loki is discarding samples during ingestion because they fail + validation. + expr: | + sum by(namespace, tenant, reason) ( + irate(loki_discarded_samples_total{ + reason!="rate_limited", + reason!="per_stream_rate_limit", + reason!="stream_limit"}[2m]) + ) + > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-logging - alert: SYN_LokistackSchemaUpgradesRequired annotations: message: |-