Update upstream alerts #146

Open · wants to merge 1 commit into base: master
@@ -9,78 +9,17 @@ spec:
rules:
- alert: CollectorNodeDown
annotations:
message: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m."
description: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m."
summary: "Collector cannot be scraped"
expr: |
up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0
for: 10m
labels:
service: collector
severity: critical
- alert: CollectorHighErrorRate
annotations:
message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component."
summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are high"
expr: |
100 * (
collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
/
collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
) > 0.001
for: 15m
labels:
service: collector
severity: critical
- alert: CollectorVeryHighErrorRate
annotations:
message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component."
summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are very high"
expr: |
100 * (
collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
/
collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
) > 0.05
for: 15m
labels:
service: collector
severity: critical
- alert: ElasticsearchDeprecation
annotations:
message: "In Red Hat OpenShift Logging Operator 6.0, support for the Red Hat Elasticsearch Operator has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to the Elasticsearch Operator, you can use the Loki Operator instead."
summary: "Detected Elasticsearch as the in-cluster storage, which has been removed in 6.0 release"
expr: |
sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0
for: 5m
labels:
service: storage
severity: Warning
namespace: openshift-logging
- alert: FluentdDeprecation
annotations:
message: "In Red Hat OpenShift Logging Operator 6.0, support for Fluentd as a collector has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to Fluentd, you can use the Vector collector instead."
summary: "Detected Fluentd as the collector, which has been removed in a 6.0 release"
expr: |
sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0
for: 5m
labels:
service: collector
severity: Warning
namespace: openshift-logging
- alert: KibanaDeprecation
annotations:
message: "In Red Hat OpenShift Logging Operator 6.0, support for Kibana as a data visualization dashboard has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to Kibana, you can use the Grafana Dashboard instead."
summary: "Detected Kibana as the log data visualization, which has been removed in the 6.0 release"
expr: |
sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0
for: 5m
labels:
service: visualization
severity: Warning
namespace: openshift-logging
- alert: DiskBufferUsage
annotations:
message: "Collectors potentially consuming too much node disk, {{ $value }}% "
description: "Collectors potentially consuming too much node disk, {{ $value }}% "
summary: "Detected consuming too much node disk on $labels.hostname host"
expr: |
(label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink', buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)')
(next file)
@@ -175,6 +175,24 @@ groups:
for: 15m
labels:
severity: warning
- alert: LokiDiscardedSamplesWarning
annotations:
message: |-
Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
summary: Loki is discarding samples during ingestion because they fail validation.
runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning"
expr: |
sum by(namespace, tenant, reason) (
irate(loki_discarded_samples_total{
reason!="rate_limited",
reason!="per_stream_rate_limit",
reason!="stream_limit"}[2m])
)
> 0
for: 15m
labels:
severity: warning
- alert: LokistackSchemaUpgradesRequired
annotations:
message: |-
(next file)
@@ -175,3 +175,21 @@ groups:
for: 15m
labels:
severity: warning
- alert: LokiDiscardedSamplesWarning
annotations:
message: |-
Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
summary: Loki is discarding samples during ingestion because they fail validation.
runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning"
expr: |
sum by(namespace, tenant, reason) (
irate(loki_discarded_samples_total{
reason!="rate_limited",
reason!="per_stream_rate_limit",
reason!="stream_limit"}[2m])
)
> 0
for: 15m
labels:
severity: warning
(next file)
@@ -175,3 +175,21 @@ groups:
for: 15m
labels:
severity: warning
- alert: LokiDiscardedSamplesWarning
annotations:
message: |-
Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
summary: Loki is discarding samples during ingestion because they fail validation.
runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning"
expr: |
sum by(namespace, tenant, reason) (
irate(loki_discarded_samples_total{
reason!="rate_limited",
reason!="per_stream_rate_limit",
reason!="stream_limit"}[2m])
)
> 0
for: 15m
labels:
severity: warning
(next file)
@@ -175,6 +175,24 @@ groups:
for: 15m
labels:
severity: warning
- alert: LokiDiscardedSamplesWarning
annotations:
message: |-
Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
summary: Loki is discarding samples during ingestion because they fail validation.
runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning"
expr: |
sum by(namespace, tenant, reason) (
irate(loki_discarded_samples_total{
reason!="rate_limited",
reason!="per_stream_rate_limit",
reason!="stream_limit"}[2m])
)
> 0
for: 15m
labels:
severity: warning
- alert: LokistackSchemaUpgradesRequired
annotations:
message: |-
(next file)
@@ -204,6 +204,27 @@ spec:
severity: warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_LokiDiscardedSamplesWarning
annotations:
message: |-
Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning'
summary: Loki is discarding samples during ingestion because they fail
validation.
expr: |
sum by(namespace, tenant, reason) (
irate(loki_discarded_samples_total{
reason!="rate_limited",
reason!="per_stream_rate_limit",
reason!="stream_limit"}[2m])
)
> 0
for: 15m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_LokistackSchemaUpgradesRequired
annotations:
message: |-
(next file)
@@ -204,6 +204,27 @@ spec:
severity: warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_LokiDiscardedSamplesWarning
annotations:
message: |-
Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning'
summary: Loki is discarding samples during ingestion because they fail
validation.
expr: |
sum by(namespace, tenant, reason) (
irate(loki_discarded_samples_total{
reason!="rate_limited",
reason!="per_stream_rate_limit",
reason!="stream_limit"}[2m])
)
> 0
for: 15m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_LokistackSchemaUpgradesRequired
annotations:
message: |-
(next file)
@@ -12,7 +12,7 @@ spec:
rules:
- alert: SYN_CollectorNodeDown
annotations:
message: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod
description: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod
}} collector component for more than 10m.
summary: Collector cannot be scraped
expr: |
@@ -23,97 +23,10 @@ spec:
severity: critical
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_CollectorHighErrorRate
annotations:
message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace
}}/{{ $labels.pod }} collector component.'
summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component
errors are high'
expr: |
100 * (
collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
/
collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
) > 0.001
for: 15m
labels:
service: collector
severity: critical
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_CollectorVeryHighErrorRate
annotations:
message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace
}}/{{ $labels.pod }} collector component.'
summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component
errors are very high'
expr: |
100 * (
collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
/
collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
) > 0.05
for: 15m
labels:
service: collector
severity: critical
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_ElasticsearchDeprecation
annotations:
message: In Red Hat OpenShift Logging Operator 6.0, support for the Red
Hat Elasticsearch Operator has been removed. Bug fixes and support are
provided only through the end of the 5.9 lifecycle. As an alternative
to the Elasticsearch Operator, you can use the Loki Operator instead.
summary: Detected Elasticsearch as the in-cluster storage, which has been
removed in 6.0 release
expr: |
sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0
for: 5m
labels:
namespace: openshift-logging
service: storage
severity: Warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_FluentdDeprecation
annotations:
message: In Red Hat OpenShift Logging Operator 6.0, support for Fluentd
as a collector has been removed. Bug fixes and support are provided
only through the end of the 5.9 lifecycle. As an alternative to Fluentd,
you can use the Vector collector instead.
summary: Detected Fluentd as the collector, which has been removed in
a 6.0 release
expr: |
sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0
for: 5m
labels:
namespace: openshift-logging
service: collector
severity: Warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_KibanaDeprecation
annotations:
message: In Red Hat OpenShift Logging Operator 6.0, support for Kibana
as a data visualization dashboard has been removed. Bug fixes and support
are provided only through the end of the 5.9 lifecycle. As an alternative
to Kibana, you can use the Grafana Dashboard instead.
summary: Detected Kibana as the log data visualization, which has been
removed in the 6.0 release
expr: |
sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0
for: 5m
labels:
namespace: openshift-logging
service: visualization
severity: Warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_DiskBufferUsage
annotations:
message: 'Collectors potentially consuming too much node disk, {{ $value
}}% '
description: 'Collectors potentially consuming too much node disk, {{
$value }}% '
summary: Detected consuming too much node disk on $labels.hostname host
expr: "(label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink',\
\ buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)') \n/ on(instance)\
(next file)
@@ -204,6 +204,27 @@ spec:
severity: warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_LokiDiscardedSamplesWarning
annotations:
message: |-
Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning'
summary: Loki is discarding samples during ingestion because they fail
validation.
expr: |
sum by(namespace, tenant, reason) (
irate(loki_discarded_samples_total{
reason!="rate_limited",
reason!="per_stream_rate_limit",
reason!="stream_limit"}[2m])
)
> 0
for: 15m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_LokistackSchemaUpgradesRequired
annotations:
message: |-
(next file)
@@ -204,6 +204,27 @@ spec:
severity: warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_LokiDiscardedSamplesWarning
annotations:
message: |-
Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning'
summary: Loki is discarding samples during ingestion because they fail
validation.
expr: |
sum by(namespace, tenant, reason) (
irate(loki_discarded_samples_total{
reason!="rate_limited",
reason!="per_stream_rate_limit",
reason!="stream_limit"}[2m])
)
> 0
for: 15m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_LokistackSchemaUpgradesRequired
annotations:
message: |-
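
A minimal promtool unit-test sketch for the new LokiDiscardedSamplesWarning rule added above (not part of this PR). It assumes the rule sits in a plain Prometheus rule file with a top-level groups: key, saved here as loki_alerts.yaml; the namespace, tenant, reason and the synthetic series are illustrative, and the expected message assumes the expression evaluates to exactly 1 discarded sample per second.

rule_files:
  - loki_alerts.yaml            # assumed path to the rule file containing the alert above
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # counter growing by 60 per minute, i.e. one discarded sample per second
      - series: 'loki_discarded_samples_total{namespace="openshift-logging", tenant="application", reason="line_too_long"}'
        values: '0+60x30'
    alert_rule_test:
      # the expression turns positive at ~1m; with for: 15m the alert should be firing by 20m
      - eval_time: 20m
        alertname: LokiDiscardedSamplesWarning
        exp_alerts:
          - exp_labels:
              severity: warning
              namespace: openshift-logging
              tenant: application
              reason: line_too_long
            exp_annotations:
              summary: Loki is discarding samples during ingestion because they fail validation.
              runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning'
              message: |-
                Loki in namespace openshift-logging is discarding samples in the "application" tenant during ingestion.
                Samples are discarded because of "line_too_long" at a rate of 1 samples per second.

Running promtool test rules against this file (for example, promtool test rules loki_alerts_test.yaml, file name assumed) should report the test as passing.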