Skip to content

Commit

Permalink
Update upstream alerts
Browse files Browse the repository at this point in the history
  • Loading branch information
github-actions[bot] committed Dec 20, 2024
1 parent 8710667 commit f490102
Show file tree
Hide file tree
Showing 10 changed files with 161 additions and 153 deletions.
65 changes: 2 additions & 63 deletions component/extracted_alerts/master/collector_prometheus_alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,78 +9,17 @@ spec:
rules:
- alert: CollectorNodeDown
annotations:
message: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m."
description: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m."
summary: "Collector cannot be scraped"
expr: |
up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0
for: 10m
labels:
service: collector
severity: critical
- alert: CollectorHighErrorRate
annotations:
message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component."
summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are high"
expr: |
100 * (
collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
/
collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
) > 0.001
for: 15m
labels:
service: collector
severity: critical
- alert: CollectorVeryHighErrorRate
annotations:
message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component."
summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are very high"
expr: |
100 * (
collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
/
collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
) > 0.05
for: 15m
labels:
service: collector
severity: critical
- alert: ElasticsearchDeprecation
annotations:
message: "In Red Hat OpenShift Logging Operator 6.0, support for the Red Hat Elasticsearch Operator has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to the Elasticsearch Operator, you can use the Loki Operator instead."
summary: "Detected Elasticsearch as the in-cluster storage, which has been removed in 6.0 release"
expr: |
sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0
for: 5m
labels:
service: storage
severity: Warning
namespace: openshift-logging
- alert: FluentdDeprecation
annotations:
message: "In Red Hat OpenShift Logging Operator 6.0, support for Fluentd as a collector has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to Fluentd, you can use the Vector collector instead."
summary: "Detected Fluentd as the collector, which has been removed in a 6.0 release"
expr: |
sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0
for: 5m
labels:
service: collector
severity: Warning
namespace: openshift-logging
- alert: KibanaDeprecation
annotations:
message: "In Red Hat OpenShift Logging Operator 6.0, support for Kibana as a data visualization dashboard has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to Kibana, you can use the Grafana Dashboard instead."
summary: "Detected Kibana as the log data visualization, which has been removed in the 6.0 release"
expr: |
sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0
for: 5m
labels:
service: visualization
severity: Warning
namespace: openshift-logging
- alert: DiskBufferUsage
annotations:
message: "Collectors potentially consuming too much node disk, {{ $value }}% "
description: "Collectors potentially consuming too much node disk, {{ $value }}% "
summary: "Detected consuming too much node disk on $labels.hostname host"
expr: |
(label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink', buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)')
Expand Down
18 changes: 18 additions & 0 deletions component/extracted_alerts/master/lokistack_prometheus_alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,24 @@ groups:
for: 15m
labels:
severity: warning
- alert: LokiDiscardedSamplesWarning
annotations:
message: |-
Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
summary: Loki is discarding samples during ingestion because they fail validation.
runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning"
expr: |
sum by(namespace, tenant, reason) (
irate(loki_discarded_samples_total{
reason!="rate_limited",
reason!="per_stream_rate_limit",
reason!="stream_limit"}[2m])
)
> 0
for: 15m
labels:
severity: warning
- alert: LokistackSchemaUpgradesRequired
annotations:
message: |-
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,3 +175,21 @@ groups:
for: 15m
labels:
severity: warning
- alert: LokiDiscardedSamplesWarning
annotations:
message: |-
Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
summary: Loki is discarding samples during ingestion because they fail validation.
runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning"
expr: |
sum by(namespace, tenant, reason) (
irate(loki_discarded_samples_total{
reason!="rate_limited",
reason!="per_stream_rate_limit",
reason!="stream_limit"}[2m])
)
> 0
for: 15m
labels:
severity: warning
Original file line number Diff line number Diff line change
Expand Up @@ -175,3 +175,21 @@ groups:
for: 15m
labels:
severity: warning
- alert: LokiDiscardedSamplesWarning
annotations:
message: |-
Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
summary: Loki is discarding samples during ingestion because they fail validation.
runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning"
expr: |
sum by(namespace, tenant, reason) (
irate(loki_discarded_samples_total{
reason!="rate_limited",
reason!="per_stream_rate_limit",
reason!="stream_limit"}[2m])
)
> 0
for: 15m
labels:
severity: warning
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,24 @@ groups:
for: 15m
labels:
severity: warning
- alert: LokiDiscardedSamplesWarning
annotations:
message: |-
Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
summary: Loki is discarding samples during ingestion because they fail validation.
runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning"
expr: |
sum by(namespace, tenant, reason) (
irate(loki_discarded_samples_total{
reason!="rate_limited",
reason!="per_stream_rate_limit",
reason!="stream_limit"}[2m])
)
> 0
for: 15m
labels:
severity: warning
- alert: LokistackSchemaUpgradesRequired
annotations:
message: |-
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,27 @@ spec:
severity: warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_LokiDiscardedSamplesWarning
annotations:
message: |-
Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning'
summary: Loki is discarding samples during ingestion because they fail
validation.
expr: |
sum by(namespace, tenant, reason) (
irate(loki_discarded_samples_total{
reason!="rate_limited",
reason!="per_stream_rate_limit",
reason!="stream_limit"}[2m])
)
> 0
for: 15m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_LokistackSchemaUpgradesRequired
annotations:
message: |-
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,27 @@ spec:
severity: warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_LokiDiscardedSamplesWarning
annotations:
message: |-
Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning'
summary: Loki is discarding samples during ingestion because they fail
validation.
expr: |
sum by(namespace, tenant, reason) (
irate(loki_discarded_samples_total{
reason!="rate_limited",
reason!="per_stream_rate_limit",
reason!="stream_limit"}[2m])
)
> 0
for: 15m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_LokistackSchemaUpgradesRequired
annotations:
message: |-
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ spec:
rules:
- alert: SYN_CollectorNodeDown
annotations:
message: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod
description: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod
}} collector component for more than 10m.
summary: Collector cannot be scraped
expr: |
Expand All @@ -23,97 +23,10 @@ spec:
severity: critical
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_CollectorHighErrorRate
annotations:
message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace
}}/{{ $labels.pod }} collector component.'
summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component
errors are high'
expr: |
100 * (
collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
/
collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
) > 0.001
for: 15m
labels:
service: collector
severity: critical
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_CollectorVeryHighErrorRate
annotations:
message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace
}}/{{ $labels.pod }} collector component.'
summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component
errors are very high'
expr: |
100 * (
collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
/
collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
) > 0.05
for: 15m
labels:
service: collector
severity: critical
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_ElasticsearchDeprecation
annotations:
message: In Red Hat OpenShift Logging Operator 6.0, support for the Red
Hat Elasticsearch Operator has been removed. Bug fixes and support are
provided only through the end of the 5.9 lifecycle. As an alternative
to the Elasticsearch Operator, you can use the Loki Operator instead.
summary: Detected Elasticsearch as the in-cluster storage, which has been
removed in 6.0 release
expr: |
sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0
for: 5m
labels:
namespace: openshift-logging
service: storage
severity: Warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_FluentdDeprecation
annotations:
message: In Red Hat OpenShift Logging Operator 6.0, support for Fluentd
as a collector has been removed. Bug fixes and support are provided
only through the end of the 5.9 lifecycle. As an alternative to Fluentd,
you can use the Vector collector instead.
summary: Detected Fluentd as the collector, which has been removed in
a 6.0 release
expr: |
sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0
for: 5m
labels:
namespace: openshift-logging
service: collector
severity: Warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_KibanaDeprecation
annotations:
message: In Red Hat OpenShift Logging Operator 6.0, support for Kibana
as a data visualization dashboard has been removed. Bug fixes and support
are provided only through the end of the 5.9 lifecycle. As an alternative
to Kibana, you can use the Grafana Dashboard instead.
summary: Detected Kibana as the log data visualization, which has been
removed in the 6.0 release
expr: |
sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0
for: 5m
labels:
namespace: openshift-logging
service: visualization
severity: Warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_DiskBufferUsage
annotations:
message: 'Collectors potentially consuming too much node disk, {{ $value
}}% '
description: 'Collectors potentially consuming too much node disk, {{
$value }}% '
summary: Detected consuming too much node disk on $labels.hostname host
expr: "(label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink',\
\ buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)') \n/ on(instance)\
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,27 @@ spec:
severity: warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_LokiDiscardedSamplesWarning
annotations:
message: |-
Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning'
summary: Loki is discarding samples during ingestion because they fail
validation.
expr: |
sum by(namespace, tenant, reason) (
irate(loki_discarded_samples_total{
reason!="rate_limited",
reason!="per_stream_rate_limit",
reason!="stream_limit"}[2m])
)
> 0
for: 15m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_LokistackSchemaUpgradesRequired
annotations:
message: |-
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,27 @@ spec:
severity: warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_LokiDiscardedSamplesWarning
annotations:
message: |-
Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion.
Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second.
runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning'
summary: Loki is discarding samples during ingestion because they fail
validation.
expr: |
sum by(namespace, tenant, reason) (
irate(loki_discarded_samples_total{
reason!="rate_limited",
reason!="per_stream_rate_limit",
reason!="stream_limit"}[2m])
)
> 0
for: 15m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-logging
- alert: SYN_LokistackSchemaUpgradesRequired
annotations:
message: |-
Expand Down

0 comments on commit f490102

Please sign in to comment.