From c6d5280e99e379eded014e87e8f13203ab776caf Mon Sep 17 00:00:00 2001 From: Stephan Feurer Date: Sat, 21 Dec 2024 15:08:21 +0100 Subject: [PATCH] Support OpenShift Logging 6.x --- alerts.txt | 18 +- class/defaults.yml | 48 +-- component/alertrules.libsonnet | 73 +--- component/app.jsonnet | 12 +- component/config_forwarding.libsonnet | 318 ------------------ component/config_logging.libsonnet | 123 ------- component/elasticsearch.libsonnet | 133 -------- .../master/collector_prometheus_alerts.yaml | 65 +--- ...sticsearch_operator_prometheus_alerts.yaml | 224 ------------ .../master/lokistack_prometheus_alerts.yaml | 18 + ...sticsearch_operator_prometheus_alerts.yaml | 224 ------------ .../fluentd_prometheus_alerts.yaml | 64 ---- .../lokistack_prometheus_alerts.yaml | 177 ---------- ...sticsearch_operator_prometheus_alerts.yaml | 224 ------------ .../fluentd_prometheus_alerts.yaml | 64 ---- .../collector_prometheus_alerts.yaml | 71 ---- ...sticsearch_operator_prometheus_alerts.yaml | 224 ------------ .../lokistack_prometheus_alerts.yaml | 177 ---------- .../collector_prometheus_alerts.yaml | 115 ------- ...sticsearch_operator_prometheus_alerts.yaml | 224 ------------ .../collector_prometheus_alerts.yaml | 45 +++ .../lokistack_prometheus_alerts.yaml | 18 + .../collector_prometheus_alerts.yaml | 45 +++ .../lokistack_prometheus_alerts.yaml | 34 ++ component/log_forwarder.libsonnet | 83 +++++ ...loki.libsonnet => log_lokistack.libsonnet} | 13 +- ...ibsonnet => log_metricsexporter.libsonnet} | 5 + ...und.libsonnet => log_workaround.libsonnet} | 18 +- component/main.jsonnet | 49 ++- component/utils.libsonnet | 33 -- ...csearchExpectNodeToReachDiskWatermark.adoc | 14 - lib/openshift4-logging.libsonnet | 18 - tests/forwardingonly.yml | 2 - .../apps/openshift4-logging.yaml | 4 + .../openshift4-logging/00_namespace.yaml | 1 + .../openshift4-logging/10_operator_group.yaml | 6 +- .../openshift4-logging/20_subscriptions.yaml | 10 +- .../30_cluster_logging.yaml | 17 - 
...ki_logstore.yaml => 30_loki_logstore.yaml} | 0 ...0_loki_netpol.yaml => 30_loki_netpol.yaml} | 0 .../{50_loki_rbac.yaml => 30_loki_rbac.yaml} | 0 ...{50_loki_stack.yaml => 30_loki_stack.yaml} | 1 + .../openshift4-logging/40_log_forwarder.yaml | 21 ++ ...r_fix.yaml => 50_fix_app_logs_reader.yaml} | 0 ...er_fix.yaml => 50_fix_ingester_stuck.yaml} | 0 ...yaml => 50_fix_missing_metrics_token.yaml} | 0 .../60_collector_alerts.yaml | 93 +---- .../60_lokistack_alerts.yaml | 21 ++ .../apps/openshift4-logging.yaml | 4 + .../openshift4-logging/00_namespace.yaml | 1 + .../openshift4-logging/10_operator_group.yaml | 6 +- .../openshift4-logging/20_subscriptions.yaml | 5 +- .../30_cluster_logging.yaml | 13 - .../60_collector_alerts.yaml | 93 +---- .../apps/openshift4-logging.yaml | 4 + .../openshift4-logging/00_namespace.yaml | 1 + .../openshift4-logging/10_operator_group.yaml | 6 +- .../openshift4-logging/20_subscriptions.yaml | 10 +- .../30_cluster_logging.yaml | 17 - ...ki_logstore.yaml => 30_loki_logstore.yaml} | 0 ...0_loki_netpol.yaml => 30_loki_netpol.yaml} | 0 .../{50_loki_rbac.yaml => 30_loki_rbac.yaml} | 0 ...{50_loki_stack.yaml => 30_loki_stack.yaml} | 1 + ...gforwarding.yaml => 40_log_forwarder.yaml} | 2 +- ...r_fix.yaml => 50_fix_app_logs_reader.yaml} | 0 ...er_fix.yaml => 50_fix_ingester_stuck.yaml} | 0 ...yaml => 50_fix_missing_metrics_token.yaml} | 0 .../60_collector_alerts.yaml | 93 +---- .../60_lokistack_alerts.yaml | 21 ++ tests/master.yml | 33 -- 70 files changed, 432 insertions(+), 3025 deletions(-) delete mode 100644 component/config_forwarding.libsonnet delete mode 100644 component/config_logging.libsonnet delete mode 100644 component/elasticsearch.libsonnet delete mode 100644 component/extracted_alerts/master/elasticsearch_operator_prometheus_alerts.yaml delete mode 100644 component/extracted_alerts/release-5.6/elasticsearch_operator_prometheus_alerts.yaml delete mode 100644 component/extracted_alerts/release-5.6/fluentd_prometheus_alerts.yaml delete 
mode 100644 component/extracted_alerts/release-5.6/lokistack_prometheus_alerts.yaml delete mode 100644 component/extracted_alerts/release-5.7/elasticsearch_operator_prometheus_alerts.yaml delete mode 100644 component/extracted_alerts/release-5.7/fluentd_prometheus_alerts.yaml delete mode 100644 component/extracted_alerts/release-5.8/collector_prometheus_alerts.yaml delete mode 100644 component/extracted_alerts/release-5.8/elasticsearch_operator_prometheus_alerts.yaml delete mode 100644 component/extracted_alerts/release-5.8/lokistack_prometheus_alerts.yaml delete mode 100644 component/extracted_alerts/release-5.9/collector_prometheus_alerts.yaml delete mode 100644 component/extracted_alerts/release-5.9/elasticsearch_operator_prometheus_alerts.yaml create mode 100644 component/extracted_alerts/release-6.0/collector_prometheus_alerts.yaml rename component/extracted_alerts/{release-5.9 => release-6.0}/lokistack_prometheus_alerts.yaml (89%) create mode 100644 component/extracted_alerts/release-6.1/collector_prometheus_alerts.yaml rename component/extracted_alerts/{release-5.7 => release-6.1}/lokistack_prometheus_alerts.yaml (79%) create mode 100644 component/log_forwarder.libsonnet rename component/{loki.libsonnet => log_lokistack.libsonnet} (90%) rename component/{logmetrics.libsonnet => log_metricsexporter.libsonnet} (85%) rename component/{loki_workaround.libsonnet => log_workaround.libsonnet} (92%) delete mode 100644 component/utils.libsonnet delete mode 100644 docs/modules/ROOT/pages/runbooks/SYN_ElasticsearchExpectNodeToReachDiskWatermark.adoc delete mode 100644 lib/openshift4-logging.libsonnet delete mode 100644 tests/golden/defaults/openshift4-logging/openshift4-logging/30_cluster_logging.yaml rename tests/golden/defaults/openshift4-logging/openshift4-logging/{50_loki_logstore.yaml => 30_loki_logstore.yaml} (100%) rename tests/golden/defaults/openshift4-logging/openshift4-logging/{50_loki_netpol.yaml => 30_loki_netpol.yaml} (100%) rename 
tests/golden/defaults/openshift4-logging/openshift4-logging/{50_loki_rbac.yaml => 30_loki_rbac.yaml} (100%) rename tests/golden/defaults/openshift4-logging/openshift4-logging/{50_loki_stack.yaml => 30_loki_stack.yaml} (97%) create mode 100644 tests/golden/defaults/openshift4-logging/openshift4-logging/40_log_forwarder.yaml rename tests/golden/defaults/openshift4-logging/openshift4-logging/{50_loki_logreader_fix.yaml => 50_fix_app_logs_reader.yaml} (100%) rename tests/golden/defaults/openshift4-logging/openshift4-logging/{50_loki_ingester_fix.yaml => 50_fix_ingester_stuck.yaml} (100%) rename tests/golden/defaults/openshift4-logging/openshift4-logging/{50_loki_operator_metrics_token.yaml => 50_fix_missing_metrics_token.yaml} (100%) delete mode 100644 tests/golden/forwardingonly/openshift4-logging/openshift4-logging/30_cluster_logging.yaml delete mode 100644 tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_cluster_logging.yaml rename tests/golden/multilineerr/openshift4-logging/openshift4-logging/{50_loki_logstore.yaml => 30_loki_logstore.yaml} (100%) rename tests/golden/multilineerr/openshift4-logging/openshift4-logging/{50_loki_netpol.yaml => 30_loki_netpol.yaml} (100%) rename tests/golden/multilineerr/openshift4-logging/openshift4-logging/{50_loki_rbac.yaml => 30_loki_rbac.yaml} (100%) rename tests/golden/multilineerr/openshift4-logging/openshift4-logging/{50_loki_stack.yaml => 30_loki_stack.yaml} (97%) rename tests/golden/multilineerr/openshift4-logging/openshift4-logging/{31_cluster_logforwarding.yaml => 40_log_forwarder.yaml} (92%) rename tests/golden/multilineerr/openshift4-logging/openshift4-logging/{50_loki_logreader_fix.yaml => 50_fix_app_logs_reader.yaml} (100%) rename tests/golden/multilineerr/openshift4-logging/openshift4-logging/{50_loki_ingester_fix.yaml => 50_fix_ingester_stuck.yaml} (100%) rename tests/golden/multilineerr/openshift4-logging/openshift4-logging/{50_loki_operator_metrics_token.yaml => 50_fix_missing_metrics_token.yaml} 
(100%) diff --git a/alerts.txt b/alerts.txt index 33bbfb7..66fc2a5 100644 --- a/alerts.txt +++ b/alerts.txt @@ -1,17 +1,7 @@ -https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.6/internal/metrics/alerts/fluentd.go.FluentdPrometheusAlert release-5.6/fluentd_prometheus_alerts.yaml -https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.7/internal/metrics/alerts/fluentd.go.FluentdPrometheusAlert release-5.7/fluentd_prometheus_alerts.yaml -https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.8/config/prometheus/collector_alerts.yaml release-5.8/collector_prometheus_alerts.yaml -https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.9/config/prometheus/collector_alerts.yaml release-5.9/collector_prometheus_alerts.yaml +https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-6.0/config/prometheus/collector_alerts.yaml release-6.0/collector_prometheus_alerts.yaml +https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-6.1/config/prometheus/collector_alerts.yaml release-6.1/collector_prometheus_alerts.yaml https://raw.githubusercontent.com/openshift/cluster-logging-operator/master/config/prometheus/collector_alerts.yaml master/collector_prometheus_alerts.yaml -https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.6/files/prometheus_alerts.yml release-5.6/elasticsearch_operator_prometheus_alerts.yaml -https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.7/files/prometheus_alerts.yml release-5.7/elasticsearch_operator_prometheus_alerts.yaml -https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.8/files/prometheus_alerts.yml release-5.8/elasticsearch_operator_prometheus_alerts.yaml -https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.8/files/prometheus_alerts.yml release-5.9/elasticsearch_operator_prometheus_alerts.yaml 
-https://raw.githubusercontent.com/openshift/elasticsearch-operator/master/files/prometheus_alerts.yml master/elasticsearch_operator_prometheus_alerts.yaml - -https://raw.githubusercontent.com/openshift/loki/release-5.6/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.6/lokistack_prometheus_alerts.yaml -https://raw.githubusercontent.com/openshift/loki/release-5.7/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.7/lokistack_prometheus_alerts.yaml -https://raw.githubusercontent.com/openshift/loki/release-5.8/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.8/lokistack_prometheus_alerts.yaml -https://raw.githubusercontent.com/openshift/loki/release-5.9/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.9/lokistack_prometheus_alerts.yaml +https://raw.githubusercontent.com/openshift/loki/release-6.0/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-6.0/lokistack_prometheus_alerts.yaml +https://raw.githubusercontent.com/openshift/loki/release-6.1/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-6.1/lokistack_prometheus_alerts.yaml https://raw.githubusercontent.com/openshift/loki/main/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml master/lokistack_prometheus_alerts.yaml diff --git a/class/defaults.yml b/class/defaults.yml index 6b91e13..bcb6bcb 100644 --- a/class/defaults.yml +++ b/class/defaults.yml @@ -8,9 +8,9 @@ parameters: "False": {} namespace: openshift-logging - version: '5.9' + + version: '6.0' channel: 'stable-${openshift4_logging:version}' - alerts: 'release-${openshift4_logging:version}' components: lokistack: @@ -42,16 +42,6 @@ parameters: ingestion: ingestionBurstSize: 9 ingestionRate: 5 - elasticsearch: - enabled: false - kibana_host: null - predict_elasticsearch_storage_alert: - enabled: true - lookback_range: 72h - predict_hours_from_now: 72 - threshold: 85 - for: 6h - 
severity: warning logmetrics: enabled: false spec: @@ -65,14 +55,20 @@ parameters: cpu: 200m memory: 128Mi - clusterLogging: {} clusterLogForwarder: {} - namespaceLogForwarderEnabled: false - namespaceLogForwarder: {} - secrets: {} + alerts: + release: 'release-${openshift4_logging:version}' + ignore: + - ElasticsearchHighFileDescriptorUsage + - ElasticsearchOperatorCSVNotSuccessful + - FluentdQueueLengthIncreasing + patch: + FluentdQueueLengthIncreasing: + for: '12h' + operatorResources: clusterLogging: requests: @@ -86,12 +82,6 @@ parameters: cpu: 50m limits: memory: 512Mi - elasticsearch: - requests: - memory: 1Gi - cpu: 100m - limits: - memory: 1.5Gi images: kubectl: @@ -104,17 +94,5 @@ parameters: schedule: '*/10 * * * *' sleep_time: 2m - ignore_alerts: - - ElasticsearchHighFileDescriptorUsage - - ElasticsearchOperatorCSVNotSuccessful - - FluentdQueueLengthIncreasing - - patch_alerts: - FluentdQueueLengthIncreasing: - for: '12h' - - openshift4_elasticsearch_operator: - targetNamespaces: - - ${openshift4_logging:namespace} - openshift4_console: ${openshift4_logging:_openshift4_console:${openshift4_logging:components:lokistack:enabled}} + diff --git a/component/alertrules.libsonnet b/component/alertrules.libsonnet index 2182703..9763fd2 100644 --- a/component/alertrules.libsonnet +++ b/component/alertrules.libsonnet @@ -2,12 +2,10 @@ local alertpatching = import 'lib/alert-patching.libsonnet'; local com = import 'lib/commodore.libjsonnet'; local kap = import 'lib/kapitan.libjsonnet'; local kube = import 'lib/kube.libjsonnet'; -local utils = import 'utils.libsonnet'; local inv = kap.inventory(); local params = inv.parameters.openshift4_logging; -local elasticsearch = inv.parameters.openshift4_logging.components.elasticsearch; -local loki = inv.parameters.openshift4_logging.components.lokistack; +local lokiEnabled = params.components.lokistack.enabled; local runbook(alertname) = 'https://hub.syn.tools/openshift4-logging/runbooks/%s.html' % alertname; @@ -16,63 
+14,26 @@ assert std.member(inv.applications, 'openshift4-monitoring') : 'Component `openshift4-monitoring` not enabled'; -// Keep config backwards compatible -local predict_storage_alert = elasticsearch.predict_elasticsearch_storage_alert + ( - if std.objectHas(params, 'predict_elasticsearch_storage_alert') then - std.trace( - 'parameter predict_elasticsearch_storage_alert is deprecated, please use parameter `components.elasticsearch.predict_elasticsearch_storage_alert instead`', - com.makeMergeable(params.predict_elasticsearch_storage_alert) - ) - else {} -); - +// Upstream alerts to ignore // Keep only alerts from params.ignore_alerts for which the last // array entry wasn't prefixed with `~`. -local user_ignore_alerts = com.renderArray(params.ignore_alerts); - -// Upstream alerts to ignore local ignore_alerts = std.set( // Add set of upstream alerts that should be ignored from processed value of // `params.ignore_alerts` - user_ignore_alerts + com.renderArray(params.alerts.ignore + std.get(params, 'ignore_alerts', [])) ); // Alert rule patches. // Provide partial objects for alert rules that need to be tuned compared to // upstream. The keys in this object correspond to the `alert` field of the // rule for which the patch is intended. 
-local patch_alerts = params.patch_alerts; +local patch_alerts = params.alerts.patch + std.get(params, 'patch_alerts', {}); local loadFile(file) = - local fpath = 'openshift4-logging/component/extracted_alerts/%s/%s' % [ params.alerts, file ]; + local fpath = 'openshift4-logging/component/extracted_alerts/%s/%s' % [ params.alerts.release, file ]; std.parseJson(kap.yaml_load_stream(fpath)); - -// This will be processed by filter_patch_rules() as well -local predictESStorage = { - local alertName = 'ElasticsearchExpectNodeToReachDiskWatermark', - local hoursFromNow = predict_storage_alert.predict_hours_from_now, - local secondsFromNow = hoursFromNow * 3600, - alert: alertName, - annotations: { - message: ( - 'Expecting to reach disk low watermark at {{ $labels.node }} node in {{ $labels.cluster }} cluster in %s hours.' - + ' When reaching the watermark no new shards will be allocated to this node anymore. You should consider adding more disk to the node.' - ) % std.toString(hoursFromNow), - runbook_url: runbook('SYN_' + alertName), - summary: 'Expecting to Reach Disk Low Watermark in %s Hours' % std.toString(hoursFromNow), - }, - expr: ||| - sum by(cluster, instance, node) ( - (1 - (predict_linear(es_fs_path_available_bytes[%s], %s) / es_fs_path_total_bytes)) * 100 - ) > %s - ||| % [ predict_storage_alert.lookback_range, std.toString(secondsFromNow), std.toString(predict_storage_alert.threshold) ], - 'for': predict_storage_alert['for'], - labels: { - severity: predict_storage_alert.severity, - }, -}; - local renderRunbookBaseURL(group, baseURL) = { name: group.name, rules: std.map( @@ -119,23 +80,6 @@ local prometheus_rules(name, groups, baseURL) = kube._Object('monitoring.coreos. 
}, }; - -// Elasticstack alerts - -local esStorageGroup = { - name: 'elasticsearch_node_storage.alerts', - rules: [ predictESStorage ], -}; -local fluentdGroup = if !utils.isVersion58 then loadFile('fluentd_prometheus_alerts.yaml')[0].groups else []; - -local esGroups = - loadFile('elasticsearch_operator_prometheus_alerts.yaml')[0].groups + - fluentdGroup + - [ - if predict_storage_alert.enabled then esStorageGroup, - ]; -local esBaseURL = 'https://github.com/openshift/elasticsearch-operator/blob/master/docs/alerts.md'; - // Lokistack alerts local lokiGroups = loadFile('lokistack_prometheus_alerts.yaml')[0].groups; @@ -146,7 +90,6 @@ local lokiBaseURL = 'https://github.com/grafana/loki/blob/main/operator/docs/lok local collectorGroups = loadFile('collector_prometheus_alerts.yaml')[0].spec.groups; { - [if elasticsearch.enabled then '60_elasticsearch_alerts']: prometheus_rules('syn-elasticsearch-logging-rules', esGroups, esBaseURL), - [if loki.enabled then '60_lokistack_alerts']: prometheus_rules('syn-loki-logging-rules', lokiGroups, lokiBaseURL), - [if utils.isVersion58 then '60_collector_alerts']: prometheus_rules('syn-collector-rules', collectorGroups, ''), + [if lokiEnabled then '60_lokistack_alerts']: prometheus_rules('syn-loki-logging-rules', lokiGroups, lokiBaseURL), + '60_collector_alerts': prometheus_rules('syn-collector-rules', collectorGroups, ''), } diff --git a/component/app.jsonnet b/component/app.jsonnet index 105fbf1..bd5d9f0 100644 --- a/component/app.jsonnet +++ b/component/app.jsonnet @@ -3,8 +3,14 @@ local inv = kap.inventory(); local params = inv.parameters.openshift4_logging; local argocd = import 'lib/argocd.libjsonnet'; -local app = argocd.App('openshift4-logging', params.namespace); - { - 'openshift4-logging': app, + 'openshift4-logging': argocd.App('openshift4-logging', params.namespace) { + spec+: { + syncPolicy+: { + syncOptions+: [ + 'ServerSideApply=true', + ], + }, + }, + }, } diff --git a/component/config_forwarding.libsonnet 
b/component/config_forwarding.libsonnet deleted file mode 100644 index c1cf1e5..0000000 --- a/component/config_forwarding.libsonnet +++ /dev/null @@ -1,318 +0,0 @@ -local com = import 'lib/commodore.libjsonnet'; -local kap = import 'lib/kapitan.libjsonnet'; -local kube = import 'lib/kube.libjsonnet'; -local lib = import 'lib/openshift4-logging.libsonnet'; -local utils = import 'utils.libsonnet'; - -local inv = kap.inventory(); -local params = inv.parameters.openshift4_logging; - -local deployLokistack = params.components.lokistack.enabled; -local deployElasticsearch = params.components.elasticsearch.enabled; -local forwardingOnly = !deployLokistack && !deployElasticsearch; - -// Make sure the default output is added to the pipelines `outputRefs`, -// if the logging stack is not disabled. -local pipelineOutputRefs(pipeline) = - local default = if forwardingOnly then [] else [ 'default' ]; - std.get(pipeline, 'forwarders', []) + default; - -// ----------------------------------------------------------------------------- -// Legacy Rendering -// ----------------------------------------------------------------------------- - -local legacyConfig = std.get(params, 'clusterLogForwarding', {}); -local hasLegacyConfig = if std.length(legacyConfig) > 0 then std.trace( - 'Parameter `clusterLogForwarding` is deprecated. Please update your config to use `clusterLogForwarder`', - true -) else false; - -// Apply default config for application logs. -local patchLegacyAppLogDefaults = { - local pipeline = std.get(legacyConfig, 'application_logs', { enabled: true }), - local pipelineOutputs = pipelineOutputRefs(pipeline), - local pipelineEnabled = std.length(pipelineOutputs) > 0, - - [if hasLegacyConfig then 'pipelines']: { - [if pipelineEnabled then 'application-logs']: { - inputRefs: [ 'application' ], - outputRefs: pipelineOutputs, - }, - }, -}; - -// Apply default config for infra logs. 
-local patchLegacyInfraLogDefaults = { - local pipeline = { enabled: true } + std.get(legacyConfig, 'infrastructure_logs', {}), - local pipelineOutputs = pipelineOutputRefs(pipeline), - local pipelineEnabled = pipeline.enabled && std.length(pipelineOutputs) > 0, - - [if hasLegacyConfig then 'pipelines']: { - [if pipelineEnabled then 'infrastructure-logs']: { - inputRefs: [ 'infrastructure' ], - outputRefs: pipelineOutputs, - }, - }, -}; - -// Apply default config for audit logs. -local patchLegacyAuditLogDefaults = { - local pipeline = std.get(legacyConfig, 'audit_logs', { enabled: false }), - local pipelineOutputs = pipelineOutputRefs(pipeline), - local pipelineEnabled = pipeline.enabled && std.length(pipelineOutputs) > 0, - - [if hasLegacyConfig then 'pipelines']: { - [if pipelineEnabled then 'audit-logs']: { - inputRefs: [ 'audit' ], - outputRefs: pipelineOutputs, - }, - }, -}; - -// Enable json parsing for default pipelines if configured. -local legacyEnableJson = std.get(std.get(legacyConfig, 'json', {}), 'enabled', false); -local patchLegacyJsonLogging = { - local enableAppLogs = std.get(std.get(legacyConfig, 'application_logs', {}), 'json', false), - local enableInfraLogs = std.get(std.get(legacyConfig, 'infrastructure_logs', {}), 'json', false), - - [if hasLegacyConfig then 'pipelines']: { - [if enableAppLogs then 'application-logs']: { parse: 'json' }, - [if enableInfraLogs then 'infrastructure-logs']: { parse: 'json' }, - }, - [if deployElasticsearch && legacyEnableJson then 'outputDefaults']: { - elasticsearch: { - structuredTypeKey: std.get(legacyConfig.json, 'typekey', 'kubernetes.labels.logFormat'), - structuredTypeName: std.get(legacyConfig.json, 'typename', 'nologformat'), - }, - }, -}; - -// Enable detectMultilineErrors for default pipelines if configured. 
-local patchLegacyMultilineErrors = { - local enableAppLogs = std.get(std.get(legacyConfig, 'application_logs', {}), 'detectMultilineErrors', false), - local enableInfraLogs = std.get(std.get(legacyConfig, 'infrastructure_logs', {}), 'detectMultilineErrors', false), - - [if hasLegacyConfig then 'pipelines']: { - [if enableAppLogs then 'application-logs']: { detectMultilineErrors: true }, - [if enableInfraLogs then 'infrastructure-logs']: { detectMultilineErrors: true }, - }, -}; - -// --- patch deprecated `clusterLogForwarding.namespace` config -local namespaceGroups = ( - if std.objectHas(legacyConfig, 'namespaces') then - { - [ns]: { - namespaces: [ ns ], - forwarders: [ legacyConfig.namespaces[ns].forwarder ], - } - for ns in std.objectFields(legacyConfig.namespaces) - } else {} -) + std.get(legacyConfig, 'namespace_groups', {}); -// --- patch end - -// Add inputs entry for every namespace_group defined in `clusterLogForwarding.namespace_groups`. -local patchLegacyCustomInputs = { - [if std.length(namespaceGroups) > 0 then 'inputs']: { - [group]: { - application: { - namespaces: namespaceGroups[group].namespaces, - }, - } - for group in std.objectFields(namespaceGroups) - if hasLegacyConfig - }, -}; - -// Add pipelines entry for every namespace_group defined in `clusterLogForwarding.namespace_groups`. -local patchLegacyCustomPipelines = { - [if std.length(namespaceGroups) > 0 then 'pipelines']: { - local enableJson = std.get(namespaceGroups[group], 'json', false), - local enableMultilineError = std.get(namespaceGroups[group], 'detectMultilineErrors', false), - - [group]: { - inputRefs: [ group ], - outputRefs: std.get(namespaceGroups[group], 'forwarders', []), - [if enableJson then 'parse']: 'json', - [if enableMultilineError then 'detectMultilineErrors']: true, - } - for group in std.objectFields(namespaceGroups) - if hasLegacyConfig - }, -}; - -// Add outputs entry for every forwarder defined in `clusterLogForwarding.forwarders`. 
-local patchLegacyCustomOutputs = { - [if std.length(std.get(legacyConfig, 'forwarders', {})) > 0 then 'outputs']: { - [name]: legacyConfig.forwarders[name] - for name in std.objectFields(legacyConfig.forwarders) - if hasLegacyConfig - }, -}; - -// ----------------------------------------------------------------------------- -// End Legacy Rendering -// ----------------------------------------------------------------------------- - -// Add defaults to pipelines config -local patchPipelineDefaults = { - local appsPipeline = std.get(std.get(params.clusterLogForwarder, 'pipelines', {}), 'application-logs', {}), - local infraPipeline = std.get(std.get(params.clusterLogForwarder, 'pipelines', {}), 'infrastructure-logs', {}), - local auditPipeline = std.get(std.get(params.clusterLogForwarder, 'pipelines', {}), 'audit-logs', {}), - - pipelines: { - [if !forwardingOnly || std.length(appsPipeline) > 0 then 'application-logs']: { - inputRefs: [ 'application' ], - outputRefs: pipelineOutputRefs(appsPipeline), - }, - [if !forwardingOnly || std.length(infraPipeline) > 0 then 'infrastructure-logs']: { - inputRefs: [ 'infrastructure' ], - outputRefs: pipelineOutputRefs(infraPipeline), - }, - [if std.length(auditPipeline) > 0 then 'audit-logs']: { - inputRefs: [ 'audit' ], - }, - }, -}; - -// clusterLogForwarderSpec: -// Consecutively apply patches to result of previous apply. -local clusterLogForwarderSpec = std.foldl( - // we use std.mergePatch here, because this way we don't need - // to make each patch object mergeable by suffixing all keys with a +. 
- function(manifest, patch) std.mergePatch(manifest, patch), - [ - patchPipelineDefaults, - // Apply legacy patches / defaults - patchLegacyAppLogDefaults, - patchLegacyInfraLogDefaults, - patchLegacyAuditLogDefaults, - patchLegacyJsonLogging, - patchLegacyMultilineErrors, - patchLegacyCustomInputs, - patchLegacyCustomOutputs, - patchLegacyCustomPipelines, - ], - { - inputs: {}, - outputs: {}, - pipelines: {}, - }, -) + com.makeMergeable(params.clusterLogForwarder); - -// Unfold objects into array for ClusterLogForwarder resource. -local unfoldSpecs(specs) = { - // Unfold objects into array. - [if std.length(specs.inputs) > 0 then 'inputs']: [ - { name: name } + specs.inputs[name] - for name in std.objectFields(specs.inputs) - ], - [if std.length(specs.outputs) > 0 then 'outputs']: [ - { name: name } + specs.outputs[name] - for name in std.objectFields(specs.outputs) - ], - [if std.length(specs.pipelines) > 0 then 'pipelines']: [ - { name: name } + specs.pipelines[name] - for name in std.objectFields(specs.pipelines) - ], -} + { - // Import remaining specs as is. - [key]: specs[key] - for key in std.objectFields(specs) - if !std.member([ 'inputs', 'outputs', 'pipelines' ], key) -}; - -// ClusterLogForwarder: -// Create definitive ClusterLogForwarder resource from specs. -local clusterLogForwarder = lib.ClusterLogForwarder(params.namespace, 'instance') { - metadata+: { - annotations+: { - 'argocd.argoproj.io/sync-options': 'SkipDryRunOnMissingResource=true', - }, - }, - spec: unfoldSpecs(clusterLogForwarderSpec), -}; - -// namespaceLogForwarderIgnoreKeys -// List of keys to ignore in namespaceLogForwarder -local namespaceLogForwarderIgnoreKeys = [ - 'instance', - 'openshift-logging/instance', -]; -// namespaceLogForwarder: -// Create namespaced LogForwarder resource from specs. 
-local namespaceLogForwarder = [ - local specs = { inputs: {}, outputs: {}, pipelines: {} } + com.makeMergeable(params.namespaceLogForwarder[forwarder]); - local name = utils.namespacedName(forwarder).name; - local namespace = utils.namespacedName(forwarder).namespace; - local serviceAccount = std.get(specs, 'serviceAccountName', utils.namespacedName(forwarder).name); - - lib.ClusterLogForwarder(namespace, name) { - metadata+: { - annotations+: { - 'argocd.argoproj.io/sync-options': 'SkipDryRunOnMissingResource=true', - }, - }, - spec: { serviceAccountName: serviceAccount } + com.makeMergeable(unfoldSpecs(specs)), - } - for forwarder in std.objectFields(params.namespaceLogForwarder) - if !std.member(namespaceLogForwarderIgnoreKeys, forwarder) -]; - -// namespaceServiceAccount: -// Create ServiceAccount for namespaced LogForwarder specs. -local namespaceServiceAccount = [ - local specs = params.namespaceLogForwarder[forwarder]; - local namespace = utils.namespacedName(forwarder).namespace; - local serviceAccount = std.get(specs, 'serviceAccountName', utils.namespacedName(forwarder).name); - - kube.ServiceAccount(serviceAccount) { - metadata+: { - namespace: namespace, - }, - } - for forwarder in std.objectFields(params.namespaceLogForwarder) - if !std.member(namespaceLogForwarderIgnoreKeys, forwarder) -]; - -// namespaceRoleBinding: -// Create RoleBinding for namespaced LogForwarder. 
-local namespaceRoleBinding = [ - local specs = params.namespaceLogForwarder[forwarder]; - local namespace = utils.namespacedName(forwarder).namespace; - local serviceAccount = std.get(specs, 'serviceAccountName', utils.namespacedName(forwarder).name); - - kube.RoleBinding(serviceAccount) { - metadata+: { - namespace: namespace, - }, - roleRef: { - apiGroup: 'rbac.authorization.k8s.io', - kind: 'ClusterRole', - name: 'collect-application-logs', - }, - subjects: [ { - kind: 'ServiceAccount', - name: serviceAccount, - namespace: namespace, - } ], - } - for forwarder in std.objectFields(params.namespaceLogForwarder) - if !std.member(namespaceLogForwarderIgnoreKeys, forwarder) -]; - -local enableLogForwarder = std.length(params.clusterLogForwarder) > 0 || std.get(legacyConfig, 'enabled', false); - -// Define outputs below -if enableLogForwarder then - { - '31_cluster_logforwarding': clusterLogForwarder, - [if std.length(params.namespaceLogForwarder) > 1 then '32_namespace_logforwarding']: namespaceLogForwarder, - [if std.length(params.namespaceLogForwarder) > 1 then '32_namespace_serviceaccount']: namespaceServiceAccount, - [if std.length(params.namespaceLogForwarder) > 1 then '32_namespace_rolebinding']: namespaceRoleBinding, - } -else - std.trace( - 'Log forwarding disabled, not deploying ClusterLogForwarder', - {} - ) diff --git a/component/config_logging.libsonnet b/component/config_logging.libsonnet deleted file mode 100644 index 1945fce..0000000 --- a/component/config_logging.libsonnet +++ /dev/null @@ -1,123 +0,0 @@ -local kap = import 'lib/kapitan.libjsonnet'; -local lib = import 'lib/openshift4-logging.libsonnet'; - -local inv = kap.inventory(); -local params = inv.parameters.openshift4_logging; - -local deployLokistack = params.components.lokistack.enabled; -local deployElasticsearch = params.components.elasticsearch.enabled; - -// Apply defaults for Lokistack. 
-local patchLokistackDefaults = { - [if deployLokistack then 'spec']: { - logStore: { - type: 'lokistack', - lokistack: { - name: 'loki', - }, - }, - }, -}; - -// Apply defaults for Elasticsearch. -local patchElasticsearchDefaults = { - [if deployElasticsearch then 'spec']: { - logStore: { - elasticsearch: { - nodeCount: 3, - storage: { - size: '200Gi', - }, - redundancyPolicy: 'SingleRedundancy', - nodeSelector: { - 'node-role.kubernetes.io/infra': '', - }, - }, - retentionPolicy: { - application: { - maxAge: '7d', - pruneNamespacesInterval: '15m', - }, - infra: { - maxAge: '30d', - pruneNamespacesInterval: '15m', - }, - audit: { - maxAge: '30d', - pruneNamespacesInterval: '15m', - }, - }, - }, - visualization: { - type: 'kibana', - kibana: { - replicas: 2, - nodeSelector: { - 'node-role.kubernetes.io/infra': '', - }, - }, - }, - }, -}; - -// Apply customisations from params.clusterLogging. -local patchLoggingConfig = { - spec: params.clusterLogging { - collection: { - // Don't include legacy config key 'collection.logs'. - [it]: params.clusterLogging.collection[it] - for it in std.objectFields(std.get(params.clusterLogging, 'collection', {})) - if it != 'logs' - }, - }, -}; - -// --- patch deprecated logging resource -local patchLegacyConfig = { - local legacyConfig = std.get(std.get(params.clusterLogging, 'collection', { collection: {} }), 'logs', {}), - local legacyType = std.get(legacyConfig, 'type', ''), - local legacyFluentd = std.get(legacyConfig, 'fluentd', {}), - - spec: { - collection: if std.length(legacyConfig) > 0 then std.trace( - 'Parameter `clusterLogging.collector.logs` is deprecated. Please update your config to use `clusterLogging.collector`', - { - [if legacyType != '' then 'type']: legacyType, - } + legacyFluentd, - ) else {}, - }, -}; -// --- patch end - - -// ClusterLogging specs: -// Consecutively apply patches to result of previous apply. 
-local clusterLogging = std.foldl( - // we use std.mergePatch here, because this way we don't need - // to make each patch object mergeable by suffixing all keys with a +. - function(manifest, patch) std.mergePatch(manifest, patch), - [ - patchLokistackDefaults, - patchElasticsearchDefaults, - patchLoggingConfig, - patchLegacyConfig, - ], - lib.ClusterLogging(params.namespace, 'instance') { - metadata+: { - annotations+: { - 'argocd.argoproj.io/sync-options': 'SkipDryRunOnMissingResource=true', - }, - }, - spec: { - managementState: 'Managed', - collection: { - type: 'vector', - }, - }, - } -); - -// Define outputs below -{ - '30_cluster_logging': clusterLogging, -} diff --git a/component/elasticsearch.libsonnet b/component/elasticsearch.libsonnet deleted file mode 100644 index 1e726b2..0000000 --- a/component/elasticsearch.libsonnet +++ /dev/null @@ -1,133 +0,0 @@ -// main template for openshift4-lokistack -local kap = import 'lib/kapitan.libjsonnet'; -local kube = import 'lib/kube.libjsonnet'; -local resourceLocker = import 'lib/resource-locker.libjsonnet'; - -// The hiera parameters for the component -local inv = kap.inventory(); -local params = inv.parameters.openshift4_logging; -local elasticsearch = inv.parameters.openshift4_logging.components.elasticsearch; - - -local machineconfig_journald = [ - kube._Object('machineconfiguration.openshift.io/v1', 'MachineConfig', '40-' + role + '-journald') { - metadata+: { - labels+: { - 'machineconfiguration.openshift.io/role': role, - }, - }, - spec: { - config: { - ignition: { - version: '2.2.0', - }, - storage: { - files: [ - { - contents: { - // See https://docs.openshift.com/container-platform/latest/logging/config/cluster-logging-systemd.html - source: 'data:text/plain;charset=utf-8;base64,' + std.base64(||| - MaxRetentionSec=1month - RateLimitBurst=10000 - RateLimitInterval=1s - Storage=persistent - SyncIntervalSec=1s - |||), - }, - filesystem: 'root', - mode: 420, - path: '/etc/systemd/journald.conf', - }, - ], - 
}, - }, - }, - } - for role in [ 'master', 'worker' ] -]; - -// Allow cluster-scoped ES operator to access ES pods in openshift-logging -local netpol_operator = kube.NetworkPolicy('allow-from-openshift-operators-redhat') { - spec: { - ingress: [ - { - from: [ - { - namespaceSelector: { - matchLabels: { - name: 'openshift-operators-redhat', - }, - }, - }, - { - podSelector: { - matchLabels: { - name: 'elasticsearch-operator', - }, - }, - }, - ], - }, - ], - podSelector: {}, - policyTypes: [ 'Ingress' ], - }, -}; - -// Keep config backwards compatible -local kibana_host = - if std.objectHas(params, 'kibana_host') then - std.trace( - 'parameter kibana_host is deprecated, please use parameter `components.elasticsearch.kibana_host instead`', - params.kibana_host - ) - else elasticsearch.kibana_host; - -local kibana_routeToPatch = kube._Object('route.openshift.io/v1', 'Route', 'kibana') { - metadata+: { - namespace: inv.parameters.openshift4_logging.namespace, - }, -}; - -local kibana_patch = resourceLocker.Patch(kibana_routeToPatch, { - spec: { - host: kibana_host, - }, -}); - -// OpenShift has custom RBAC permissions on routes if you want to set a host ┻━┻︵ヽ(`Д´)ノ︵ ┻━┻ -local kibana_patchWithAdditionalPermissions = std.map( - function(obj) - if obj.apiVersion == 'rbac.authorization.k8s.io/v1' && obj.kind == 'Role' then - obj { - rules+: [ - { - apiGroups: [ - 'route.openshift.io', - ], - resources: [ - 'routes/custom-host', - ], - verbs: [ - '*', - ], - }, - ], - } - else - obj - , kibana_patch -); - -// Define outputs below -if elasticsearch.enabled then - { - '40_es_machineconfig': machineconfig_journald, - '40_es_netpol': netpol_operator, - [if kibana_host != null then '40_es_kibana_host']: kibana_patchWithAdditionalPermissions, - } -else - std.trace( - 'Elasticsearch disabled, not deploying Elasticsearch stack', - {} - ) diff --git a/component/extracted_alerts/master/collector_prometheus_alerts.yaml 
b/component/extracted_alerts/master/collector_prometheus_alerts.yaml index 1942d35..2d5cdf8 100644 --- a/component/extracted_alerts/master/collector_prometheus_alerts.yaml +++ b/component/extracted_alerts/master/collector_prometheus_alerts.yaml @@ -9,7 +9,7 @@ spec: rules: - alert: CollectorNodeDown annotations: - message: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m." + description: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m." summary: "Collector cannot be scraped" expr: | up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0 @@ -17,70 +17,9 @@ spec: labels: service: collector severity: critical - - alert: CollectorHighErrorRate - annotations: - message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component." - summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are high" - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.001 - for: 15m - labels: - service: collector - severity: critical - - alert: CollectorVeryHighErrorRate - annotations: - message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component." 
- summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are very high" - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.05 - for: 15m - labels: - service: collector - severity: critical - - alert: ElasticsearchDeprecation - annotations: - message: "In Red Hat OpenShift Logging Operator 6.0, support for the Red Hat Elasticsearch Operator has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to the Elasticsearch Operator, you can use the Loki Operator instead." - summary: "Detected Elasticsearch as the in-cluster storage, which has been removed in 6.0 release" - expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0 - for: 5m - labels: - service: storage - severity: Warning - namespace: openshift-logging - - alert: FluentdDeprecation - annotations: - message: "In Red Hat OpenShift Logging Operator 6.0, support for Fluentd as a collector has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to Fluentd, you can use the Vector collector instead." - summary: "Detected Fluentd as the collector, which has been removed in a 6.0 release" - expr: | - sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0 - for: 5m - labels: - service: collector - severity: Warning - namespace: openshift-logging - - alert: KibanaDeprecation - annotations: - message: "In Red Hat OpenShift Logging Operator 6.0, support for Kibana as a data visualization dashboard has been removed. Bug fixes and support are provided only through the end of the 5.9 lifecycle. As an alternative to Kibana, you can use the Grafana Dashboard instead." 
- summary: "Detected Kibana as the log data visualization, which has been removed in the 6.0 release" - expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0 - for: 5m - labels: - service: visualization - severity: Warning - namespace: openshift-logging - alert: DiskBufferUsage annotations: - message: "Collectors potentially consuming too much node disk, {{ $value }}% " + description: "Collectors potentially consuming too much node disk, {{ $value }}% " summary: "Detected consuming too much node disk on $labels.hostname host" expr: | (label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink', buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)') diff --git a/component/extracted_alerts/master/elasticsearch_operator_prometheus_alerts.yaml b/component/extracted_alerts/master/elasticsearch_operator_prometheus_alerts.yaml deleted file mode 100644 index 8f79010..0000000 --- a/component/extracted_alerts/master/elasticsearch_operator_prometheus_alerts.yaml +++ /dev/null @@ -1,224 +0,0 @@ ---- -"groups": -- "name": logging_elasticsearch.alerts - "rules": - - "alert": ElasticsearchClusterNotHealthy - "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been RED for at least 7m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet." - "summary": "Cluster health status is RED" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Red" - "expr": | - sum by (cluster) (es_cluster_status == 2) - "for": 7m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchClusterNotHealthy - "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Some shard replicas are not allocated." 
- "summary": "Cluster health status is YELLOW" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Yellow" - "expr": | - sum by (cluster) (es_cluster_status == 1) - "for": 20m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchWriteRequestsRejectionJumps - "annotations": - "message": "High Write Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed." - "summary": "High Write Rejection Ratio - {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Write-Requests-Rejection-Jumps" - "expr": | - round( writing:reject_ratio:rate2m * 100, 0.001 ) > 5 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Low Watermark Reached at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node." - "summary": "Disk Low Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk High Watermark Reached at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node." 
- "summary": "Disk High Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Flood Stage Watermark Reached at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark." - "summary": "Disk Flood Stage Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchJVMHeapUseHigh - "annotations": - "message": "JVM Heap usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." - "summary": "JVM Heap usage on the node is high" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-JVM-Heap-Use-is-High" - "expr": | - sum by (cluster, instance, node) (es_jvm_mem_heap_used_percent) > 75 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": AggregatedLoggingSystemCPUHigh - "annotations": - "message": "System CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." 
- "summary": "System CPU usage is high" - "runbook_url": "[[.RunbookBaseURL]]#Aggregated-Logging-System-CPU-is-High" - "expr": | - sum by (cluster, instance, node) (es_os_cpu_percent) > 90 - "for": 1m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchProcessCPUHigh - "annotations": - "message": "ES process CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." - "summary": "ES process CPU usage is high" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Process-CPU-is-High" - "expr": | - sum by (cluster, instance, node) (es_process_cpu_percent) > 90 - "for": 1m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchDiskSpaceRunningLow - "annotations": - "message": "Cluster {{ $labels.cluster }} is predicted to be out of disk space within the next 6h." - "summary": "Cluster low on disk space" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Disk-Space-is-Running-Low" - "expr": | - sum(predict_linear(es_fs_path_available_bytes[6h], 6 * 3600)) < 0 - "for": 1h - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchHighFileDescriptorUsage - "annotations": - "message": "Cluster {{ $labels.cluster }} is predicted to be out of file descriptors within the next hour." - "summary": "Cluster low on file descriptors" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-FileDescriptor-Usage-is-high" - "expr": | - predict_linear(es_process_file_descriptors_max_number[1h], 3600) - predict_linear(es_process_file_descriptors_open_number[1h], 3600) < 0 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchOperatorCSVNotSuccessful - "annotations": - "message": "Elasticsearch Operator CSV has not reconciled succesfully." 
- "summary": "Elasticsearch Operator CSV Not Successful" - "expr": | - csv_succeeded{name =~ "elasticsearch-operator.*"} == 0 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Low Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node." - "summary": "Disk Low Watermark is predicted to be reached within next 6h." - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk High Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node." - "summary": "Disk High Watermark is predicted to be reached within next 6h." 
- "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Flood Stage Watermark is predicted to be reached within the next 6h at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark." - "summary": "Disk Flood Stage Watermark is predicted to be reached within next 6h." - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning \ No newline at end of file diff --git a/component/extracted_alerts/master/lokistack_prometheus_alerts.yaml b/component/extracted_alerts/master/lokistack_prometheus_alerts.yaml index 15cc424..799c280 100644 --- a/component/extracted_alerts/master/lokistack_prometheus_alerts.yaml +++ b/component/extracted_alerts/master/lokistack_prometheus_alerts.yaml @@ -175,6 +175,24 @@ groups: for: 15m labels: severity: warning + - alert: LokiDiscardedSamplesWarning + annotations: + message: |- + Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion. 
+ Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second. + summary: Loki is discarding samples during ingestion because they fail validation. + runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning" + expr: | + sum by(namespace, tenant, reason) ( + irate(loki_discarded_samples_total{ + reason!="rate_limited", + reason!="per_stream_rate_limit", + reason!="stream_limit"}[2m]) + ) + > 0 + for: 15m + labels: + severity: warning - alert: LokistackSchemaUpgradesRequired annotations: message: |- diff --git a/component/extracted_alerts/release-5.6/elasticsearch_operator_prometheus_alerts.yaml b/component/extracted_alerts/release-5.6/elasticsearch_operator_prometheus_alerts.yaml deleted file mode 100644 index 8f79010..0000000 --- a/component/extracted_alerts/release-5.6/elasticsearch_operator_prometheus_alerts.yaml +++ /dev/null @@ -1,224 +0,0 @@ ---- -"groups": -- "name": logging_elasticsearch.alerts - "rules": - - "alert": ElasticsearchClusterNotHealthy - "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been RED for at least 7m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet." - "summary": "Cluster health status is RED" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Red" - "expr": | - sum by (cluster) (es_cluster_status == 2) - "for": 7m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchClusterNotHealthy - "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Some shard replicas are not allocated." 
- "summary": "Cluster health status is YELLOW" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Yellow" - "expr": | - sum by (cluster) (es_cluster_status == 1) - "for": 20m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchWriteRequestsRejectionJumps - "annotations": - "message": "High Write Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed." - "summary": "High Write Rejection Ratio - {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Write-Requests-Rejection-Jumps" - "expr": | - round( writing:reject_ratio:rate2m * 100, 0.001 ) > 5 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Low Watermark Reached at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node." - "summary": "Disk Low Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk High Watermark Reached at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node." 
- "summary": "Disk High Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Flood Stage Watermark Reached at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark." - "summary": "Disk Flood Stage Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchJVMHeapUseHigh - "annotations": - "message": "JVM Heap usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." - "summary": "JVM Heap usage on the node is high" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-JVM-Heap-Use-is-High" - "expr": | - sum by (cluster, instance, node) (es_jvm_mem_heap_used_percent) > 75 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": AggregatedLoggingSystemCPUHigh - "annotations": - "message": "System CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." 
- "summary": "System CPU usage is high" - "runbook_url": "[[.RunbookBaseURL]]#Aggregated-Logging-System-CPU-is-High" - "expr": | - sum by (cluster, instance, node) (es_os_cpu_percent) > 90 - "for": 1m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchProcessCPUHigh - "annotations": - "message": "ES process CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." - "summary": "ES process CPU usage is high" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Process-CPU-is-High" - "expr": | - sum by (cluster, instance, node) (es_process_cpu_percent) > 90 - "for": 1m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchDiskSpaceRunningLow - "annotations": - "message": "Cluster {{ $labels.cluster }} is predicted to be out of disk space within the next 6h." - "summary": "Cluster low on disk space" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Disk-Space-is-Running-Low" - "expr": | - sum(predict_linear(es_fs_path_available_bytes[6h], 6 * 3600)) < 0 - "for": 1h - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchHighFileDescriptorUsage - "annotations": - "message": "Cluster {{ $labels.cluster }} is predicted to be out of file descriptors within the next hour." - "summary": "Cluster low on file descriptors" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-FileDescriptor-Usage-is-high" - "expr": | - predict_linear(es_process_file_descriptors_max_number[1h], 3600) - predict_linear(es_process_file_descriptors_open_number[1h], 3600) < 0 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchOperatorCSVNotSuccessful - "annotations": - "message": "Elasticsearch Operator CSV has not reconciled succesfully." 
- "summary": "Elasticsearch Operator CSV Not Successful" - "expr": | - csv_succeeded{name =~ "elasticsearch-operator.*"} == 0 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Low Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node." - "summary": "Disk Low Watermark is predicted to be reached within next 6h." - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk High Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node." - "summary": "Disk High Watermark is predicted to be reached within next 6h." 
- "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Flood Stage Watermark is predicted to be reached within the next 6h at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark." - "summary": "Disk Flood Stage Watermark is predicted to be reached within next 6h." - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning \ No newline at end of file diff --git a/component/extracted_alerts/release-5.6/fluentd_prometheus_alerts.yaml b/component/extracted_alerts/release-5.6/fluentd_prometheus_alerts.yaml deleted file mode 100644 index 7772c47..0000000 --- a/component/extracted_alerts/release-5.6/fluentd_prometheus_alerts.yaml +++ /dev/null @@ -1,64 +0,0 @@ - -"groups": -- "name": "logging_fluentd.alerts" - "rules": - - "alert": "FluentdNodeDown" - "annotations": - "message": "Prometheus could not scrape fluentd {{ $labels.container }} for more than 10m." 
- "summary": "Fluentd cannot be scraped" - "expr": | - up{job = "collector", container = "collector"} == 0 or absent(up{job="collector", container="collector"}) == 1 - "for": "10m" - "labels": - "service": "collector" - "severity": "critical" - namespace: "openshift-logging" - - "alert": "FluentdQueueLengthIncreasing" - "annotations": - "message": "For the last hour, fluentd {{ $labels.pod }} output '{{ $labels.plugin_id }}' average buffer queue length has increased continuously." - "summary": "Fluentd pod {{ $labels.pod }} is unable to keep up with traffic over time for forwarder output {{ $labels.plugin_id }}." - "expr": | - sum by (pod,plugin_id) ( 0 * (deriv(fluentd_output_status_emit_records[1m] offset 1h))) + on(pod,plugin_id) ( deriv(fluentd_output_status_buffer_queue_length[10m]) > 0 and delta(fluentd_output_status_buffer_queue_length[1h]) > 1 ) - "for": "1h" - "labels": - "service": "collector" - "severity": "Warning" - namespace: "openshift-logging" - - alert: FluentDHighErrorRate - annotations: - message: |- - {{ $value }}% of records have resulted in an error by fluentd {{ $labels.instance }}. - summary: FluentD output errors are high - expr: | - 100 * ( - sum by(instance)(rate(fluentd_output_status_num_errors[2m])) - / - sum by(instance)(rate(fluentd_output_status_emit_records[2m])) - ) > 10 - for: 15m - labels: - severity: warning - namespace: "openshift-logging" - - alert: FluentDVeryHighErrorRate - annotations: - message: |- - {{ $value }}% of records have resulted in an error by fluentd {{ $labels.instance }}. 
- summary: FluentD output errors are very high - expr: | - 100 * ( - sum by(instance)(rate(fluentd_output_status_num_errors[2m])) - / - sum by(instance)(rate(fluentd_output_status_emit_records[2m])) - ) > 25 - for: 15m - labels: - severity: critical - namespace: "openshift-logging" -- "name": "logging_clusterlogging_telemetry.rules" - "rules": - - "expr": | - sum by(cluster)(log_collected_bytes_total) - "record": "cluster:log_collected_bytes_total:sum" - - "expr": | - sum by(cluster)(log_logged_bytes_total) - "record": "cluster:log_logged_bytes_total:sum" diff --git a/component/extracted_alerts/release-5.6/lokistack_prometheus_alerts.yaml b/component/extracted_alerts/release-5.6/lokistack_prometheus_alerts.yaml deleted file mode 100644 index f378c49..0000000 --- a/component/extracted_alerts/release-5.6/lokistack_prometheus_alerts.yaml +++ /dev/null @@ -1,177 +0,0 @@ ---- -groups: -- name: logging_loki.alerts - rules: - - alert: LokiRequestErrors - annotations: - message: |- - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. - summary: "At least 10% of requests are responded by 5xx server errors." - runbook_url: "[[ .RunbookURL ]]#Loki-Request-Errors" - expr: | - sum( - job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code=~"5.."} - ) by (job, namespace, route) - / - sum( - job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m - ) by (job, namespace, route) - * 100 - > 10 - for: 15m - labels: - severity: critical - - alert: LokiStackWriteRequestErrors - annotations: - message: |- - {{ printf "%.2f" $value }}% of write requests from {{ $labels.job }} in {{ $labels.namespace }} are returned with server errors. - summary: "At least 10% of write requests to the lokistack-gateway are responded with 5xx server errors." 
- runbook_url: "[[ .RunbookURL ]]#LokiStack-Write-Request-Errors" - expr: | - sum( - code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code=~"5..", handler="push"} - ) by (job, namespace) - / - sum( - code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{handler="push"} - ) by (job, namespace) - * 100 - > 10 - for: 15m - labels: - severity: critical - - alert: LokiStackReadRequestErrors - annotations: - message: |- - {{ printf "%.2f" $value }}% of query requests from {{ $labels.job }} in {{ $labels.namespace }} are returned with server errors. - summary: "At least 10% of query requests to the lokistack-gateway are responded with 5xx server errors." - runbook_url: "[[ .RunbookURL ]]#LokiStack-Read-Request-Errors" - expr: | - sum( - code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code=~"5..", handler=~"query|query_range|label|labels|label_values"} - ) by (job, namespace) - / - sum( - code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{handler=~"query|query_range|label|labels|label_values"} - ) by (job, namespace) - * 100 - > 10 - for: 15m - labels: - severity: critical - - alert: LokiRequestPanics - annotations: - message: |- - {{ $labels.job }} is experiencing an increase of {{ $value }} panics. - summary: "A panic was triggered." - runbook_url: "[[ .RunbookURL ]]#Loki-Request-Panics" - expr: | - sum( - increase( - loki_panic_total[10m] - ) - ) by (job, namespace) - > 0 - labels: - severity: critical - - alert: LokiRequestLatency - annotations: - message: |- - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. - summary: "The 99th percentile is experiencing high latency (higher than 1 second)." 
- runbook_url: "[[ .RunbookURL ]]#Loki-Request-Latency" - expr: | - histogram_quantile(0.99, - sum( - irate( - loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[1m] - ) - ) by (job, le, namespace, route) - ) - > 1 - for: 15m - labels: - severity: critical - - alert: LokiTenantRateLimit - annotations: - message: |- - {{ $labels.job }} {{ $labels.route }} is experiencing 429 errors. - summary: "At least 10% of requests are responded with the rate limit error code." - runbook_url: "[[ .RunbookURL ]]#Loki-Tenant-Rate-Limit" - expr: | - sum( - job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code="429"} - ) by (job, namespace, route) - / - sum( - job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m - ) by (job, namespace, route) - * 100 - > 10 - for: 15m - labels: - severity: warning - - alert: LokiStorageSlowWrite - annotations: - message: |- - The storage path is experiencing slow write response rates. - summary: "The storage path is experiencing slow write response rates." - runbook_url: "[[ .RunbookURL ]]#Loki-Storage-Slow-Write" - expr: | - histogram_quantile(0.99, - sum( - job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{operation="WRITE"} - ) by (job, le, namespace) - ) - > 1 - for: 15m - labels: - severity: warning - - alert: LokiStorageSlowRead - annotations: - message: |- - The storage path is experiencing slow read response rates. - summary: "The storage path is experiencing slow read response rates." - runbook_url: "[[ .RunbookURL ]]#Loki-Storage-Slow-Read" - expr: | - histogram_quantile(0.99, - sum( - job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{operation="Shipper.Query"} - ) by (job, le, namespace) - ) - > 5 - for: 15m - labels: - severity: warning - - alert: LokiWritePathHighLoad - annotations: - message: |- - The write path is experiencing high load. 
- summary: "The write path is experiencing high load, causing backpressure storage flushing." - runbook_url: "[[ .RunbookURL ]]#Loki-Write-Path-High-Load" - expr: | - sum( - loki_ingester_wal_replay_flushing - ) by (job, namespace) - > 0 - for: 15m - labels: - severity: warning - - alert: LokiReadPathHighLoad - annotations: - message: |- - The read path is experiencing high load. - summary: "The read path has high volume of queries, causing longer response times." - runbook_url: "[[ .RunbookURL ]]#Loki-Read-Path-High-Load" - expr: | - histogram_quantile(0.99, - sum( - rate( - loki_logql_querystats_latency_seconds_bucket[5m] - ) - ) by (job, le, namespace) - ) - > 30 - for: 15m - labels: - severity: warning diff --git a/component/extracted_alerts/release-5.7/elasticsearch_operator_prometheus_alerts.yaml b/component/extracted_alerts/release-5.7/elasticsearch_operator_prometheus_alerts.yaml deleted file mode 100644 index 8f79010..0000000 --- a/component/extracted_alerts/release-5.7/elasticsearch_operator_prometheus_alerts.yaml +++ /dev/null @@ -1,224 +0,0 @@ ---- -"groups": -- "name": logging_elasticsearch.alerts - "rules": - - "alert": ElasticsearchClusterNotHealthy - "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been RED for at least 7m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet." - "summary": "Cluster health status is RED" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Red" - "expr": | - sum by (cluster) (es_cluster_status == 2) - "for": 7m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchClusterNotHealthy - "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Some shard replicas are not allocated." 
- "summary": "Cluster health status is YELLOW" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Yellow" - "expr": | - sum by (cluster) (es_cluster_status == 1) - "for": 20m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchWriteRequestsRejectionJumps - "annotations": - "message": "High Write Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed." - "summary": "High Write Rejection Ratio - {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Write-Requests-Rejection-Jumps" - "expr": | - round( writing:reject_ratio:rate2m * 100, 0.001 ) > 5 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Low Watermark Reached at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node." - "summary": "Disk Low Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk High Watermark Reached at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node." 
- "summary": "Disk High Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Flood Stage Watermark Reached at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark." - "summary": "Disk Flood Stage Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchJVMHeapUseHigh - "annotations": - "message": "JVM Heap usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." - "summary": "JVM Heap usage on the node is high" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-JVM-Heap-Use-is-High" - "expr": | - sum by (cluster, instance, node) (es_jvm_mem_heap_used_percent) > 75 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": AggregatedLoggingSystemCPUHigh - "annotations": - "message": "System CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." 
- "summary": "System CPU usage is high" - "runbook_url": "[[.RunbookBaseURL]]#Aggregated-Logging-System-CPU-is-High" - "expr": | - sum by (cluster, instance, node) (es_os_cpu_percent) > 90 - "for": 1m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchProcessCPUHigh - "annotations": - "message": "ES process CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." - "summary": "ES process CPU usage is high" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Process-CPU-is-High" - "expr": | - sum by (cluster, instance, node) (es_process_cpu_percent) > 90 - "for": 1m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchDiskSpaceRunningLow - "annotations": - "message": "Cluster {{ $labels.cluster }} is predicted to be out of disk space within the next 6h." - "summary": "Cluster low on disk space" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Disk-Space-is-Running-Low" - "expr": | - sum(predict_linear(es_fs_path_available_bytes[6h], 6 * 3600)) < 0 - "for": 1h - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchHighFileDescriptorUsage - "annotations": - "message": "Cluster {{ $labels.cluster }} is predicted to be out of file descriptors within the next hour." - "summary": "Cluster low on file descriptors" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-FileDescriptor-Usage-is-high" - "expr": | - predict_linear(es_process_file_descriptors_max_number[1h], 3600) - predict_linear(es_process_file_descriptors_open_number[1h], 3600) < 0 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchOperatorCSVNotSuccessful - "annotations": - "message": "Elasticsearch Operator CSV has not reconciled succesfully." 
- "summary": "Elasticsearch Operator CSV Not Successful" - "expr": | - csv_succeeded{name =~ "elasticsearch-operator.*"} == 0 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Low Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node." - "summary": "Disk Low Watermark is predicted to be reached within next 6h." - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk High Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node." - "summary": "Disk High Watermark is predicted to be reached within next 6h." 
- "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Flood Stage Watermark is predicted to be reached within the next 6h at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark." - "summary": "Disk Flood Stage Watermark is predicted to be reached within next 6h." - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning \ No newline at end of file diff --git a/component/extracted_alerts/release-5.7/fluentd_prometheus_alerts.yaml b/component/extracted_alerts/release-5.7/fluentd_prometheus_alerts.yaml deleted file mode 100644 index 7772c47..0000000 --- a/component/extracted_alerts/release-5.7/fluentd_prometheus_alerts.yaml +++ /dev/null @@ -1,64 +0,0 @@ - -"groups": -- "name": "logging_fluentd.alerts" - "rules": - - "alert": "FluentdNodeDown" - "annotations": - "message": "Prometheus could not scrape fluentd {{ $labels.container }} for more than 10m." 
- "summary": "Fluentd cannot be scraped" - "expr": | - up{job = "collector", container = "collector"} == 0 or absent(up{job="collector", container="collector"}) == 1 - "for": "10m" - "labels": - "service": "collector" - "severity": "critical" - namespace: "openshift-logging" - - "alert": "FluentdQueueLengthIncreasing" - "annotations": - "message": "For the last hour, fluentd {{ $labels.pod }} output '{{ $labels.plugin_id }}' average buffer queue length has increased continuously." - "summary": "Fluentd pod {{ $labels.pod }} is unable to keep up with traffic over time for forwarder output {{ $labels.plugin_id }}." - "expr": | - sum by (pod,plugin_id) ( 0 * (deriv(fluentd_output_status_emit_records[1m] offset 1h))) + on(pod,plugin_id) ( deriv(fluentd_output_status_buffer_queue_length[10m]) > 0 and delta(fluentd_output_status_buffer_queue_length[1h]) > 1 ) - "for": "1h" - "labels": - "service": "collector" - "severity": "Warning" - namespace: "openshift-logging" - - alert: FluentDHighErrorRate - annotations: - message: |- - {{ $value }}% of records have resulted in an error by fluentd {{ $labels.instance }}. - summary: FluentD output errors are high - expr: | - 100 * ( - sum by(instance)(rate(fluentd_output_status_num_errors[2m])) - / - sum by(instance)(rate(fluentd_output_status_emit_records[2m])) - ) > 10 - for: 15m - labels: - severity: warning - namespace: "openshift-logging" - - alert: FluentDVeryHighErrorRate - annotations: - message: |- - {{ $value }}% of records have resulted in an error by fluentd {{ $labels.instance }}. 
- summary: FluentD output errors are very high - expr: | - 100 * ( - sum by(instance)(rate(fluentd_output_status_num_errors[2m])) - / - sum by(instance)(rate(fluentd_output_status_emit_records[2m])) - ) > 25 - for: 15m - labels: - severity: critical - namespace: "openshift-logging" -- "name": "logging_clusterlogging_telemetry.rules" - "rules": - - "expr": | - sum by(cluster)(log_collected_bytes_total) - "record": "cluster:log_collected_bytes_total:sum" - - "expr": | - sum by(cluster)(log_logged_bytes_total) - "record": "cluster:log_logged_bytes_total:sum" diff --git a/component/extracted_alerts/release-5.8/collector_prometheus_alerts.yaml b/component/extracted_alerts/release-5.8/collector_prometheus_alerts.yaml deleted file mode 100644 index c4f1663..0000000 --- a/component/extracted_alerts/release-5.8/collector_prometheus_alerts.yaml +++ /dev/null @@ -1,71 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: collector - namespace: openshift-logging -spec: - groups: - - name: logging_collector.alerts - rules: - - alert: CollectorNodeDown - annotations: - message: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m." - summary: "Collector cannot be scraped" - expr: | - up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0 - for: 10m - labels: - service: collector - severity: critical - - alert: CollectorHighErrorRate - annotations: - message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component." 
- summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are high" - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.001 - for: 15m - labels: - service: collector - severity: critical - - alert: CollectorVeryHighErrorRate - annotations: - message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component." - summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are very high" - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.05 - for: 15m - labels: - service: collector - severity: critical - - alert: FluentdQueueLengthIncreasing - annotations: - message: "For the last hour, fluentd {{ $labels.pod }} output '{{ $labels.plugin_id }}' average buffer queue length has increased continuously." - summary: "Fluentd pod {{ $labels.pod }} is unable to keep up with traffic over time for forwarder output {{ $labels.plugin_id }}." 
- expr: | - sum by (pod,plugin_id) ( 0 * (deriv(fluentd_output_status_emit_records[1m] offset 1h))) + on(pod,plugin_id) ( deriv(fluentd_output_status_buffer_queue_length[10m]) > 0 and delta(fluentd_output_status_buffer_queue_length[1h]) > 1 ) - for: 1h - labels: - service: collector - severity: Warning - - name: logging_clusterlogging_telemetry.rules - rules: - - expr: | - sum by(cluster)(log_collected_bytes_total) - record: cluster:log_collected_bytes_total:sum - - expr: | - sum by(cluster)(log_logged_bytes_total) - record: cluster:log_logged_bytes_total:sum - - expr: | - sum by(pod, namespace, app_kubernetes_io_part_of)(rate(vector_component_errors_total[2m])) or sum by(pod, namespace, app_kubernetes_io_part_of)(rate(fluentd_output_status_num_errors[2m])) - record: collector:log_num_errors:sum_rate - - expr: | - sum by(pod, namespace, app_kubernetes_io_part_of)(rate(vector_component_received_events_total[2m])) or sum by(pod, namespace, app_kubernetes_io_part_of)(rate(fluentd_output_status_emit_records[2m])) - record: collector:received_events:sum_rate diff --git a/component/extracted_alerts/release-5.8/elasticsearch_operator_prometheus_alerts.yaml b/component/extracted_alerts/release-5.8/elasticsearch_operator_prometheus_alerts.yaml deleted file mode 100644 index 8f79010..0000000 --- a/component/extracted_alerts/release-5.8/elasticsearch_operator_prometheus_alerts.yaml +++ /dev/null @@ -1,224 +0,0 @@ ---- -"groups": -- "name": logging_elasticsearch.alerts - "rules": - - "alert": ElasticsearchClusterNotHealthy - "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been RED for at least 7m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet." 
- "summary": "Cluster health status is RED" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Red" - "expr": | - sum by (cluster) (es_cluster_status == 2) - "for": 7m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchClusterNotHealthy - "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Some shard replicas are not allocated." - "summary": "Cluster health status is YELLOW" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Yellow" - "expr": | - sum by (cluster) (es_cluster_status == 1) - "for": 20m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchWriteRequestsRejectionJumps - "annotations": - "message": "High Write Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed." - "summary": "High Write Rejection Ratio - {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Write-Requests-Rejection-Jumps" - "expr": | - round( writing:reject_ratio:rate2m * 100, 0.001 ) > 5 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Low Watermark Reached at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node." 
- "summary": "Disk Low Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk High Watermark Reached at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node." - "summary": "Disk High Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Flood Stage Watermark Reached at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark." 
- "summary": "Disk Flood Stage Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchJVMHeapUseHigh - "annotations": - "message": "JVM Heap usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." - "summary": "JVM Heap usage on the node is high" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-JVM-Heap-Use-is-High" - "expr": | - sum by (cluster, instance, node) (es_jvm_mem_heap_used_percent) > 75 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": AggregatedLoggingSystemCPUHigh - "annotations": - "message": "System CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." - "summary": "System CPU usage is high" - "runbook_url": "[[.RunbookBaseURL]]#Aggregated-Logging-System-CPU-is-High" - "expr": | - sum by (cluster, instance, node) (es_os_cpu_percent) > 90 - "for": 1m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchProcessCPUHigh - "annotations": - "message": "ES process CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." - "summary": "ES process CPU usage is high" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Process-CPU-is-High" - "expr": | - sum by (cluster, instance, node) (es_process_cpu_percent) > 90 - "for": 1m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchDiskSpaceRunningLow - "annotations": - "message": "Cluster {{ $labels.cluster }} is predicted to be out of disk space within the next 6h." 
- "summary": "Cluster low on disk space" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Disk-Space-is-Running-Low" - "expr": | - sum(predict_linear(es_fs_path_available_bytes[6h], 6 * 3600)) < 0 - "for": 1h - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchHighFileDescriptorUsage - "annotations": - "message": "Cluster {{ $labels.cluster }} is predicted to be out of file descriptors within the next hour." - "summary": "Cluster low on file descriptors" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-FileDescriptor-Usage-is-high" - "expr": | - predict_linear(es_process_file_descriptors_max_number[1h], 3600) - predict_linear(es_process_file_descriptors_open_number[1h], 3600) < 0 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchOperatorCSVNotSuccessful - "annotations": - "message": "Elasticsearch Operator CSV has not reconciled succesfully." - "summary": "Elasticsearch Operator CSV Not Successful" - "expr": | - csv_succeeded{name =~ "elasticsearch-operator.*"} == 0 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Low Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node." - "summary": "Disk Low Watermark is predicted to be reached within next 6h." 
- "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk High Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node." - "summary": "Disk High Watermark is predicted to be reached within next 6h." - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Flood Stage Watermark is predicted to be reached within the next 6h at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark." - "summary": "Disk Flood Stage Watermark is predicted to be reached within next 6h." 
- "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning \ No newline at end of file diff --git a/component/extracted_alerts/release-5.8/lokistack_prometheus_alerts.yaml b/component/extracted_alerts/release-5.8/lokistack_prometheus_alerts.yaml deleted file mode 100644 index f378c49..0000000 --- a/component/extracted_alerts/release-5.8/lokistack_prometheus_alerts.yaml +++ /dev/null @@ -1,177 +0,0 @@ ---- -groups: -- name: logging_loki.alerts - rules: - - alert: LokiRequestErrors - annotations: - message: |- - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. - summary: "At least 10% of requests are responded by 5xx server errors." - runbook_url: "[[ .RunbookURL ]]#Loki-Request-Errors" - expr: | - sum( - job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code=~"5.."} - ) by (job, namespace, route) - / - sum( - job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m - ) by (job, namespace, route) - * 100 - > 10 - for: 15m - labels: - severity: critical - - alert: LokiStackWriteRequestErrors - annotations: - message: |- - {{ printf "%.2f" $value }}% of write requests from {{ $labels.job }} in {{ $labels.namespace }} are returned with server errors. - summary: "At least 10% of write requests to the lokistack-gateway are responded with 5xx server errors." 
- runbook_url: "[[ .RunbookURL ]]#LokiStack-Write-Request-Errors" - expr: | - sum( - code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code=~"5..", handler="push"} - ) by (job, namespace) - / - sum( - code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{handler="push"} - ) by (job, namespace) - * 100 - > 10 - for: 15m - labels: - severity: critical - - alert: LokiStackReadRequestErrors - annotations: - message: |- - {{ printf "%.2f" $value }}% of query requests from {{ $labels.job }} in {{ $labels.namespace }} are returned with server errors. - summary: "At least 10% of query requests to the lokistack-gateway are responded with 5xx server errors." - runbook_url: "[[ .RunbookURL ]]#LokiStack-Read-Request-Errors" - expr: | - sum( - code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code=~"5..", handler=~"query|query_range|label|labels|label_values"} - ) by (job, namespace) - / - sum( - code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{handler=~"query|query_range|label|labels|label_values"} - ) by (job, namespace) - * 100 - > 10 - for: 15m - labels: - severity: critical - - alert: LokiRequestPanics - annotations: - message: |- - {{ $labels.job }} is experiencing an increase of {{ $value }} panics. - summary: "A panic was triggered." - runbook_url: "[[ .RunbookURL ]]#Loki-Request-Panics" - expr: | - sum( - increase( - loki_panic_total[10m] - ) - ) by (job, namespace) - > 0 - labels: - severity: critical - - alert: LokiRequestLatency - annotations: - message: |- - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. - summary: "The 99th percentile is experiencing high latency (higher than 1 second)." 
- runbook_url: "[[ .RunbookURL ]]#Loki-Request-Latency" - expr: | - histogram_quantile(0.99, - sum( - irate( - loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[1m] - ) - ) by (job, le, namespace, route) - ) - > 1 - for: 15m - labels: - severity: critical - - alert: LokiTenantRateLimit - annotations: - message: |- - {{ $labels.job }} {{ $labels.route }} is experiencing 429 errors. - summary: "At least 10% of requests are responded with the rate limit error code." - runbook_url: "[[ .RunbookURL ]]#Loki-Tenant-Rate-Limit" - expr: | - sum( - job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code="429"} - ) by (job, namespace, route) - / - sum( - job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m - ) by (job, namespace, route) - * 100 - > 10 - for: 15m - labels: - severity: warning - - alert: LokiStorageSlowWrite - annotations: - message: |- - The storage path is experiencing slow write response rates. - summary: "The storage path is experiencing slow write response rates." - runbook_url: "[[ .RunbookURL ]]#Loki-Storage-Slow-Write" - expr: | - histogram_quantile(0.99, - sum( - job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{operation="WRITE"} - ) by (job, le, namespace) - ) - > 1 - for: 15m - labels: - severity: warning - - alert: LokiStorageSlowRead - annotations: - message: |- - The storage path is experiencing slow read response rates. - summary: "The storage path is experiencing slow read response rates." - runbook_url: "[[ .RunbookURL ]]#Loki-Storage-Slow-Read" - expr: | - histogram_quantile(0.99, - sum( - job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{operation="Shipper.Query"} - ) by (job, le, namespace) - ) - > 5 - for: 15m - labels: - severity: warning - - alert: LokiWritePathHighLoad - annotations: - message: |- - The write path is experiencing high load. 
- summary: "The write path is experiencing high load, causing backpressure storage flushing." - runbook_url: "[[ .RunbookURL ]]#Loki-Write-Path-High-Load" - expr: | - sum( - loki_ingester_wal_replay_flushing - ) by (job, namespace) - > 0 - for: 15m - labels: - severity: warning - - alert: LokiReadPathHighLoad - annotations: - message: |- - The read path is experiencing high load. - summary: "The read path has high volume of queries, causing longer response times." - runbook_url: "[[ .RunbookURL ]]#Loki-Read-Path-High-Load" - expr: | - histogram_quantile(0.99, - sum( - rate( - loki_logql_querystats_latency_seconds_bucket[5m] - ) - ) by (job, le, namespace) - ) - > 30 - for: 15m - labels: - severity: warning diff --git a/component/extracted_alerts/release-5.9/collector_prometheus_alerts.yaml b/component/extracted_alerts/release-5.9/collector_prometheus_alerts.yaml deleted file mode 100644 index 30ee172..0000000 --- a/component/extracted_alerts/release-5.9/collector_prometheus_alerts.yaml +++ /dev/null @@ -1,115 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: collector - namespace: openshift-logging -spec: - groups: - - name: logging_collector.alerts - rules: - - alert: CollectorNodeDown - annotations: - message: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m." - summary: "Collector cannot be scraped" - expr: | - up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0 - for: 10m - labels: - service: collector - severity: critical - - alert: CollectorHighErrorRate - annotations: - message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component." 
- summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are high" - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.001 - for: 15m - labels: - service: collector - severity: critical - - alert: CollectorVeryHighErrorRate - annotations: - message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component." - summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are very high" - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.05 - for: 15m - labels: - service: collector - severity: critical - - alert: FluentdQueueLengthIncreasing - annotations: - message: "For the last hour, fluentd {{ $labels.pod }} output '{{ $labels.plugin_id }}' average buffer queue length has increased continuously." - summary: "Fluentd pod {{ $labels.pod }} is unable to keep up with traffic over time for forwarder output {{ $labels.plugin_id }}." - expr: | - sum by (pod,plugin_id) ( 0 * (deriv(fluentd_output_status_emit_records[1m] offset 1h))) + on(pod,plugin_id) ( deriv(fluentd_output_status_buffer_queue_length[10m]) > 0 and delta(fluentd_output_status_buffer_queue_length[1h]) > 1 ) - for: 1h - labels: - service: collector - severity: Warning - - alert: ElasticsearchDeprecation - annotations: - message: "The OpenShift Elasticsearch Operator is deprecated and is planned to be removed in a future release. Red Hat provides bug fixes and support for this feature during the current release lifecycle, but this feature no longer receives enhancements. As an alternative to using the OpenShift Elasticsearch Operator to manage the default log storage, you can use the Loki Operator." 
- summary: "Detected Elasticsearch as the in-cluster storage which is deprecated and will be removed in a future release." - expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0 - for: 5m - labels: - service: storage - severity: Warning - namespace: openshift-logging - - alert: FluentdDeprecation - annotations: - message: "Fluentd is deprecated and is planned to be removed in a future release. Red Hat provides bug fixes and support for this feature during the current release lifecycle, but this feature no longer receives enhancements. As an alternative to Fluentd, you can use Vector instead." - summary: "Detected Fluentd as the collector which is deprecated and will be removed in a future release." - expr: | - sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0 - for: 5m - labels: - service: collector - severity: Warning - namespace: openshift-logging - - alert: KibanaDeprecation - annotations: - message: "The Kibana web console is now deprecated and is planned to be removed in a future logging release." - summary: "Detected Kibana as the visualization which is deprecated and will be removed in a future release." 
- expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0 - for: 5m - labels: - service: visualization - severity: Warning - namespace: openshift-logging - - alert: DiskBufferUsage - annotations: - message: "Collectors potentially consuming too much node disk, {{ $value }}% " - summary: "Detected consuming too much node disk on $labels.hostname host" - expr: | - (label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink', buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)') - / on(instance) group_left() sum by(instance) (node_filesystem_size_bytes{mountpoint='/var'})) * 100 > 15 - for: 5m - labels: - service: collector - severity: Warning - - name: logging_clusterlogging_telemetry.rules - rules: - - expr: | - sum by(cluster)(log_collected_bytes_total) - record: cluster:log_collected_bytes_total:sum - - expr: | - sum by(cluster)(log_logged_bytes_total) - record: cluster:log_logged_bytes_total:sum - - expr: | - sum by(pod, namespace, app_kubernetes_io_part_of)(rate(vector_component_errors_total[2m])) or sum by(pod, namespace, app_kubernetes_io_part_of)(rate(fluentd_output_status_num_errors[2m])) - record: collector:log_num_errors:sum_rate - - expr: | - sum by(pod, namespace, app_kubernetes_io_part_of)(rate(vector_component_received_events_total[2m])) or sum by(pod, namespace, app_kubernetes_io_part_of)(rate(fluentd_output_status_emit_records[2m])) - record: collector:received_events:sum_rate diff --git a/component/extracted_alerts/release-5.9/elasticsearch_operator_prometheus_alerts.yaml b/component/extracted_alerts/release-5.9/elasticsearch_operator_prometheus_alerts.yaml deleted file mode 100644 index 8f79010..0000000 --- a/component/extracted_alerts/release-5.9/elasticsearch_operator_prometheus_alerts.yaml +++ /dev/null @@ -1,224 +0,0 @@ ---- -"groups": -- "name": logging_elasticsearch.alerts - "rules": - - "alert": ElasticsearchClusterNotHealthy - "annotations": - "message": "Cluster {{ $labels.cluster 
}} health status has been RED for at least 7m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet." - "summary": "Cluster health status is RED" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Red" - "expr": | - sum by (cluster) (es_cluster_status == 2) - "for": 7m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchClusterNotHealthy - "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Some shard replicas are not allocated." - "summary": "Cluster health status is YELLOW" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Yellow" - "expr": | - sum by (cluster) (es_cluster_status == 1) - "for": 20m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchWriteRequestsRejectionJumps - "annotations": - "message": "High Write Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed." - "summary": "High Write Rejection Ratio - {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Write-Requests-Rejection-Jumps" - "expr": | - round( writing:reject_ratio:rate2m * 100, 0.001 ) > 5 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Low Watermark Reached at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node." 
- "summary": "Disk Low Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk High Watermark Reached at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node." - "summary": "Disk High Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Flood Stage Watermark Reached at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark." 
- "summary": "Disk Flood Stage Watermark Reached - disk saturation is {{ $value }}%" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - es_fs_path_available_bytes / - es_fs_path_total_bytes - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct - "for": 5m - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchJVMHeapUseHigh - "annotations": - "message": "JVM Heap usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." - "summary": "JVM Heap usage on the node is high" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-JVM-Heap-Use-is-High" - "expr": | - sum by (cluster, instance, node) (es_jvm_mem_heap_used_percent) > 75 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": AggregatedLoggingSystemCPUHigh - "annotations": - "message": "System CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." - "summary": "System CPU usage is high" - "runbook_url": "[[.RunbookBaseURL]]#Aggregated-Logging-System-CPU-is-High" - "expr": | - sum by (cluster, instance, node) (es_os_cpu_percent) > 90 - "for": 1m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchProcessCPUHigh - "annotations": - "message": "ES process CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." - "summary": "ES process CPU usage is high" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Process-CPU-is-High" - "expr": | - sum by (cluster, instance, node) (es_process_cpu_percent) > 90 - "for": 1m - "labels": - "namespace": openshift-logging - "severity": info - - - "alert": ElasticsearchDiskSpaceRunningLow - "annotations": - "message": "Cluster {{ $labels.cluster }} is predicted to be out of disk space within the next 6h." 
- "summary": "Cluster low on disk space" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Disk-Space-is-Running-Low" - "expr": | - sum(predict_linear(es_fs_path_available_bytes[6h], 6 * 3600)) < 0 - "for": 1h - "labels": - "namespace": openshift-logging - "severity": critical - - - "alert": ElasticsearchHighFileDescriptorUsage - "annotations": - "message": "Cluster {{ $labels.cluster }} is predicted to be out of file descriptors within the next hour." - "summary": "Cluster low on file descriptors" - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-FileDescriptor-Usage-is-high" - "expr": | - predict_linear(es_process_file_descriptors_max_number[1h], 3600) - predict_linear(es_process_file_descriptors_open_number[1h], 3600) < 0 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchOperatorCSVNotSuccessful - "annotations": - "message": "Elasticsearch Operator CSV has not reconciled succesfully." - "summary": "Elasticsearch Operator CSV Not Successful" - "expr": | - csv_succeeded{name =~ "elasticsearch-operator.*"} == 0 - "for": 10m - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Low Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node." - "summary": "Disk Low Watermark is predicted to be reached within next 6h." 
- "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk High Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node." - "summary": "Disk High Watermark is predicted to be reached within next 6h." - "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning - - - "alert": ElasticsearchNodeDiskWatermarkReached - "annotations": - "message": "Disk Flood Stage Watermark is predicted to be reached within the next 6h at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark." - "summary": "Disk Flood Stage Watermark is predicted to be reached within next 6h." 
- "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached" - "expr": | - sum by (instance, pod) ( - round( - (1 - ( - predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / - predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) - ) - ) * 100, 0.001) - ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct - "for": 1h - "labels": - "namespace": openshift-logging - "severity": warning \ No newline at end of file diff --git a/component/extracted_alerts/release-6.0/collector_prometheus_alerts.yaml b/component/extracted_alerts/release-6.0/collector_prometheus_alerts.yaml new file mode 100644 index 0000000..2d5cdf8 --- /dev/null +++ b/component/extracted_alerts/release-6.0/collector_prometheus_alerts.yaml @@ -0,0 +1,45 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: collector + namespace: openshift-logging +spec: + groups: + - name: logging_collector.alerts + rules: + - alert: CollectorNodeDown + annotations: + description: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m." 
+ summary: "Collector cannot be scraped" + expr: | + up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0 + for: 10m + labels: + service: collector + severity: critical + - alert: DiskBufferUsage + annotations: + description: "Collectors potentially consuming too much node disk, {{ $value }}% " + summary: "Detected consuming too much node disk on $labels.hostname host" + expr: | + (label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink', buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)') + / on(instance) group_left() sum by(instance) (node_filesystem_size_bytes{mountpoint='/var'})) * 100 > 15 + for: 5m + labels: + service: collector + severity: Warning + - name: logging_clusterlogging_telemetry.rules + rules: + - expr: | + sum by(cluster)(log_logged_bytes_total) + record: cluster:log_logged_bytes_total:sum + - expr: | + sum by(pod, namespace, app_kubernetes_io_instance)(rate(vector_component_errors_total[2m])) + record: collector:log_num_errors:sum_rate + - expr: | + sum by(pod, namespace, app_kubernetes_io_instance)(rate(vector_component_received_events_total[2m])) + record: collector:received_events:sum_rate + + + + diff --git a/component/extracted_alerts/release-5.9/lokistack_prometheus_alerts.yaml b/component/extracted_alerts/release-6.0/lokistack_prometheus_alerts.yaml similarity index 89% rename from component/extracted_alerts/release-5.9/lokistack_prometheus_alerts.yaml rename to component/extracted_alerts/release-6.0/lokistack_prometheus_alerts.yaml index 15cc424..799c280 100644 --- a/component/extracted_alerts/release-5.9/lokistack_prometheus_alerts.yaml +++ b/component/extracted_alerts/release-6.0/lokistack_prometheus_alerts.yaml @@ -175,6 +175,24 @@ groups: for: 15m labels: severity: warning + - alert: LokiDiscardedSamplesWarning + annotations: + message: |- + Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion. 
+ Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second. + summary: Loki is discarding samples during ingestion because they fail validation. + runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning" + expr: | + sum by(namespace, tenant, reason) ( + irate(loki_discarded_samples_total{ + reason!="rate_limited", + reason!="per_stream_rate_limit", + reason!="stream_limit"}[2m]) + ) + > 0 + for: 15m + labels: + severity: warning - alert: LokistackSchemaUpgradesRequired annotations: message: |- diff --git a/component/extracted_alerts/release-6.1/collector_prometheus_alerts.yaml b/component/extracted_alerts/release-6.1/collector_prometheus_alerts.yaml new file mode 100644 index 0000000..2d5cdf8 --- /dev/null +++ b/component/extracted_alerts/release-6.1/collector_prometheus_alerts.yaml @@ -0,0 +1,45 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: collector + namespace: openshift-logging +spec: + groups: + - name: logging_collector.alerts + rules: + - alert: CollectorNodeDown + annotations: + description: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m." 
+ summary: "Collector cannot be scraped" + expr: | + up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0 + for: 10m + labels: + service: collector + severity: critical + - alert: DiskBufferUsage + annotations: + description: "Collectors potentially consuming too much node disk, {{ $value }}% " + summary: "Detected consuming too much node disk on $labels.hostname host" + expr: | + (label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink', buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)') + / on(instance) group_left() sum by(instance) (node_filesystem_size_bytes{mountpoint='/var'})) * 100 > 15 + for: 5m + labels: + service: collector + severity: Warning + - name: logging_clusterlogging_telemetry.rules + rules: + - expr: | + sum by(cluster)(log_logged_bytes_total) + record: cluster:log_logged_bytes_total:sum + - expr: | + sum by(pod, namespace, app_kubernetes_io_instance)(rate(vector_component_errors_total[2m])) + record: collector:log_num_errors:sum_rate + - expr: | + sum by(pod, namespace, app_kubernetes_io_instance)(rate(vector_component_received_events_total[2m])) + record: collector:received_events:sum_rate + + + + diff --git a/component/extracted_alerts/release-5.7/lokistack_prometheus_alerts.yaml b/component/extracted_alerts/release-6.1/lokistack_prometheus_alerts.yaml similarity index 79% rename from component/extracted_alerts/release-5.7/lokistack_prometheus_alerts.yaml rename to component/extracted_alerts/release-6.1/lokistack_prometheus_alerts.yaml index f378c49..799c280 100644 --- a/component/extracted_alerts/release-5.7/lokistack_prometheus_alerts.yaml +++ b/component/extracted_alerts/release-6.1/lokistack_prometheus_alerts.yaml @@ -175,3 +175,37 @@ groups: for: 15m labels: severity: warning + - alert: LokiDiscardedSamplesWarning + annotations: + message: |- + Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion. 
+ Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second. + summary: Loki is discarding samples during ingestion because they fail validation. + runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning" + expr: | + sum by(namespace, tenant, reason) ( + irate(loki_discarded_samples_total{ + reason!="rate_limited", + reason!="per_stream_rate_limit", + reason!="stream_limit"}[2m]) + ) + > 0 + for: 15m + labels: + severity: warning + - alert: LokistackSchemaUpgradesRequired + annotations: + message: |- + The LokiStack "{{ $labels.stack_name }}" in namespace "{{ $labels.stack_namespace }}" is using a storage schema + configuration that does not contain the latest schema version. It is recommended to update the schema + configuration to update the schema version to the latest version in the future. + summary: "One or more of the deployed LokiStacks contains an outdated storage schema configuration." + runbook_url: "[[ .RunbookURL ]]#Lokistack-Schema-Upgrades-Required" + expr: | + sum ( + lokistack_status_condition{reason="StorageNeedsSchemaUpdate",status="true"} + ) by (stack_namespace, stack_name) + > 0 + for: 1m + labels: + severity: warning diff --git a/component/log_forwarder.libsonnet b/component/log_forwarder.libsonnet new file mode 100644 index 0000000..c57d25f --- /dev/null +++ b/component/log_forwarder.libsonnet @@ -0,0 +1,83 @@ +local com = import 'lib/commodore.libjsonnet'; +local kap = import 'lib/kapitan.libjsonnet'; +local kube = import 'lib/kube.libjsonnet'; + +local inv = kap.inventory(); +local params = inv.parameters.openshift4_logging; +local lokiEnabled = params.components.lokistack.enabled; +local forwarderEnabled = lokiEnabled || std.length(params.clusterLogForwarder) > 0; + +// Make sure the default output is added to the pipelines `outputRefs`, +// if the logging stack is not disabled. 
+local pipelineOutputRefs(pipeline) = + local default = if lokiEnabled then [ 'default' ] else []; + std.get(pipeline, 'forwarders', []) + default; + +// clusterLogForwarderSpec: +// Base spec with the application/infrastructure pipelines (rendered if LokiStack is enabled or the pipeline is configured) and the audit pipeline (rendered only if configured), overlaid with `params.clusterLogForwarder`. +local clusterLogForwarderSpec = { + local appsPipeline = std.get(std.get(params.clusterLogForwarder, 'pipelines', {}), 'application-logs', {}), + local infraPipeline = std.get(std.get(params.clusterLogForwarder, 'pipelines', {}), 'infrastructure-logs', {}), + local auditPipeline = std.get(std.get(params.clusterLogForwarder, 'pipelines', {}), 'audit-logs', {}), + + inputs: {}, + outputs: {}, + pipelines: { + [if lokiEnabled || std.length(appsPipeline) > 0 then 'application-logs']: { + inputRefs: [ 'application' ], + outputRefs: pipelineOutputRefs(appsPipeline), + }, + [if lokiEnabled || std.length(infraPipeline) > 0 then 'infrastructure-logs']: { + inputRefs: [ 'infrastructure' ], + outputRefs: pipelineOutputRefs(infraPipeline), + }, + [if std.length(auditPipeline) > 0 then 'audit-logs']: { + inputRefs: [ 'audit' ], // no outputRefs set here (unlike the pipelines above) + }, + }, +} + com.makeMergeable(params.clusterLogForwarder); + +// Convert the name-keyed `inputs`, `outputs` and `pipelines` objects into the arrays used in the ClusterLogForwarder resource. +local unfoldSpecs(specs) = { + // Name-keyed objects become arrays of `{ name: <key> } + <value>`; empty objects are omitted. + [if std.length(specs.inputs) > 0 then 'inputs']: [ + { name: name } + specs.inputs[name] + for name in std.objectFields(specs.inputs) + ], + [if std.length(specs.outputs) > 0 then 'outputs']: [ + { name: name } + specs.outputs[name] + for name in std.objectFields(specs.outputs) + ], + [if std.length(specs.pipelines) > 0 then 'pipelines']: [ + { name: name } + specs.pipelines[name] + for name in std.objectFields(specs.pipelines) + ], +} + { + // Copy every other top-level key of `specs` unchanged. + [key]: specs[key] + for key in std.objectFields(specs) + if !std.member([ 'inputs', 'outputs', 'pipelines' ], key) +}; + +// ClusterLogForwarder: +// Create definitive ClusterLogForwarder resource from specs.
+local clusterLogForwarder = kube._Object('observability.openshift.io/v1', 'ClusterLogForwarder', 'instance') { + metadata+: { + annotations+: { + 'argocd.argoproj.io/sync-options': 'SkipDryRunOnMissingResource=true', + }, + namespace: params.namespace, + }, + spec: unfoldSpecs(clusterLogForwarderSpec), +}; + +// Define outputs below +if forwarderEnabled then + { + '40_log_forwarder': clusterLogForwarder, + } +else + std.trace( + 'Log forwarding disabled, not deploying ClusterLogForwarder', + {} + ) diff --git a/component/loki.libsonnet b/component/log_lokistack.libsonnet similarity index 90% rename from component/loki.libsonnet rename to component/log_lokistack.libsonnet index 09e406f..a06bca5 100644 --- a/component/loki.libsonnet +++ b/component/log_lokistack.libsonnet @@ -3,7 +3,6 @@ local com = import 'lib/commodore.libjsonnet'; local kap = import 'lib/kapitan.libjsonnet'; local kube = import 'lib/kube.libjsonnet'; local po = import 'lib/patch-operator.libsonnet'; -local workaround = import 'loki_workaround.libsonnet'; // The hiera parameters for the component local inv = kap.inventory(); @@ -50,6 +49,7 @@ local lokistack_spec = { local lokistack = kube._Object('loki.grafana.com/v1', 'LokiStack', 'loki') { metadata+: { annotations+: { + 'argocd.argoproj.io/sync-wave': '-50', // Allow ArgoCD to do the dry run when the CRD doesn't exist yet 'argocd.argoproj.io/sync-options': 'SkipDryRunOnMissingResource=true', }, @@ -120,13 +120,10 @@ local aggregate_loki_log_access = kube.ClusterRole('syn:loki:cluster-reader') { // Define outputs below if loki.enabled then { - '50_loki_stack': lokistack, - '50_loki_logstore': logstore, - '50_loki_netpol': [ netpol_viewplugin, netpol_lokigateway ], - '50_loki_rbac': [ aggregate_loki_log_access ], - '50_loki_operator_metrics_token': workaround.missing_metrics_token, - '50_loki_ingester_fix': workaround.ingester_stuck, - '50_loki_logreader_fix': workaround.app_logs_reader, + '30_loki_stack': lokistack, + '30_loki_logstore': 
logstore, + '30_loki_netpol': [ netpol_viewplugin, netpol_lokigateway ], + '30_loki_rbac': [ aggregate_loki_log_access ], } else std.trace( diff --git a/component/logmetrics.libsonnet b/component/log_metricsexporter.libsonnet similarity index 85% rename from component/logmetrics.libsonnet rename to component/log_metricsexporter.libsonnet index 78db4f4..117eee6 100644 --- a/component/logmetrics.libsonnet +++ b/component/log_metricsexporter.libsonnet @@ -5,6 +5,11 @@ local inv = kap.inventory(); local logmetrics = inv.parameters.openshift4_logging.components.logmetrics; local logMetricExporter = kube._Object('logging.openshift.io/v1alpha1', 'LogFileMetricExporter', 'instance') { + metadata+: { + annotations+: { + 'argocd.argoproj.io/sync-wave': '-50', + }, + }, spec: logmetrics.spec, }; diff --git a/component/loki_workaround.libsonnet b/component/log_workaround.libsonnet similarity index 92% rename from component/loki_workaround.libsonnet rename to component/log_workaround.libsonnet index 2f2e0f0..ed9772a 100644 --- a/component/loki_workaround.libsonnet +++ b/component/log_workaround.libsonnet @@ -5,6 +5,7 @@ local kube = import 'lib/kube.libjsonnet'; // The hiera parameters for the component local inv = kap.inventory(); local params = inv.parameters.openshift4_logging; +local lokiEnabled = params.components.lokistack.enabled; // Generate missing metrics SA token for Loki Operator. 
@@ -146,8 +147,15 @@ local ingester_stuck = [ }, ]; -{ - missing_metrics_token: [ missing_metrics_token ], - ingester_stuck: ingester_stuck, - app_logs_reader: app_logs_reader, -} +// Define outputs below (the workaround manifests are only rendered when LokiStack is enabled) +if lokiEnabled then + { + '50_fix_missing_metrics_token': missing_metrics_token, + '50_fix_ingester_stuck': ingester_stuck, + '50_fix_app_logs_reader': app_logs_reader, + } +else + std.trace( + 'Lokistack disabled, not deploying Lokistack workarounds', + {} + ) diff --git a/component/main.jsonnet b/component/main.jsonnet index 84edfa1..146d87c 100644 --- a/component/main.jsonnet +++ b/component/main.jsonnet @@ -2,13 +2,10 @@ local com = import 'lib/commodore.libjsonnet'; local kap = import 'lib/kapitan.libjsonnet'; local kube = import 'lib/kube.libjsonnet'; local operatorlib = import 'lib/openshift4-operators.libsonnet'; -local utils = import 'utils.libsonnet'; local inv = kap.inventory(); local params = inv.parameters.openshift4_logging; - -local deployLokistack = params.components.lokistack.enabled; -local deployElasticsearch = params.components.elasticsearch.enabled; +local lokiEnabled = params.components.lokistack.enabled; // Namespace @@ -16,6 +13,7 @@ local namespace = kube.Namespace(params.namespace) { metadata+: { annotations+: { 'openshift.io/node-selector': '', + 'argocd.argoproj.io/sync-wave': '-100', }, labels+: { 'openshift.io/cluster-monitoring': 'true', @@ -27,13 +25,11 @@ local namespace = kube.Namespace(params.namespace) { local operatorGroup = operatorlib.OperatorGroup('cluster-logging') { metadata+: { + annotations+: { + 'argocd.argoproj.io/sync-wave': '-90', + }, namespace: params.namespace, }, - spec: { - [if !params.namespaceLogForwarderEnabled then 'targetNamespaces']: [ - params.namespace, - ], - }, }; // Subscriptions @@ -44,6 +40,11 @@ local logging = operatorlib.namespacedSubscription( params.channel, 'redhat-operators' ) { + metadata+: { + annotations+: { + 'argocd.argoproj.io/sync-wave': '-80', + }, + }, spec+: { config+: { resources:
params.operatorResources.clusterLogging, @@ -51,29 +52,19 @@ local logging = operatorlib.namespacedSubscription( }, }; -local lokistack = if deployLokistack then operatorlib.managedSubscription( +local lokistack = if lokiEnabled then operatorlib.managedSubscription( 'openshift-operators-redhat', 'loki-operator', params.channel ) { - spec+: { - config+: { - resources: params.operatorResources.lokistack, + metadata+: { + annotations+: { + 'argocd.argoproj.io/sync-wave': '-80', }, }, -}; - -// With version 5.9 of the logging stack, elasticsearch is deprecated, -// this will clamp elasticsearch-operator subscription to stable-5.8. -local esChannel = if utils.isVersion59 then 'stable-5.8' else params.channel; -local elasticsearch = if deployElasticsearch then operatorlib.managedSubscription( - 'openshift-operators-redhat', - 'elasticsearch-operator', - esChannel -) { spec+: { config+: { - resources: params.operatorResources.elasticsearch, + resources: params.operatorResources.lokistack, }, }, }; @@ -81,7 +72,6 @@ local elasticsearch = if deployElasticsearch then operatorlib.managedSubscriptio local subscriptions = std.filter(function(it) it != null, [ logging, lokistack, - elasticsearch, ]); local secrets = com.generateResources(params.secrets, kube.Secret); @@ -93,9 +83,8 @@ local secrets = com.generateResources(params.secrets, kube.Secret); '20_subscriptions': subscriptions, [if std.length(params.secrets) > 0 then '99_secrets']: secrets, } -+ (import 'config_logging.libsonnet') -+ (import 'config_forwarding.libsonnet') -+ (import 'loki.libsonnet') -+ (import 'elasticsearch.libsonnet') ++ (import 'log_lokistack.libsonnet') ++ (import 'log_forwarder.libsonnet') ++ (import 'log_metricsexporter.libsonnet') ++ (import 'log_workaround.libsonnet') + (import 'alertrules.libsonnet') -+ (import 'logmetrics.libsonnet') diff --git a/component/utils.libsonnet b/component/utils.libsonnet deleted file mode 100644 index 2a6ccad..0000000 --- a/component/utils.libsonnet +++ /dev/null 
@@ -1,33 +0,0 @@ -local kap = import 'lib/kapitan.libjsonnet'; -local kube = import 'lib/kube.libjsonnet'; - -local inv = kap.inventory(); -local params = inv.parameters.openshift4_logging; - -local isVersion58 = - local major = std.split(params.version, '.')[0]; - local minor = std.split(params.version, '.')[1]; - if major == 'master' then true - else if std.parseInt(major) >= 6 then true - else if std.parseInt(major) == 5 && std.parseInt(minor) >= 8 then true - else false; - -local isVersion59 = - local major = std.split(params.version, '.')[0]; - local minor = std.split(params.version, '.')[1]; - if major == 'master' then true - else if std.parseInt(major) >= 6 then true - else if std.parseInt(major) == 5 && std.parseInt(minor) >= 9 then true - else false; - -local namespacedName(name) = { - local namespaced = std.splitLimit(name, '/', 1), - namespace: if std.length(namespaced) > 1 then namespaced[0] else params.namespace, - name: if std.length(namespaced) > 1 then namespaced[1] else namespaced[0], -}; - -{ - isVersion58: isVersion58, - isVersion59: isVersion59, - namespacedName: namespacedName, -} diff --git a/docs/modules/ROOT/pages/runbooks/SYN_ElasticsearchExpectNodeToReachDiskWatermark.adoc b/docs/modules/ROOT/pages/runbooks/SYN_ElasticsearchExpectNodeToReachDiskWatermark.adoc deleted file mode 100644 index 36e9424..0000000 --- a/docs/modules/ROOT/pages/runbooks/SYN_ElasticsearchExpectNodeToReachDiskWatermark.adoc +++ /dev/null @@ -1,14 +0,0 @@ -= Alert rule: SYN_ElasticsearchExpectNodeToReachDiskWatermark - -include::partial$runbooks/contribution_note.adoc[] - -== icon:glasses[] Overview - -This alert fires when the Elasticsearch node storage utilization is expected to reach the disk low watermark. -The default watermark is 85%. -The node will become read-only at the watermark. -To resolve this alert, unused data should be deleted or the https://kb.vshn.ch/oc4/how-tos/logging/increase-elasticsearch-storage-size.html[disk size must be increased]. 
- -== icon:bug[] Steps for debugging - -// Add detailed steps to debug and resolve the issue diff --git a/lib/openshift4-logging.libsonnet b/lib/openshift4-logging.libsonnet deleted file mode 100644 index dd2a061..0000000 --- a/lib/openshift4-logging.libsonnet +++ /dev/null @@ -1,18 +0,0 @@ -local kube = import 'lib/kube.libjsonnet'; - -local ClusterLogging(namespace, name) = kube._Object('logging.openshift.io/v1', 'ClusterLogging', name) { - metadata+: { - namespace: namespace, - }, -}; - -local ClusterLogForwarder(namespace, name) = kube._Object('logging.openshift.io/v1', 'ClusterLogForwarder', name) { - metadata+: { - namespace: namespace, - }, -}; - -{ - ClusterLogging: ClusterLogging, - ClusterLogForwarder: ClusterLogForwarder, -} diff --git a/tests/forwardingonly.yml b/tests/forwardingonly.yml index cfcbe5d..1d72626 100644 --- a/tests/forwardingonly.yml +++ b/tests/forwardingonly.yml @@ -26,5 +26,3 @@ parameters: components: lokistack: enabled: false - elasticsearch: - enabled: false diff --git a/tests/golden/defaults/openshift4-logging/apps/openshift4-logging.yaml b/tests/golden/defaults/openshift4-logging/apps/openshift4-logging.yaml index e69de29..6825b97 100644 --- a/tests/golden/defaults/openshift4-logging/apps/openshift4-logging.yaml +++ b/tests/golden/defaults/openshift4-logging/apps/openshift4-logging.yaml @@ -0,0 +1,4 @@ +spec: + syncPolicy: + syncOptions: + - ServerSideApply=true diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/00_namespace.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/00_namespace.yaml index 1b27cf9..4c91e3c 100644 --- a/tests/golden/defaults/openshift4-logging/openshift4-logging/00_namespace.yaml +++ b/tests/golden/defaults/openshift4-logging/openshift4-logging/00_namespace.yaml @@ -2,6 +2,7 @@ apiVersion: v1 kind: Namespace metadata: annotations: + argocd.argoproj.io/sync-wave: '-100' openshift.io/node-selector: '' labels: name: openshift-logging diff --git 
a/tests/golden/defaults/openshift4-logging/openshift4-logging/10_operator_group.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/10_operator_group.yaml index ff11675..52f645d 100644 --- a/tests/golden/defaults/openshift4-logging/openshift4-logging/10_operator_group.yaml +++ b/tests/golden/defaults/openshift4-logging/openshift4-logging/10_operator_group.yaml @@ -1,11 +1,9 @@ apiVersion: operators.coreos.com/v1 kind: OperatorGroup metadata: - annotations: {} + annotations: + argocd.argoproj.io/sync-wave: '-90' labels: name: cluster-logging name: cluster-logging namespace: openshift-logging -spec: - targetNamespaces: - - openshift-logging diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/20_subscriptions.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/20_subscriptions.yaml index 1f0b7ad..5f550db 100644 --- a/tests/golden/defaults/openshift4-logging/openshift4-logging/20_subscriptions.yaml +++ b/tests/golden/defaults/openshift4-logging/openshift4-logging/20_subscriptions.yaml @@ -1,13 +1,14 @@ apiVersion: operators.coreos.com/v1alpha1 kind: Subscription metadata: - annotations: {} + annotations: + argocd.argoproj.io/sync-wave: '-80' labels: name: cluster-logging name: cluster-logging namespace: openshift-logging spec: - channel: stable-5.9 + channel: stable-6.0 config: resources: limits: @@ -23,13 +24,14 @@ spec: apiVersion: operators.coreos.com/v1alpha1 kind: Subscription metadata: - annotations: {} + annotations: + argocd.argoproj.io/sync-wave: '-80' labels: name: loki-operator name: loki-operator namespace: openshift-operators-redhat spec: - channel: stable-5.9 + channel: stable-6.0 config: resources: limits: diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/30_cluster_logging.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/30_cluster_logging.yaml deleted file mode 100644 index 307f0ca..0000000 --- 
a/tests/golden/defaults/openshift4-logging/openshift4-logging/30_cluster_logging.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: logging.openshift.io/v1 -kind: ClusterLogging -metadata: - annotations: - argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true - labels: - name: instance - name: instance - namespace: openshift-logging -spec: - collection: - type: vector - logStore: - lokistack: - name: loki - type: lokistack - managementState: Managed diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_logstore.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/30_loki_logstore.yaml similarity index 100% rename from tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_logstore.yaml rename to tests/golden/defaults/openshift4-logging/openshift4-logging/30_loki_logstore.yaml diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_netpol.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/30_loki_netpol.yaml similarity index 100% rename from tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_netpol.yaml rename to tests/golden/defaults/openshift4-logging/openshift4-logging/30_loki_netpol.yaml diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_rbac.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/30_loki_rbac.yaml similarity index 100% rename from tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_rbac.yaml rename to tests/golden/defaults/openshift4-logging/openshift4-logging/30_loki_rbac.yaml diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_stack.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/30_loki_stack.yaml similarity index 97% rename from tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_stack.yaml rename to tests/golden/defaults/openshift4-logging/openshift4-logging/30_loki_stack.yaml index 259068c..f859742 
100644 --- a/tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_stack.yaml +++ b/tests/golden/defaults/openshift4-logging/openshift4-logging/30_loki_stack.yaml @@ -3,6 +3,7 @@ kind: LokiStack metadata: annotations: argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true + argocd.argoproj.io/sync-wave: '-50' labels: name: loki name: loki diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/40_log_forwarder.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/40_log_forwarder.yaml new file mode 100644 index 0000000..ae403ba --- /dev/null +++ b/tests/golden/defaults/openshift4-logging/openshift4-logging/40_log_forwarder.yaml @@ -0,0 +1,21 @@ +apiVersion: observability.openshift.io/v1 +kind: ClusterLogForwarder +metadata: + annotations: + argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true + labels: + name: instance + name: instance + namespace: openshift-logging +spec: + pipelines: + - inputRefs: + - application + name: application-logs + outputRefs: + - default + - inputRefs: + - infrastructure + name: infrastructure-logs + outputRefs: + - default diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_logreader_fix.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/50_fix_app_logs_reader.yaml similarity index 100% rename from tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_logreader_fix.yaml rename to tests/golden/defaults/openshift4-logging/openshift4-logging/50_fix_app_logs_reader.yaml diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_ingester_fix.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/50_fix_ingester_stuck.yaml similarity index 100% rename from tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_ingester_fix.yaml rename to tests/golden/defaults/openshift4-logging/openshift4-logging/50_fix_ingester_stuck.yaml diff --git 
a/tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_operator_metrics_token.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/50_fix_missing_metrics_token.yaml similarity index 100% rename from tests/golden/defaults/openshift4-logging/openshift4-logging/50_loki_operator_metrics_token.yaml rename to tests/golden/defaults/openshift4-logging/openshift4-logging/50_fix_missing_metrics_token.yaml diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/60_collector_alerts.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/60_collector_alerts.yaml index 268663f..2c6ddb1 100644 --- a/tests/golden/defaults/openshift4-logging/openshift4-logging/60_collector_alerts.yaml +++ b/tests/golden/defaults/openshift4-logging/openshift4-logging/60_collector_alerts.yaml @@ -12,7 +12,7 @@ spec: rules: - alert: SYN_CollectorNodeDown annotations: - message: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod + description: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m. summary: Collector cannot be scraped expr: | @@ -23,97 +23,10 @@ spec: severity: critical syn: 'true' syn_component: openshift4-logging - - alert: SYN_CollectorHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace - }}/{{ $labels.pod }} collector component.' 
- summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component - errors are high' - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.001 - for: 15m - labels: - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_CollectorVeryHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace - }}/{{ $labels.pod }} collector component.' - summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component - errors are very high' - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.05 - for: 15m - labels: - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_ElasticsearchDeprecation - annotations: - message: The OpenShift Elasticsearch Operator is deprecated and is planned - to be removed in a future release. Red Hat provides bug fixes and support - for this feature during the current release lifecycle, but this feature - no longer receives enhancements. As an alternative to using the OpenShift - Elasticsearch Operator to manage the default log storage, you can use - the Loki Operator. - summary: Detected Elasticsearch as the in-cluster storage which is deprecated - and will be removed in a future release. - expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: storage - severity: Warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_FluentdDeprecation - annotations: - message: Fluentd is deprecated and is planned to be removed in a future - release. 
Red Hat provides bug fixes and support for this feature during - the current release lifecycle, but this feature no longer receives enhancements. - As an alternative to Fluentd, you can use Vector instead. - summary: Detected Fluentd as the collector which is deprecated and will - be removed in a future release. - expr: | - sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: collector - severity: Warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_KibanaDeprecation - annotations: - message: The Kibana web console is now deprecated and is planned to be - removed in a future logging release. - summary: Detected Kibana as the visualization which is deprecated and - will be removed in a future release. - expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: visualization - severity: Warning - syn: 'true' - syn_component: openshift4-logging - alert: SYN_DiskBufferUsage annotations: - message: 'Collectors potentially consuming too much node disk, {{ $value - }}% ' + description: 'Collectors potentially consuming too much node disk, {{ + $value }}% ' summary: Detected consuming too much node disk on $labels.hostname host expr: "(label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink',\ \ buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)') \n/ on(instance)\ diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml index 65a573e..4f6c7da 100644 --- a/tests/golden/defaults/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml +++ b/tests/golden/defaults/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml @@ -204,6 +204,27 @@ 
spec: severity: warning syn: 'true' syn_component: openshift4-logging + - alert: SYN_LokiDiscardedSamplesWarning + annotations: + message: |- + Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion. + Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second. + runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning' + summary: Loki is discarding samples during ingestion because they fail + validation. + expr: | + sum by(namespace, tenant, reason) ( + irate(loki_discarded_samples_total{ + reason!="rate_limited", + reason!="per_stream_rate_limit", + reason!="stream_limit"}[2m]) + ) + > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-logging - alert: SYN_LokistackSchemaUpgradesRequired annotations: message: |- diff --git a/tests/golden/forwardingonly/openshift4-logging/apps/openshift4-logging.yaml b/tests/golden/forwardingonly/openshift4-logging/apps/openshift4-logging.yaml index e69de29..6825b97 100644 --- a/tests/golden/forwardingonly/openshift4-logging/apps/openshift4-logging.yaml +++ b/tests/golden/forwardingonly/openshift4-logging/apps/openshift4-logging.yaml @@ -0,0 +1,4 @@ +spec: + syncPolicy: + syncOptions: + - ServerSideApply=true diff --git a/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/00_namespace.yaml b/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/00_namespace.yaml index 1b27cf9..4c91e3c 100644 --- a/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/00_namespace.yaml +++ b/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/00_namespace.yaml @@ -2,6 +2,7 @@ apiVersion: v1 kind: Namespace metadata: annotations: + argocd.argoproj.io/sync-wave: '-100' openshift.io/node-selector: '' labels: name: openshift-logging diff --git a/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/10_operator_group.yaml 
b/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/10_operator_group.yaml index ff11675..52f645d 100644 --- a/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/10_operator_group.yaml +++ b/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/10_operator_group.yaml @@ -1,11 +1,9 @@ apiVersion: operators.coreos.com/v1 kind: OperatorGroup metadata: - annotations: {} + annotations: + argocd.argoproj.io/sync-wave: '-90' labels: name: cluster-logging name: cluster-logging namespace: openshift-logging -spec: - targetNamespaces: - - openshift-logging diff --git a/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/20_subscriptions.yaml b/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/20_subscriptions.yaml index 2c47bfe..a7f1a1d 100644 --- a/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/20_subscriptions.yaml +++ b/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/20_subscriptions.yaml @@ -1,13 +1,14 @@ apiVersion: operators.coreos.com/v1alpha1 kind: Subscription metadata: - annotations: {} + annotations: + argocd.argoproj.io/sync-wave: '-80' labels: name: cluster-logging name: cluster-logging namespace: openshift-logging spec: - channel: stable-5.9 + channel: stable-6.0 config: resources: limits: diff --git a/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/30_cluster_logging.yaml b/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/30_cluster_logging.yaml deleted file mode 100644 index 5b5a28d..0000000 --- a/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/30_cluster_logging.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: logging.openshift.io/v1 -kind: ClusterLogging -metadata: - annotations: - argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true - labels: - name: instance - name: instance - namespace: openshift-logging -spec: - collection: - type: vector - managementState: Managed diff --git 
a/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/60_collector_alerts.yaml b/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/60_collector_alerts.yaml index 268663f..2c6ddb1 100644 --- a/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/60_collector_alerts.yaml +++ b/tests/golden/forwardingonly/openshift4-logging/openshift4-logging/60_collector_alerts.yaml @@ -12,7 +12,7 @@ spec: rules: - alert: SYN_CollectorNodeDown annotations: - message: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod + description: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m. summary: Collector cannot be scraped expr: | @@ -23,97 +23,10 @@ spec: severity: critical syn: 'true' syn_component: openshift4-logging - - alert: SYN_CollectorHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace - }}/{{ $labels.pod }} collector component.' - summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component - errors are high' - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.001 - for: 15m - labels: - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_CollectorVeryHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace - }}/{{ $labels.pod }} collector component.' 
- summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component - errors are very high' - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.05 - for: 15m - labels: - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_ElasticsearchDeprecation - annotations: - message: The OpenShift Elasticsearch Operator is deprecated and is planned - to be removed in a future release. Red Hat provides bug fixes and support - for this feature during the current release lifecycle, but this feature - no longer receives enhancements. As an alternative to using the OpenShift - Elasticsearch Operator to manage the default log storage, you can use - the Loki Operator. - summary: Detected Elasticsearch as the in-cluster storage which is deprecated - and will be removed in a future release. - expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: storage - severity: Warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_FluentdDeprecation - annotations: - message: Fluentd is deprecated and is planned to be removed in a future - release. Red Hat provides bug fixes and support for this feature during - the current release lifecycle, but this feature no longer receives enhancements. - As an alternative to Fluentd, you can use Vector instead. - summary: Detected Fluentd as the collector which is deprecated and will - be removed in a future release. 
- expr: | - sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: collector - severity: Warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_KibanaDeprecation - annotations: - message: The Kibana web console is now deprecated and is planned to be - removed in a future logging release. - summary: Detected Kibana as the visualization which is deprecated and - will be removed in a future release. - expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: visualization - severity: Warning - syn: 'true' - syn_component: openshift4-logging - alert: SYN_DiskBufferUsage annotations: - message: 'Collectors potentially consuming too much node disk, {{ $value - }}% ' + description: 'Collectors potentially consuming too much node disk, {{ + $value }}% ' summary: Detected consuming too much node disk on $labels.hostname host expr: "(label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink',\ \ buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)') \n/ on(instance)\ diff --git a/tests/golden/multilineerr/openshift4-logging/apps/openshift4-logging.yaml b/tests/golden/multilineerr/openshift4-logging/apps/openshift4-logging.yaml index e69de29..6825b97 100644 --- a/tests/golden/multilineerr/openshift4-logging/apps/openshift4-logging.yaml +++ b/tests/golden/multilineerr/openshift4-logging/apps/openshift4-logging.yaml @@ -0,0 +1,4 @@ +spec: + syncPolicy: + syncOptions: + - ServerSideApply=true diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/00_namespace.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/00_namespace.yaml index 1b27cf9..4c91e3c 100644 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/00_namespace.yaml +++ 
b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/00_namespace.yaml @@ -2,6 +2,7 @@ apiVersion: v1 kind: Namespace metadata: annotations: + argocd.argoproj.io/sync-wave: '-100' openshift.io/node-selector: '' labels: name: openshift-logging diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/10_operator_group.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/10_operator_group.yaml index ff11675..52f645d 100644 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/10_operator_group.yaml +++ b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/10_operator_group.yaml @@ -1,11 +1,9 @@ apiVersion: operators.coreos.com/v1 kind: OperatorGroup metadata: - annotations: {} + annotations: + argocd.argoproj.io/sync-wave: '-90' labels: name: cluster-logging name: cluster-logging namespace: openshift-logging -spec: - targetNamespaces: - - openshift-logging diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/20_subscriptions.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/20_subscriptions.yaml index 1f0b7ad..5f550db 100644 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/20_subscriptions.yaml +++ b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/20_subscriptions.yaml @@ -1,13 +1,14 @@ apiVersion: operators.coreos.com/v1alpha1 kind: Subscription metadata: - annotations: {} + annotations: + argocd.argoproj.io/sync-wave: '-80' labels: name: cluster-logging name: cluster-logging namespace: openshift-logging spec: - channel: stable-5.9 + channel: stable-6.0 config: resources: limits: @@ -23,13 +24,14 @@ spec: apiVersion: operators.coreos.com/v1alpha1 kind: Subscription metadata: - annotations: {} + annotations: + argocd.argoproj.io/sync-wave: '-80' labels: name: loki-operator name: loki-operator namespace: openshift-operators-redhat spec: - channel: stable-5.9 + channel: stable-6.0 config: resources: limits: diff 
--git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_cluster_logging.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_cluster_logging.yaml deleted file mode 100644 index 307f0ca..0000000 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_cluster_logging.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: logging.openshift.io/v1 -kind: ClusterLogging -metadata: - annotations: - argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true - labels: - name: instance - name: instance - namespace: openshift-logging -spec: - collection: - type: vector - logStore: - lokistack: - name: loki - type: lokistack - managementState: Managed diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_logstore.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_loki_logstore.yaml similarity index 100% rename from tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_logstore.yaml rename to tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_loki_logstore.yaml diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_netpol.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_loki_netpol.yaml similarity index 100% rename from tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_netpol.yaml rename to tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_loki_netpol.yaml diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_rbac.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_loki_rbac.yaml similarity index 100% rename from tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_rbac.yaml rename to tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_loki_rbac.yaml diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_stack.yaml 
b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_loki_stack.yaml similarity index 97% rename from tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_stack.yaml rename to tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_loki_stack.yaml index 259068c..f859742 100644 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_stack.yaml +++ b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/30_loki_stack.yaml @@ -3,6 +3,7 @@ kind: LokiStack metadata: annotations: argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true + argocd.argoproj.io/sync-wave: '-50' labels: name: loki name: loki diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/31_cluster_logforwarding.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/40_log_forwarder.yaml similarity index 92% rename from tests/golden/multilineerr/openshift4-logging/openshift4-logging/31_cluster_logforwarding.yaml rename to tests/golden/multilineerr/openshift4-logging/openshift4-logging/40_log_forwarder.yaml index 15009bd..864adb9 100644 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/31_cluster_logforwarding.yaml +++ b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/40_log_forwarder.yaml @@ -1,4 +1,4 @@ -apiVersion: logging.openshift.io/v1 +apiVersion: observability.openshift.io/v1 kind: ClusterLogForwarder metadata: annotations: diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_logreader_fix.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_fix_app_logs_reader.yaml similarity index 100% rename from tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_logreader_fix.yaml rename to tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_fix_app_logs_reader.yaml diff --git 
a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_ingester_fix.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_fix_ingester_stuck.yaml similarity index 100% rename from tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_ingester_fix.yaml rename to tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_fix_ingester_stuck.yaml diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_operator_metrics_token.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_fix_missing_metrics_token.yaml similarity index 100% rename from tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_loki_operator_metrics_token.yaml rename to tests/golden/multilineerr/openshift4-logging/openshift4-logging/50_fix_missing_metrics_token.yaml diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_collector_alerts.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_collector_alerts.yaml index 268663f..2c6ddb1 100644 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_collector_alerts.yaml +++ b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_collector_alerts.yaml @@ -12,7 +12,7 @@ spec: rules: - alert: SYN_CollectorNodeDown annotations: - message: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod + description: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m. summary: Collector cannot be scraped expr: | @@ -23,97 +23,10 @@ spec: severity: critical syn: 'true' syn_component: openshift4-logging - - alert: SYN_CollectorHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace - }}/{{ $labels.pod }} collector component.' 
- summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component - errors are high' - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.001 - for: 15m - labels: - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_CollectorVeryHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace - }}/{{ $labels.pod }} collector component.' - summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component - errors are very high' - expr: | - 100 * ( - collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - / - collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} - ) > 0.05 - for: 15m - labels: - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_ElasticsearchDeprecation - annotations: - message: The OpenShift Elasticsearch Operator is deprecated and is planned - to be removed in a future release. Red Hat provides bug fixes and support - for this feature during the current release lifecycle, but this feature - no longer receives enhancements. As an alternative to using the OpenShift - Elasticsearch Operator to manage the default log storage, you can use - the Loki Operator. - summary: Detected Elasticsearch as the in-cluster storage which is deprecated - and will be removed in a future release. - expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: storage - severity: Warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_FluentdDeprecation - annotations: - message: Fluentd is deprecated and is planned to be removed in a future - release. 
Red Hat provides bug fixes and support for this feature during - the current release lifecycle, but this feature no longer receives enhancements. - As an alternative to Fluentd, you can use Vector instead. - summary: Detected Fluentd as the collector which is deprecated and will - be removed in a future release. - expr: | - sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: collector - severity: Warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_KibanaDeprecation - annotations: - message: The Kibana web console is now deprecated and is planned to be - removed in a future logging release. - summary: Detected Kibana as the visualization which is deprecated and - will be removed in a future release. - expr: | - sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0 - for: 5m - labels: - namespace: openshift-logging - service: visualization - severity: Warning - syn: 'true' - syn_component: openshift4-logging - alert: SYN_DiskBufferUsage annotations: - message: 'Collectors potentially consuming too much node disk, {{ $value - }}% ' + description: 'Collectors potentially consuming too much node disk, {{ + $value }}% ' summary: Detected consuming too much node disk on $labels.hostname host expr: "(label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink',\ \ buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)') \n/ on(instance)\ diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml index 65a573e..4f6c7da 100644 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml +++ b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_lokistack_alerts.yaml @@ 
-204,6 +204,27 @@ spec: severity: warning syn: 'true' syn_component: openshift4-logging + - alert: SYN_LokiDiscardedSamplesWarning + annotations: + message: |- + Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion. + Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second. + runbook_url: '[[ .RunbookURL]]#Loki-Discarded-Samples-Warning' + summary: Loki is discarding samples during ingestion because they fail + validation. + expr: | + sum by(namespace, tenant, reason) ( + irate(loki_discarded_samples_total{ + reason!="rate_limited", + reason!="per_stream_rate_limit", + reason!="stream_limit"}[2m]) + ) + > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-logging - alert: SYN_LokistackSchemaUpgradesRequired annotations: message: |- diff --git a/tests/master.yml b/tests/master.yml index d0afa9c..b73f496 100644 --- a/tests/master.yml +++ b/tests/master.yml @@ -50,39 +50,6 @@ parameters: outputRefs: - custom-forwarder - namespaceLogForwarderEnabled: true - namespaceLogForwarder: - jazz/hands: - outputs: - splunk-forwarder: - secret: - name: splunk-forwarder - type: fluentdForward - url: tls://splunk-forwarder:24224 - pipelines: - application-logs: - inputRefs: - - application - outputRefs: - - splunk-forwarder - foo/bar: - serviceAccountName: ueli - inputs: - my-apps: - application: - namespaces: - - app-one - - app-two - outputs: - custom-forwarder: - type: syslog - pipelines: - my-apps: - inputRefs: - - my-apps - outputRefs: - - custom-forwarder - secrets: my-secret: stringData: