diff --git a/client/nodes.go b/client/nodes.go index 7da6ef0..6b021b4 100644 --- a/client/nodes.go +++ b/client/nodes.go @@ -60,11 +60,20 @@ type Nodes struct { Balanced bool `json:"balanced"` } +// Counters holds the event counters reported by the cluster. +// Couchbase does not seem to expose a counter while its value is still null, so some of +// these metrics were found by reading the couchbase/ns_server code. +// +// https://github.com/couchbase/ns_server/blob/master/src/ns_rebalancer.erl#L92 type Counters struct { RebalanceStart int64 `json:"rebalance_start"` RebalanceSuccess int64 `json:"rebalance_success"` RebalanceFail int64 `json:"rebalance_fail"` + RebalanceStop int64 `json:"rebalance_stop"` FailoverNode int64 `json:"failover_node"` + Failover int64 `json:"failover"` + FailoverComplete int64 `json:"failover_complete"` + FailoverIncomplete int64 `json:"failover_incomplete"` GracefulFailoverStart int64 `json:"graceful_failover_start"` GracefulFailoverSuccess int64 `json:"graceful_failover_success"` GracefulFailoverFail int64 `json:"graceful_failover_fail"` diff --git a/collector/cluster.go b/collector/cluster.go index 09da433..dc7a6e0 100644 --- a/collector/cluster.go +++ b/collector/cluster.go @@ -23,10 +23,13 @@ type clusterCollector struct { rebalanceStatus *prometheus.Desc maxBucketCount *prometheus.Desc - countersRebalanceStart *prometheus.Desc - countersRebalanceSuccess *prometheus.Desc - countersRebalanceFail *prometheus.Desc - countersFailoverNode *prometheus.Desc + countersRebalanceStart *prometheus.Desc + countersRebalanceSuccess *prometheus.Desc + countersRebalanceFail *prometheus.Desc + countersRebalanceStop *prometheus.Desc + countersFailover *prometheus.Desc + countersFailoverComplete *prometheus.Desc + countersFailoverIncomplete *prometheus.Desc storagetotalsRAMQuotatotal *prometheus.Desc storagetotalsRAMQuotaused *prometheus.Desc @@ -115,12 +118,30 @@ func NewClusterCollector(client client.Client) prometheus.Collector { nil, nil, ), - countersFailoverNode: prometheus.NewDesc( -
prometheus.BuildFQName(namespace, subsystem, "failover_node_total"), + countersRebalanceStop: prometheus.NewDesc( + prometheus.BuildFQName(namespace, subsystem, "rebalance_stop_total"), + "Number of rebalances stopped since cluster is up", + nil, + nil, + ), + countersFailover: prometheus.NewDesc( + prometheus.BuildFQName(namespace, subsystem, "failover_total"), "Number of failovers since cluster is up", nil, nil, ), + countersFailoverComplete: prometheus.NewDesc( + prometheus.BuildFQName(namespace, subsystem, "failover_complete_total"), + "Number of failovers completed successfully since cluster is up", + nil, + nil, + ), + countersFailoverIncomplete: prometheus.NewDesc( + prometheus.BuildFQName(namespace, subsystem, "failover_incomplete_total"), + "Number of failovers that failed since cluster is up", + nil, + nil, + ), storagetotalsRAMQuotatotal: prometheus.NewDesc( prometheus.BuildFQName(namespace, subsystem, "storagetotals_ram_quotatotal_bytes"), "Total memory allocated to Couchbase in the cluster", @@ -211,7 +232,10 @@ func (c *clusterCollector) Describe(ch chan<- *prometheus.Desc) { ch <- c.countersRebalanceStart ch <- c.countersRebalanceSuccess ch <- c.countersRebalanceFail - ch <- c.countersFailoverNode + ch <- c.countersRebalanceStop + ch <- c.countersFailover + ch <- c.countersFailoverComplete + ch <- c.countersFailoverIncomplete ch <- c.storagetotalsRAMQuotatotal ch <- c.storagetotalsRAMQuotaused @@ -255,7 +279,10 @@ func (c *clusterCollector) Collect(ch chan<- prometheus.Metric) { ch <- prometheus.MustNewConstMetric(c.countersRebalanceStart, prometheus.CounterValue, float64(cluster.Counters.RebalanceStart)) ch <- prometheus.MustNewConstMetric(c.countersRebalanceSuccess, prometheus.CounterValue, float64(cluster.Counters.RebalanceSuccess)) ch <- prometheus.MustNewConstMetric(c.countersRebalanceFail, prometheus.CounterValue, float64(cluster.Counters.RebalanceFail)) - ch <- prometheus.MustNewConstMetric(c.countersFailoverNode, prometheus.CounterValue, 
float64(cluster.Counters.FailoverNode)) + ch <- prometheus.MustNewConstMetric(c.countersRebalanceStop, prometheus.CounterValue, float64(cluster.Counters.RebalanceStop)) + ch <- prometheus.MustNewConstMetric(c.countersFailover, prometheus.CounterValue, float64(cluster.Counters.Failover+cluster.Counters.FailoverNode)) + ch <- prometheus.MustNewConstMetric(c.countersFailoverComplete, prometheus.CounterValue, float64(cluster.Counters.FailoverComplete)) + ch <- prometheus.MustNewConstMetric(c.countersFailoverIncomplete, prometheus.CounterValue, float64(cluster.Counters.FailoverIncomplete)) ch <- prometheus.MustNewConstMetric(c.storagetotalsRAMQuotatotal, prometheus.GaugeValue, cluster.StorageTotals.RAM.QuotaTotal) ch <- prometheus.MustNewConstMetric(c.storagetotalsRAMQuotaused, prometheus.GaugeValue, cluster.StorageTotals.RAM.QuotaUsed) diff --git a/prometheus/couchbase.rules.yml b/prometheus/couchbase.rules.yml index 28d3e1f..2759f97 100644 --- a/prometheus/couchbase.rules.yml +++ b/prometheus/couchbase.rules.yml @@ -9,7 +9,7 @@ groups: - name: couchbase rules: - record: couchbase::node_count - expr: count(couchbase_node_interestingstats_ops) by (instance) + expr: count(couchbase_node_healthy) by (instance) - alert: CouchbaseDown expr: up{job="couchbase"} == 0 for: 1m @@ -30,6 +30,7 @@ groups: for: 1m labels: severity: critical + page: true annotations: summary: "Node {{ $labels.node }} is not healthy, you may want to fail it over" - alert: CouchbaseTooFragmented @@ -61,6 +62,13 @@ groups: severity: info annotations: summary: 'A rebalance just successfully finished' + - alert: CouchbaseRebalanceStopped + expr: increase(couchbase_cluster_rebalance_stop_total[5m]) > 0 + for: 1s + labels: + severity: info + annotations: + summary: 'A rebalance was stopped' - alert: CouchbaseRebalanceFailed expr: increase(couchbase_cluster_rebalance_fail_total[5m]) > 0 for: 1s @@ -68,6 +76,28 @@ groups: severity: warning annotations: summary: 'A rebalance just failed' + - alert: 
CouchbaseFailover + expr: increase(couchbase_cluster_failover_total[5m]) > 0 + for: 1s + labels: + severity: warning + annotations: + summary: 'A node failover has started' + - alert: CouchbaseFailoverComplete + expr: increase(couchbase_cluster_failover_complete_total[5m]) > 0 + for: 1s + labels: + severity: info + annotations: + summary: 'A node failover has been completed' + - alert: CouchbaseFailoverIncomplete + expr: increase(couchbase_cluster_failover_incomplete_total[5m]) > 0 + for: 1s + labels: + severity: critical + page: true + annotations: + summary: 'A node failover has failed' - alert: CouchbaseQuotaUsageHigh expr: couchbase_bucket_basicstats_quota_user_percent > 85 for: 1m