Skip to content

Commit

Permalink
Merge pull request #11 from totvslabs/failover
Browse files Browse the repository at this point in the history
fix: improved failover metrics
  • Loading branch information
caarlos0 authored Dec 4, 2018
2 parents d770c0a + 1f568d4 commit 26b7ddc
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 9 deletions.
9 changes: 9 additions & 0 deletions client/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,11 +60,20 @@ type Nodes struct {
Balanced bool `json:"balanced"`
}

// Counters from the cluster
// Couchbase does not seem to expose a "null" count, so some of these
// metrics were found by digging through the couchbase/ns_server code.
//
// https://github.com/couchbase/ns_server/blob/master/src/ns_rebalancer.erl#L92
type Counters struct {
RebalanceStart int64 `json:"rebalance_start"`
RebalanceSuccess int64 `json:"rebalance_success"`
RebalanceFail int64 `json:"rebalance_fail"`
RebalanceStop int64 `json:"rebalance_stop"`
FailoverNode int64 `json:"failover_node"`
Failover int64 `json:"failover"`
FailoverComplete int64 `json:"failover_complete"`
FailoverIncomplete int64 `json:"failover_incomplete"`
GracefulFailoverStart int64 `json:"graceful_failover_start"`
GracefulFailoverSuccess int64 `json:"graceful_failover_success"`
GracefulFailoverFail int64 `json:"graceful_failover_fail"`
Expand Down
43 changes: 35 additions & 8 deletions collector/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,13 @@ type clusterCollector struct {
rebalanceStatus *prometheus.Desc
maxBucketCount *prometheus.Desc

countersRebalanceStart *prometheus.Desc
countersRebalanceSuccess *prometheus.Desc
countersRebalanceFail *prometheus.Desc
countersFailoverNode *prometheus.Desc
countersRebalanceStart *prometheus.Desc
countersRebalanceSuccess *prometheus.Desc
countersRebalanceFail *prometheus.Desc
countersRebalanceStop *prometheus.Desc
countersFailover *prometheus.Desc
countersFailoverComplete *prometheus.Desc
countersFailoverIncomplete *prometheus.Desc

storagetotalsRAMQuotatotal *prometheus.Desc
storagetotalsRAMQuotaused *prometheus.Desc
Expand Down Expand Up @@ -115,12 +118,30 @@ func NewClusterCollector(client client.Client) prometheus.Collector {
nil,
nil,
),
countersFailoverNode: prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "failover_node_total"),
countersRebalanceStop: prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "rebalance_stop_total"),
"Number of rebalances stopped since cluster is up",
nil,
nil,
),
countersFailover: prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "failover_total"),
"Number of failovers since cluster is up",
nil,
nil,
),
countersFailoverComplete: prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "failover_complete_total"),
"Number of failovers completed successfully since cluster is up",
nil,
nil,
),
countersFailoverIncomplete: prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "failover_incomplete_total"),
"Number of failovers that failed since cluster is up",
nil,
nil,
),
storagetotalsRAMQuotatotal: prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "storagetotals_ram_quotatotal_bytes"),
"Total memory allocated to Couchbase in the cluster",
Expand Down Expand Up @@ -211,7 +232,10 @@ func (c *clusterCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- c.countersRebalanceStart
ch <- c.countersRebalanceSuccess
ch <- c.countersRebalanceFail
ch <- c.countersFailoverNode
ch <- c.countersRebalanceStop
ch <- c.countersFailover
ch <- c.countersFailoverComplete
ch <- c.countersFailoverIncomplete

ch <- c.storagetotalsRAMQuotatotal
ch <- c.storagetotalsRAMQuotaused
Expand Down Expand Up @@ -255,7 +279,10 @@ func (c *clusterCollector) Collect(ch chan<- prometheus.Metric) {
ch <- prometheus.MustNewConstMetric(c.countersRebalanceStart, prometheus.CounterValue, float64(cluster.Counters.RebalanceStart))
ch <- prometheus.MustNewConstMetric(c.countersRebalanceSuccess, prometheus.CounterValue, float64(cluster.Counters.RebalanceSuccess))
ch <- prometheus.MustNewConstMetric(c.countersRebalanceFail, prometheus.CounterValue, float64(cluster.Counters.RebalanceFail))
ch <- prometheus.MustNewConstMetric(c.countersFailoverNode, prometheus.CounterValue, float64(cluster.Counters.FailoverNode))
ch <- prometheus.MustNewConstMetric(c.countersRebalanceStop, prometheus.CounterValue, float64(cluster.Counters.RebalanceStop))
ch <- prometheus.MustNewConstMetric(c.countersFailover, prometheus.CounterValue, float64(cluster.Counters.Failover+cluster.Counters.FailoverNode))
ch <- prometheus.MustNewConstMetric(c.countersFailoverComplete, prometheus.CounterValue, float64(cluster.Counters.FailoverComplete))
ch <- prometheus.MustNewConstMetric(c.countersFailoverIncomplete, prometheus.CounterValue, float64(cluster.Counters.FailoverIncomplete))

ch <- prometheus.MustNewConstMetric(c.storagetotalsRAMQuotatotal, prometheus.GaugeValue, cluster.StorageTotals.RAM.QuotaTotal)
ch <- prometheus.MustNewConstMetric(c.storagetotalsRAMQuotaused, prometheus.GaugeValue, cluster.StorageTotals.RAM.QuotaUsed)
Expand Down
32 changes: 31 additions & 1 deletion prometheus/couchbase.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ groups:
- name: couchbase
rules:
- record: couchbase::node_count
expr: count(couchbase_node_interestingstats_ops) by (instance)
expr: count(couchbase_node_healthy) by (instance)
- alert: CouchbaseDown
expr: up{job="couchbase"} == 0
for: 1m
Expand All @@ -30,6 +30,7 @@ groups:
for: 1m
labels:
severity: critical
page: true
annotations:
summary: "Node {{ $labels.node }} is not healthy, you may want to fail it over"
- alert: CouchbaseTooFragmented
Expand Down Expand Up @@ -61,13 +62,42 @@ groups:
severity: info
annotations:
summary: 'A rebalance just successfully finished'
- alert: CouchbaseRebalanceStopped
expr: increase(couchbase_cluster_rebalance_stop_total[5m]) > 0
for: 1s
labels:
severity: info
annotations:
summary: 'A rebalance was stopped'
- alert: CouchbaseRebalanceFailed
expr: increase(couchbase_cluster_rebalance_fail_total[5m]) > 0
for: 1s
labels:
severity: warning
annotations:
summary: 'A rebalance just failed'
- alert: CouchbaseFailover
expr: increase(couchbase_cluster_failover_total[5m]) > 0
for: 1s
labels:
severity: warning
annotations:
summary: 'A node failover has started'
- alert: CouchbaseFailoverComplete
expr: increase(couchbase_cluster_failover_complete_total[5m]) > 0
for: 1s
labels:
severity: info
annotations:
summary: 'A node failover has been completed'
- alert: CouchbaseFailoverIncomplete
expr: increase(couchbase_cluster_failover_incomplete_total[5m]) > 0
for: 1s
labels:
severity: critical
page: true
annotations:
summary: 'A node failover has failed'
- alert: CouchbaseQuotaUsageHigh
expr: couchbase_bucket_basicstats_quota_user_percent > 85
for: 1m
Expand Down

0 comments on commit 26b7ddc

Please sign in to comment.