Skip to content

Commit

Permalink
observability: prometheus labels
Browse files Browse the repository at this point in the history
* part one

Signed-off-by: Alex Aizman <[email protected]>
  • Loading branch information
alex-aizman committed Jul 27, 2024
1 parent 5dd9704 commit 21c1cd6
Show file tree
Hide file tree
Showing 13 changed files with 114 additions and 42 deletions.
79 changes: 65 additions & 14 deletions ais/backend/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (

"github.com/NVIDIA/aistore/api/apc"
"github.com/NVIDIA/aistore/cmn"
"github.com/NVIDIA/aistore/cmn/cos"
"github.com/NVIDIA/aistore/cmn/debug"
"github.com/NVIDIA/aistore/core"
"github.com/NVIDIA/aistore/core/meta"
Expand All @@ -19,41 +20,91 @@ import (

// base holds the provider name and the map of this backend's metric names
// (the map is populated by init below).
// NOTE(review): this is a rendered diff - the two `metrics` lines are the
// before/after sides of the change; presumably only the cos.StrKVs
// declaration remains after the commit.
type base struct {
provider string
metrics map[string]string // this backend's metric names (below)
metrics cos.StrKVs // this backend's metric names (below)
}

// NOTE: `stats.LatencyToCounter()` - a public helper that relies on the naming convention below
// init builds this backend's metric-name map and registers every metric with
// the stats tracker.
//
// Each registered name is "<prefix>.<generic name>", where prefix is
// b.provider, except that apc.AIS is replaced with apc.RemAIS. The map is keyed
// by the generic (un-prefixed) stats name and ends up with ten entries:
// four GET, four PUT, and two "version changed" metrics.
//
// NOTE(review): this is a rendered diff - lines using `tstats` are the
// pre-commit side; lines using `tr` (which additionally pass a *stats.Extra
// with Help, StrName, and Labels) are the post-commit side.
func (b *base) init(snode *meta.Snode, tstats stats.Tracker) {
func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
prefix := b.provider
if prefix == apc.AIS {
prefix = apc.RemAIS
}

// one labels map instance is shared by every stats.Extra constructed below
labels := cos.StrKVs{"backend": prefix}
// NOTE(review): capacity hint is 12 while 10 entries are stored below
b.metrics = make(map[string]string, 12)

// GET
b.metrics[stats.GetCount] = prefix + "." + stats.GetCount
b.metrics[stats.GetLatencyTotal] = prefix + "." + stats.GetLatencyTotal
b.metrics[stats.GetE2ELatencyTotal] = prefix + "." + stats.GetE2ELatencyTotal
b.metrics[stats.GetSize] = prefix + "." + stats.GetSize

tstats.RegExtMetric(snode, b.metrics[stats.GetCount], stats.KindCounter)
tstats.RegExtMetric(snode, b.metrics[stats.GetLatencyTotal], stats.KindTotal)
tstats.RegExtMetric(snode, b.metrics[stats.GetE2ELatencyTotal], stats.KindTotal)
tstats.RegExtMetric(snode, b.metrics[stats.GetSize], stats.KindSize)

tr.RegExtMetric(snode,
b.metrics[stats.GetCount],
stats.KindCounter,
&stats.Extra{Help: "Total number of cold-GET requests", StrName: "remote_get_n", Labels: labels},
)
tr.RegExtMetric(snode,
b.metrics[stats.GetLatencyTotal],
stats.KindTotal,
&stats.Extra{Help: "Total nanoseconds cold-GET object into AIStore", StrName: "remote_get_ns_total", Labels: labels},
)
tr.RegExtMetric(snode,
b.metrics[stats.GetE2ELatencyTotal],
stats.KindTotal,
&stats.Extra{
Help: "Total nanoseconds spent by AIStore to service cold-GET request",
StrName: "remote_e2e_get_ns_total",
Labels: labels,
},
)
tr.RegExtMetric(snode,
b.metrics[stats.GetSize],
stats.KindSize,
&stats.Extra{Help: "Total bytes received through cold-GET", StrName: "remote_get_bytes_total", Labels: labels},
)

// PUT
b.metrics[stats.PutCount] = prefix + "." + stats.PutCount
b.metrics[stats.PutLatencyTotal] = prefix + "." + stats.PutLatencyTotal
b.metrics[stats.PutE2ELatencyTotal] = prefix + "." + stats.PutE2ELatencyTotal
b.metrics[stats.PutSize] = prefix + "." + stats.PutSize

tstats.RegExtMetric(snode, b.metrics[stats.PutCount], stats.KindCounter)
tstats.RegExtMetric(snode, b.metrics[stats.PutLatencyTotal], stats.KindTotal)
tstats.RegExtMetric(snode, b.metrics[stats.PutE2ELatencyTotal], stats.KindTotal)
tstats.RegExtMetric(snode, b.metrics[stats.PutSize], stats.KindSize)

tr.RegExtMetric(snode,
b.metrics[stats.PutCount],
stats.KindCounter,
&stats.Extra{Help: "Total number of remote PUT requests", StrName: "remote_put_n", Labels: labels},
)
tr.RegExtMetric(snode,
b.metrics[stats.PutLatencyTotal],
stats.KindTotal,
&stats.Extra{Help: "Total nanoseconds PUT object from AIStore to remote", StrName: "remote_put_ns_total", Labels: labels},
)
// NOTE(review): no Help text here, unlike the GET e2e-latency counterpart above
tr.RegExtMetric(snode,
b.metrics[stats.PutE2ELatencyTotal],
stats.KindTotal,
&stats.Extra{StrName: "remote_e2e_put_ns_total", Labels: labels},
)
// NOTE(review): no Help text; also, the GET-size counterpart is named
// "remote_get_bytes_total" (no "e2e") - confirm that the "e2e" infix in
// this PUT-size name is intended and not a copy/paste from the entry above
tr.RegExtMetric(snode,
b.metrics[stats.PutSize],
stats.KindSize,
&stats.Extra{StrName: "remote_e2e_put_bytes_total", Labels: labels},
)

// version changed out-of-band
b.metrics[stats.VerChangeCount] = prefix + "." + stats.VerChangeCount
b.metrics[stats.VerChangeSize] = prefix + "." + stats.VerChangeSize

tstats.RegExtMetric(snode, b.metrics[stats.VerChangeCount], stats.KindCounter)
tstats.RegExtMetric(snode, b.metrics[stats.VerChangeSize], stats.KindSize)
// NOTE(review): both registrations below omit Help text
tr.RegExtMetric(snode,
b.metrics[stats.VerChangeCount],
stats.KindCounter,
&stats.Extra{StrName: "remote_ver_change_n", Labels: labels},
)
tr.RegExtMetric(snode,
b.metrics[stats.VerChangeSize],
stats.KindSize,
&stats.Extra{StrName: "remote_ver_change_bytes_total", Labels: labels},
)
}

// Provider returns the backend's provider name (the `provider` field).
func (b *base) Provider() string {
	name := b.provider
	return name
}
Expand Down
2 changes: 1 addition & 1 deletion cmd/cli/cli/auth_hdlr.go
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ func readMasked(c *cli.Context, prompt string) string {
}

// cliAuthnURL returns the AuthN URL chosen between the configured value
// (cfg.Auth.URL) and the value of the environment variable named by
// env.AuthN.URL.
// NOTE(review): this is a rendered diff - the cos.Rather line is the
// pre-commit side and cos.Right replaces it; presumably a pure rename (same
// arguments, same order) - confirm against cmn/cos.
func cliAuthnURL(cfg *config.Config) string {
return cos.Rather(cfg.Auth.URL, os.Getenv(env.AuthN.URL))
return cos.Right(cfg.Auth.URL, os.Getenv(env.AuthN.URL))
}

func lookupClusterID(cluID string) (string, error) {
Expand Down
2 changes: 1 addition & 1 deletion cmd/cli/cli/completions.go
Original file line number Diff line number Diff line change
Expand Up @@ -798,7 +798,7 @@ func oneClusterCompletions(c *cli.Context) {
return
}
for _, clu := range cluList {
fmt.Println(cos.Either(clu.Alias, clu.ID))
fmt.Println(cos.Left(clu.Alias, clu.ID))
}
}

Expand Down
2 changes: 1 addition & 1 deletion cmd/cli/cli/show_hdlr.go
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ func showClusterHandler(c *cli.Context) error {
return V(err)
}

return cluDaeStatus(c, smap, tstatusMap, pstatusMap, cluConfig, cos.Either(sid, what))
return cluDaeStatus(c, smap, tstatusMap, pstatusMap, cluConfig, cos.Left(sid, what))
}

func xactList(c *cli.Context, xargs *xact.ArgsMsg, caption bool) (int, error) {
Expand Down
2 changes: 1 addition & 1 deletion cmd/cli/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module github.com/NVIDIA/aistore/cmd/cli
go 1.22.3

require (
github.com/NVIDIA/aistore v1.3.24-0.20240727161516-d5b75b650095
github.com/NVIDIA/aistore v1.3.24-0.20240727222627-5dd9704f790f
github.com/fatih/color v1.17.0
github.com/json-iterator/go v1.1.12
github.com/onsi/ginkgo/v2 v2.19.0
Expand Down
4 changes: 2 additions & 2 deletions cmd/cli/go.sum
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
code.cloudfoundry.org/bytefmt v0.0.0-20190710193110-1eb035ffe2b6/go.mod h1:wN/zk7mhREp/oviagqUXY3EwuHhWyOvAdsn5Y4CzOrc=
github.com/BurntSushi/toml v1.3.2/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ=
github.com/NVIDIA/aistore v1.3.24-0.20240727161516-d5b75b650095 h1:80TQClOFgyTrwdaC8LD+4Y2uNkwTss6mFFinjHt3ZV0=
github.com/NVIDIA/aistore v1.3.24-0.20240727161516-d5b75b650095/go.mod h1:A4wCIW7GooZSzDxTxh4pS092Ve9gCiXh1EvtjlVB8ew=
github.com/NVIDIA/aistore v1.3.24-0.20240727222627-5dd9704f790f h1:l9yJlM7QkeMSd9u2AZMy8e8gckQkPFk1FBvzjhK7buc=
github.com/NVIDIA/aistore v1.3.24-0.20240727222627-5dd9704f790f/go.mod h1:A4wCIW7GooZSzDxTxh4pS092Ve9gCiXh1EvtjlVB8ew=
github.com/OneOfOne/xxhash v1.2.8 h1:31czK/TI9sNkxIKfaUfGlU47BAxQ0ztGgd9vPyqimf8=
github.com/OneOfOne/xxhash v1.2.8/go.mod h1:eZbhyaAYD41SGSSsnmcpxVoRiQ/MPUTjUdIIOT9Um7Q=
github.com/VividCortex/ewma v1.1.1/go.mod h1:2Tkkvm3sRDVXaiyucHiACn4cqf7DpdyLvmxzcbUokwA=
Expand Down
2 changes: 1 addition & 1 deletion core/mock/stats_mock.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ func (*StatsTracker) SetFlag(string, cos.NodeStateFlags)
// No-op mock methods: each one either has an empty body or returns nil.
func (*StatsTracker) SetClrFlag(string, cos.NodeStateFlags, cos.NodeStateFlags) {}
func (*StatsTracker) AddMany(...cos.NamedVal64) {}
func (*StatsTracker) InitPrometheus(*meta.Snode) {}
// NOTE(review): rendered diff - the three-parameter RegExtMetric is the
// pre-commit side; the variant taking *stats.Extra matches the updated
// RegExtMetric signature shown in the stats/api.go hunk of this commit.
func (*StatsTracker) RegExtMetric(*meta.Snode, string, string) {}
func (*StatsTracker) RegExtMetric(*meta.Snode, string, string, *stats.Extra) {}
func (*StatsTracker) GetMetricNames() cos.StrKVs { return nil }
func (*StatsTracker) GetStats() *stats.Node { return nil }
func (*StatsTracker) GetStatsV322() *stats.NodeV322 { return nil }
Expand Down
14 changes: 13 additions & 1 deletion stats/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,12 @@ type (
ResetStats(errorsOnly bool)
GetMetricNames() cos.StrKVs // (name, kind) pairs

RegExtMetric(node *meta.Snode, name, kind string)
// for aistore modules, to add their respective metrics
RegExtMetric(node *meta.Snode, name, kind string, extra *Extra)
}
)

type (

// REST API
Node struct {
Expand Down Expand Up @@ -105,6 +109,14 @@ type (
}
)

// Extra carries optional descriptive attributes that a caller passes to
// RegExtMetric along with the metric's name and kind.
type Extra struct {
	// StrName is an alternative name for the metric (e.g., "remote_get_n").
	// NOTE(review): presumably the Prometheus-facing name - confirm once reg() starts using it.
	StrName string
	// Help is the human-readable description of the metric.
	Help string
	// Labels are key/value pairs associated with the metric (e.g., "backend": <prefix>).
	Labels cos.StrKVs
}

// IsErrMetric reports whether the given metric name starts with errPrefix
// (e.g., "err.get.n").
func IsErrMetric(name string) bool {
	hasErrPrefix := strings.HasPrefix(name, errPrefix)
	return hasErrPrefix
}
Expand Down
22 changes: 12 additions & 10 deletions stats/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,18 +77,18 @@ const (

// error counters
// see also: `IncErr`, `regCommon`, `ioErrNames`
ErrGetCount = "err.get.n"
ErrPutCount = "err.put.n"
ErrAppendCount = "err.append.n"
ErrDeleteCount = "err.del.n"
ErrRenameCount = "err.ren.n"
ErrListCount = "err.lst.n"
ErrGetCount = errPrefix + GetCount
ErrPutCount = errPrefix + PutCount
ErrAppendCount = errPrefix + AppendCount
ErrDeleteCount = errPrefix + DeleteCount
ErrRenameCount = errPrefix + RenameCount
ErrListCount = errPrefix + ListCount

// more errors
// (for even more errors, see target_stats)
ErrHTTPWriteCount = "err.http.write.n" //nolint:gosec // false positive G101
ErrDownloadCount = "err.dl.n"
ErrPutMirrorCount = "err.put.mirror.n"
ErrHTTPWriteCount = errPrefix + "http.write.n"
ErrDownloadCount = errPrefix + "dl.n"
ErrPutMirrorCount = errPrefix + "put.mirror.n"

// KindLatency
GetLatency = "get.ns"
Expand Down Expand Up @@ -165,7 +165,9 @@ func (r *runner) InitPrometheus(snode *meta.Snode) {
r.core.initProm(snode)
}

// RegExtMetric registers a metric by forwarding all of its arguments to r.reg.
// NOTE(review): this is a rendered diff - the one-line form is the pre-commit
// side; the multi-line form adds the `extra` argument and passes it through.
func (r *runner) RegExtMetric(snode *meta.Snode, name, kind string) { r.reg(snode, name, kind) }
func (r *runner) RegExtMetric(snode *meta.Snode, name, kind string, extra *Extra) {
r.reg(snode, name, kind, extra)
}

// common (target, proxy) metrics
func (r *runner) regCommon(snode *meta.Snode) {
Expand Down
4 changes: 3 additions & 1 deletion stats/common_prom.go
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,9 @@ var (

// NOTE naming convention: ".n" for the count and ".ns" for duration (nanoseconds)
// compare with coreStats.initProm()
func (r *runner) reg(_ *meta.Snode, name, kind string) {
func (r *runner) reg(_ *meta.Snode, name, kind string, extra ...*Extra) {
_ = extra // TODO -- FIXME: in progress

v := &statsValue{kind: kind}
// in StatsD metrics ":" delineates the name and the value - replace with underscore
switch kind {
Expand Down
4 changes: 3 additions & 1 deletion stats/common_statsd.go
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,9 @@ func (s *coreStats) reset(errorsOnly bool) {

// NOTE naming convention: ".n" for the count and ".ns" for duration (nanoseconds)
// compare with coreStats.initProm()
func (r *runner) reg(snode *meta.Snode, name, kind string) {
func (r *runner) reg(snode *meta.Snode, name, kind string, extra ...*Extra) {
_ = extra // TODO -- FIXME: in progress

v := &statsValue{kind: kind}
// in StatsD metrics ":" delineates the name and the value - replace with underscore
switch kind {
Expand Down
4 changes: 3 additions & 1 deletion stats/proxy_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"github.com/NVIDIA/aistore/cmn"
"github.com/NVIDIA/aistore/cmn/atomic"
"github.com/NVIDIA/aistore/cmn/cos"
"github.com/NVIDIA/aistore/cmn/debug"
"github.com/NVIDIA/aistore/cmn/nlog"
"github.com/NVIDIA/aistore/core"
"github.com/NVIDIA/aistore/core/meta"
Expand Down Expand Up @@ -77,8 +78,9 @@ func (r *Prunner) log(now int64, uptime time.Duration, _ *cmn.Config) {
if now >= r.next || !idle {
s.sgl.Reset() // sharing w/ CoreStats.copyT
r.ctracker.write(s.sgl, r.sorted, false /*target*/, idle)
if s.sgl.Len() > 3 { // skip '{}'
if l := s.sgl.Len(); l > 3 { // skip '{}'
line := string(s.sgl.Bytes())
debug.Assert(l < s.sgl.Slab().Size(), l, " vs slab ", s.sgl.Slab().Size())
if line != r.prev {
nlog.Infoln(line)
r.prev = line
Expand Down
15 changes: 8 additions & 7 deletions stats/target_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,15 @@ const (
VerChangeSize = "ver.change.size"

// errors
ErrCksumCount = "err.cksum.n"
ErrCksumSize = "err.cksum.size"
ErrCksumCount = errPrefix + "cksum.n"
ErrCksumSize = errPrefix + "cksum.size"

ErrFSHCCount = "err.fshc.n"
ErrFSHCCount = errPrefix + "fshc.n"

// IO errors (must have ioErrPrefix)
IOErrGetCount = "err.io.get.n"
IOErrPutCount = "err.io.put.n"
IOErrDeleteCount = "err.io.del.n"
IOErrGetCount = ioErrPrefix + "get.n"
IOErrPutCount = ioErrPrefix + "put.n"
IOErrDeleteCount = ioErrPrefix + "del.n"

// KindLatency
PutLatency = "put.ns"
Expand Down Expand Up @@ -409,8 +409,9 @@ func (r *Trunner) log(now int64, uptime time.Duration, config *cmn.Config) {
if now >= r.next || !idle {
s.sgl.Reset() // sharing w/ CoreStats.copyT
r.ctracker.write(s.sgl, r.sorted, true /*target*/, idle)
if s.sgl.Len() > 3 { // skip '{}'
if l := s.sgl.Len(); l > 3 { // skip '{}'
line := string(s.sgl.Bytes())
debug.Assert(l < s.sgl.Slab().Size(), l, " vs slab ", s.sgl.Slab().Size())
if line != r.prev {
r.lines = append(r.lines, line)
r.prev = line
Expand Down

0 comments on commit 21c1cd6

Please sign in to comment.