diff --git a/ais/backend/common.go b/ais/backend/common.go index cd7bb718763..4149e53cb00 100644 --- a/ais/backend/common.go +++ b/ais/backend/common.go @@ -10,6 +10,7 @@ import ( "github.com/NVIDIA/aistore/api/apc" "github.com/NVIDIA/aistore/cmn" + "github.com/NVIDIA/aistore/cmn/cos" "github.com/NVIDIA/aistore/cmn/debug" "github.com/NVIDIA/aistore/core" "github.com/NVIDIA/aistore/core/meta" @@ -19,41 +20,91 @@ import ( type base struct { provider string - metrics map[string]string // this backend's metric names (below) + metrics cos.StrKVs // this backend's metric names (below) } // NOTE: `stats.LatencyToCounter()` - a public helper that relies on the naming convention below -func (b *base) init(snode *meta.Snode, tstats stats.Tracker) { +func (b *base) init(snode *meta.Snode, tr stats.Tracker) { prefix := b.provider if prefix == apc.AIS { prefix = apc.RemAIS } + + labels := cos.StrKVs{"backend": prefix} b.metrics = make(map[string]string, 12) + + // GET b.metrics[stats.GetCount] = prefix + "." + stats.GetCount b.metrics[stats.GetLatencyTotal] = prefix + "." + stats.GetLatencyTotal b.metrics[stats.GetE2ELatencyTotal] = prefix + "." + stats.GetE2ELatencyTotal b.metrics[stats.GetSize] = prefix + "." + stats.GetSize - tstats.RegExtMetric(snode, b.metrics[stats.GetCount], stats.KindCounter) - tstats.RegExtMetric(snode, b.metrics[stats.GetLatencyTotal], stats.KindTotal) - tstats.RegExtMetric(snode, b.metrics[stats.GetE2ELatencyTotal], stats.KindTotal) - tstats.RegExtMetric(snode, b.metrics[stats.GetSize], stats.KindSize) - + tr.RegExtMetric(snode, + b.metrics[stats.GetCount], + stats.KindCounter, + &stats.Extra{Help: "Total number of cold-GET requests", StrName: "remote_get_n", Labels: labels}, + ) + tr.RegExtMetric(snode, + b.metrics[stats.GetLatencyTotal], + stats.KindTotal, + &stats.Extra{Help: "Total nanoseconds cold-GET object into AIStore", StrName: "remote_get_ns_total", Labels: labels}, + ) + tr.RegExtMetric(snode, + b.metrics[stats.GetE2ELatencyTotal], + stats.KindTotal, + &stats.Extra{ + Help: "Total nanoseconds spent by AIStore to service cold-GET request", + StrName: "remote_e2e_get_ns_total", + Labels: labels, + }, + ) + tr.RegExtMetric(snode, + b.metrics[stats.GetSize], + stats.KindSize, + &stats.Extra{Help: "Total bytes received through cold-GET", StrName: "remote_get_bytes_total", Labels: labels}, + ) + + // PUT b.metrics[stats.PutCount] = prefix + "." + stats.PutCount b.metrics[stats.PutLatencyTotal] = prefix + "." + stats.PutLatencyTotal b.metrics[stats.PutE2ELatencyTotal] = prefix + "." + stats.PutE2ELatencyTotal b.metrics[stats.PutSize] = prefix + "." + stats.PutSize - tstats.RegExtMetric(snode, b.metrics[stats.PutCount], stats.KindCounter) - tstats.RegExtMetric(snode, b.metrics[stats.PutLatencyTotal], stats.KindTotal) - tstats.RegExtMetric(snode, b.metrics[stats.PutE2ELatencyTotal], stats.KindTotal) - tstats.RegExtMetric(snode, b.metrics[stats.PutSize], stats.KindSize) - + tr.RegExtMetric(snode, + b.metrics[stats.PutCount], + stats.KindCounter, + &stats.Extra{Help: "Total number of remote PUT requests", StrName: "remote_put_n", Labels: labels}, + ) + tr.RegExtMetric(snode, + b.metrics[stats.PutLatencyTotal], + stats.KindTotal, + &stats.Extra{Help: "Total nanoseconds PUT object from AIStore to remote", StrName: "remote_put_ns_total", Labels: labels}, + ) + tr.RegExtMetric(snode, + b.metrics[stats.PutE2ELatencyTotal], + stats.KindTotal, + &stats.Extra{StrName: "remote_e2e_put_ns_total", Labels: labels}, + ) + tr.RegExtMetric(snode, + b.metrics[stats.PutSize], + stats.KindSize, + &stats.Extra{StrName: "remote_e2e_put_bytes_total", Labels: labels}, + ) + + // version changed out-of-band b.metrics[stats.VerChangeCount] = prefix + "." + stats.VerChangeCount b.metrics[stats.VerChangeSize] = prefix + "." + stats.VerChangeSize - tstats.RegExtMetric(snode, b.metrics[stats.VerChangeCount], stats.KindCounter) - tstats.RegExtMetric(snode, b.metrics[stats.VerChangeSize], stats.KindSize) + tr.RegExtMetric(snode, + b.metrics[stats.VerChangeCount], + stats.KindCounter, + &stats.Extra{StrName: "remote_ver_change_n", Labels: labels}, + ) + tr.RegExtMetric(snode, + b.metrics[stats.VerChangeSize], + stats.KindSize, + &stats.Extra{StrName: "remote_ver_change_bytes_total", Labels: labels}, + ) } func (b *base) Provider() string { return b.provider } diff --git a/cmd/cli/cli/auth_hdlr.go b/cmd/cli/cli/auth_hdlr.go index b969dd12b87..30541729497 100644 --- a/cmd/cli/cli/auth_hdlr.go +++ b/cmd/cli/cli/auth_hdlr.go @@ -242,7 +242,7 @@ func readMasked(c *cli.Context, prompt string) string { } func cliAuthnURL(cfg *config.Config) string { - return cos.Rather(cfg.Auth.URL, os.Getenv(env.AuthN.URL)) + return cos.Right(cfg.Auth.URL, os.Getenv(env.AuthN.URL)) } func lookupClusterID(cluID string) (string, error) { diff --git a/cmd/cli/cli/completions.go b/cmd/cli/cli/completions.go index e24c67fae82..aed245c0a99 100644 --- a/cmd/cli/cli/completions.go +++ b/cmd/cli/cli/completions.go @@ -798,7 +798,7 @@ func oneClusterCompletions(c *cli.Context) { return } for _, clu := range cluList { - fmt.Println(cos.Either(clu.Alias, clu.ID)) + fmt.Println(cos.Left(clu.Alias, clu.ID)) } } diff --git a/cmd/cli/cli/show_hdlr.go b/cmd/cli/cli/show_hdlr.go index 1269e0d5f0a..7ebafcc0428 100644 --- a/cmd/cli/cli/show_hdlr.go +++ b/cmd/cli/cli/show_hdlr.go @@ -407,7 +407,7 @@ func showClusterHandler(c *cli.Context) error { return V(err) } - return cluDaeStatus(c, smap, tstatusMap, pstatusMap, cluConfig, cos.Either(sid, what)) + return cluDaeStatus(c, smap, tstatusMap, pstatusMap, cluConfig, cos.Left(sid, what)) } func xactList(c *cli.Context, xargs *xact.ArgsMsg, caption bool) (int, error) { diff --git a/cmd/cli/go.mod b/cmd/cli/go.mod index d3b31780853..5929c07df41 100644 --- a/cmd/cli/go.mod +++ b/cmd/cli/go.mod @@ -3,7 +3,7 @@ module github.com/NVIDIA/aistore/cmd/cli go 1.22.3 require ( - github.com/NVIDIA/aistore v1.3.24-0.20240727161516-d5b75b650095 + github.com/NVIDIA/aistore v1.3.24-0.20240727222627-5dd9704f790f github.com/fatih/color v1.17.0 github.com/json-iterator/go v1.1.12 github.com/onsi/ginkgo/v2 v2.19.0 diff --git a/cmd/cli/go.sum b/cmd/cli/go.sum index 44093d8e74d..020bc260577 100644 --- a/cmd/cli/go.sum +++ b/cmd/cli/go.sum @@ -1,7 +1,7 @@ code.cloudfoundry.org/bytefmt v0.0.0-20190710193110-1eb035ffe2b6/go.mod h1:wN/zk7mhREp/oviagqUXY3EwuHhWyOvAdsn5Y4CzOrc= github.com/BurntSushi/toml v1.3.2/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= -github.com/NVIDIA/aistore v1.3.24-0.20240727161516-d5b75b650095 h1:80TQClOFgyTrwdaC8LD+4Y2uNkwTss6mFFinjHt3ZV0= -github.com/NVIDIA/aistore v1.3.24-0.20240727161516-d5b75b650095/go.mod h1:A4wCIW7GooZSzDxTxh4pS092Ve9gCiXh1EvtjlVB8ew= +github.com/NVIDIA/aistore v1.3.24-0.20240727222627-5dd9704f790f h1:l9yJlM7QkeMSd9u2AZMy8e8gckQkPFk1FBvzjhK7buc= +github.com/NVIDIA/aistore v1.3.24-0.20240727222627-5dd9704f790f/go.mod h1:A4wCIW7GooZSzDxTxh4pS092Ve9gCiXh1EvtjlVB8ew= github.com/OneOfOne/xxhash v1.2.8 h1:31czK/TI9sNkxIKfaUfGlU47BAxQ0ztGgd9vPyqimf8= github.com/OneOfOne/xxhash v1.2.8/go.mod h1:eZbhyaAYD41SGSSsnmcpxVoRiQ/MPUTjUdIIOT9Um7Q= github.com/VividCortex/ewma v1.1.1/go.mod h1:2Tkkvm3sRDVXaiyucHiACn4cqf7DpdyLvmxzcbUokwA= diff --git a/core/mock/stats_mock.go b/core/mock/stats_mock.go index d49494e3f61..1a09513b77a 100644 --- a/core/mock/stats_mock.go +++ b/core/mock/stats_mock.go @@ -30,7 +30,7 @@ func (*StatsTracker) SetFlag(string, cos.NodeStateFlags) func (*StatsTracker) SetClrFlag(string, cos.NodeStateFlags, cos.NodeStateFlags) {} func (*StatsTracker) AddMany(...cos.NamedVal64) {} func (*StatsTracker) InitPrometheus(*meta.Snode) {} -func (*StatsTracker) RegExtMetric(*meta.Snode, string, string) {} +func (*StatsTracker) RegExtMetric(*meta.Snode, string, string, *stats.Extra) {} func (*StatsTracker) GetMetricNames() cos.StrKVs { return nil } func (*StatsTracker) GetStats() *stats.Node { return nil } func (*StatsTracker) GetStatsV322() *stats.NodeV322 { return nil } diff --git a/stats/api.go b/stats/api.go index 47320e1c6df..16c647f42ec 100644 --- a/stats/api.go +++ b/stats/api.go @@ -46,8 +46,12 @@ type ( ResetStats(errorsOnly bool) GetMetricNames() cos.StrKVs // (name, kind) pairs - RegExtMetric(node *meta.Snode, name, kind string) + // for aistore modules, to add their respective metrics + RegExtMetric(node *meta.Snode, name, kind string, extra *Extra) } +) + +type ( // REST API Node struct { @@ -105,6 +109,14 @@ type ( } ) +type ( + Extra struct { + StrName string + Help string + Labels cos.StrKVs + } +) + func IsErrMetric(name string) bool { return strings.HasPrefix(name, errPrefix) // e.g., "err.get.n" } diff --git a/stats/common.go b/stats/common.go index 9a3270ba3be..9ddab55257a 100644 --- a/stats/common.go +++ b/stats/common.go @@ -77,18 +77,18 @@ const ( // error counters // see also: `IncErr`, `regCommon`, `ioErrNames` - ErrGetCount = "err.get.n" - ErrPutCount = "err.put.n" - ErrAppendCount = "err.append.n" - ErrDeleteCount = "err.del.n" - ErrRenameCount = "err.ren.n" - ErrListCount = "err.lst.n" + ErrGetCount = errPrefix + GetCount + ErrPutCount = errPrefix + PutCount + ErrAppendCount = errPrefix + AppendCount + ErrDeleteCount = errPrefix + DeleteCount + ErrRenameCount = errPrefix + RenameCount + ErrListCount = errPrefix + ListCount // more errors // (for even more errors, see target_stats) - ErrHTTPWriteCount = "err.http.write.n" //nolint:gosec // false positive G101 - ErrDownloadCount = "err.dl.n" - ErrPutMirrorCount = "err.put.mirror.n" + ErrHTTPWriteCount = errPrefix + "http.write.n" + ErrDownloadCount = errPrefix + "dl.n" + ErrPutMirrorCount = errPrefix + "put.mirror.n" // KindLatency GetLatency = "get.ns" @@ -165,7 +165,9 @@ func (r *runner) InitPrometheus(snode *meta.Snode) { r.core.initProm(snode) } -func (r *runner) RegExtMetric(snode *meta.Snode, name, kind string) { r.reg(snode, name, kind) } +func (r *runner) RegExtMetric(snode *meta.Snode, name, kind string, extra *Extra) { + r.reg(snode, name, kind, extra) +} // common (target, proxy) metrics func (r *runner) regCommon(snode *meta.Snode) { diff --git a/stats/common_prom.go b/stats/common_prom.go index 091dc85f04c..ac4cc0055ac 100644 --- a/stats/common_prom.go +++ b/stats/common_prom.go @@ -263,7 +263,9 @@ var ( // NOTE naming convention: ".n" for the count and ".ns" for duration (nanoseconds) // compare with coreStats.initProm() -func (r *runner) reg(_ *meta.Snode, name, kind string) { +func (r *runner) reg(_ *meta.Snode, name, kind string, extra ...*Extra) { + _ = extra // TODO -- FIXME: in progress + v := &statsValue{kind: kind} // in StatsD metrics ":" delineates the name and the value - replace with underscore switch kind { diff --git a/stats/common_statsd.go b/stats/common_statsd.go index f91e35ac55c..0f7b95b84fb 100644 --- a/stats/common_statsd.go +++ b/stats/common_statsd.go @@ -281,7 +281,9 @@ func (s *coreStats) reset(errorsOnly bool) { // NOTE naming convention: ".n" for the count and ".ns" for duration (nanoseconds) // compare with coreStats.initProm() -func (r *runner) reg(snode *meta.Snode, name, kind string) { +func (r *runner) reg(snode *meta.Snode, name, kind string, extra ...*Extra) { + _ = extra // TODO -- FIXME: in progress + v := &statsValue{kind: kind} // in StatsD metrics ":" delineates the name and the value - replace with underscore switch kind { diff --git a/stats/proxy_stats.go b/stats/proxy_stats.go index b2b2d3a996d..66952df6a22 100644 --- a/stats/proxy_stats.go +++ b/stats/proxy_stats.go @@ -11,6 +11,7 @@ import ( "github.com/NVIDIA/aistore/cmn" "github.com/NVIDIA/aistore/cmn/atomic" "github.com/NVIDIA/aistore/cmn/cos" + "github.com/NVIDIA/aistore/cmn/debug" "github.com/NVIDIA/aistore/cmn/nlog" "github.com/NVIDIA/aistore/core" "github.com/NVIDIA/aistore/core/meta" @@ -77,8 +78,9 @@ func (r *Prunner) log(now int64, uptime time.Duration, _ *cmn.Config) { if now >= r.next || !idle { s.sgl.Reset() // sharing w/ CoreStats.copyT r.ctracker.write(s.sgl, r.sorted, false /*target*/, idle) - if s.sgl.Len() > 3 { // skip '{}' + if l := s.sgl.Len(); l > 3 { // skip '{}' line := string(s.sgl.Bytes()) + debug.Assert(l < s.sgl.Slab().Size(), l, " vs slab ", s.sgl.Slab().Size()) if line != r.prev { nlog.Infoln(line) r.prev = line diff --git a/stats/target_stats.go b/stats/target_stats.go index a0976dd0bf1..eb7971372ea 100644 --- a/stats/target_stats.go +++ b/stats/target_stats.go @@ -47,15 +47,15 @@ const ( VerChangeSize = "ver.change.size" // errors - ErrCksumCount = "err.cksum.n" - ErrCksumSize = "err.cksum.size" + ErrCksumCount = errPrefix + "cksum.n" + ErrCksumSize = errPrefix + "cksum.size" - ErrFSHCCount = "err.fshc.n" + ErrFSHCCount = errPrefix + "fshc.n" // IO errors (must have ioErrPrefix) - IOErrGetCount = "err.io.get.n" - IOErrPutCount = "err.io.put.n" - IOErrDeleteCount = "err.io.del.n" + IOErrGetCount = ioErrPrefix + "get.n" + IOErrPutCount = ioErrPrefix + "put.n" + IOErrDeleteCount = ioErrPrefix + "del.n" // KindLatency PutLatency = "put.ns" @@ -409,8 +409,9 @@ func (r *Trunner) log(now int64, uptime time.Duration, config *cmn.Config) { if now >= r.next || !idle { s.sgl.Reset() // sharing w/ CoreStats.copyT r.ctracker.write(s.sgl, r.sorted, true /*target*/, idle) - if s.sgl.Len() > 3 { // skip '{}' + if l := s.sgl.Len(); l > 3 { // skip '{}' line := string(s.sgl.Bytes()) + debug.Assert(l < s.sgl.Slab().Size(), l, " vs slab ", s.sgl.Slab().Size()) if line != r.prev { r.lines = append(r.lines, line) r.prev = line