diff --git a/ais/backend/ais.go b/ais/backend/ais.go index e13c640fc54..57925cf7f90 100644 --- a/ais/backend/ais.go +++ b/ais/backend/ais.go @@ -64,7 +64,7 @@ var ( preg, treg *regexp.Regexp ) -func NewAIS(t core.TargetPut, tstats stats.Tracker) *AISbp { +func NewAIS(t core.TargetPut, tstats stats.Tracker, startingUp bool) *AISbp { suff := regexp.QuoteMeta(meta.SnameSuffix) preg = regexp.MustCompile(regexp.QuoteMeta(meta.PnamePrefix) + `\S*` + suff + ": ") treg = regexp.MustCompile(regexp.QuoteMeta(meta.TnamePrefix) + `\S*` + suff + ": ") @@ -74,7 +74,7 @@ func NewAIS(t core.TargetPut, tstats stats.Tracker) *AISbp { alias: make(cos.StrKVs), base: base{provider: apc.AIS}, } - bp.base.init(t.Snode(), tstats) + bp.base.init(t.Snode(), tstats, startingUp) return bp } diff --git a/ais/backend/aws.go b/ais/backend/aws.go index a3434c0a36a..e4fbc0aad88 100644 --- a/ais/backend/aws.go +++ b/ais/backend/aws.go @@ -74,13 +74,10 @@ func NewAWS(t core.TargetPut, tstats stats.Tracker, startingUp bool) (core.Backe mm: t.PageMM(), base: base{provider: apc.AWS}, } - if startingUp { - // register metrics only once - bp.base.init(t.Snode(), tstats) - } else { - // reset clients map to recreate and reload credentials - clients.Clear() - } + // register metrics + bp.base.init(t.Snode(), tstats, startingUp) + // reset clients map + clients.Clear() return bp, nil } diff --git a/ais/backend/azure.go b/ais/backend/azure.go index 8435d7fc42a..c70711ab210 100644 --- a/ais/backend/azure.go +++ b/ais/backend/azure.go @@ -112,10 +112,9 @@ func NewAzure(t core.TargetPut, tstats stats.Tracker, startingUp bool) (core.Bac u: blurl, base: base{provider: apc.Azure}, } - if startingUp { - // register metrics only once - bp.base.init(t.Snode(), tstats) - } + // register metrics + bp.base.init(t.Snode(), tstats, startingUp) + return bp, nil } diff --git a/ais/backend/common.go b/ais/backend/common.go index 00c4dc32101..d66904b2954 100644 --- a/ais/backend/common.go +++ b/ais/backend/common.go @@ -12,26 +12,40 @@ import ( "github.com/NVIDIA/aistore/cmn" "github.com/NVIDIA/aistore/cmn/cos" "github.com/NVIDIA/aistore/cmn/debug" + "github.com/NVIDIA/aistore/cmn/nlog" "github.com/NVIDIA/aistore/core" "github.com/NVIDIA/aistore/core/meta" "github.com/NVIDIA/aistore/fs" "github.com/NVIDIA/aistore/stats" ) +const numBackendMetricks = 12 + type base struct { metrics cos.StrKVs // this backend's metric names (below) provider string } -// NOTE: `stats.LatencyToCounter()` - a public helper that relies on the naming convention below -func (b *base) init(snode *meta.Snode, tr stats.Tracker) { - prefix := b.provider +func (b *base) init(snode *meta.Snode, tr stats.Tracker, startingUp bool) { + var ( + prefix = b.provider + regExt = true + ) if prefix == apc.AIS { prefix = apc.RemAIS } + if !startingUp { + // re-initializing or enabling at runtime + all := tr.GetMetricNames() + if _, ok := all[prefix+"."+stats.GetCount]; ok { + nlog.Infoln(prefix, "backend metrics already reg-ed") + regExt = false + } + } + labels := cos.StrKVs{"backend": prefix} - b.metrics = make(map[string]string, 12) + b.metrics = make(map[string]string, numBackendMetricks) // GET b.metrics[stats.GetCount] = prefix + "." + stats.GetCount @@ -39,47 +53,49 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) { b.metrics[stats.GetE2ELatencyTotal] = prefix + "." + stats.GetE2ELatencyTotal b.metrics[stats.GetSize] = prefix + "." + stats.GetSize - tr.RegExtMetric(snode, - b.metrics[stats.GetCount], - stats.KindCounter, - &stats.Extra{ - Help: "GET: total number of executed remote requests (cold GETs)", - StrName: "remote_get_count", - Labels: labels, - VarLabs: stats.BckVarlabs, - }, - ) - tr.RegExtMetric(snode, - b.metrics[stats.GetLatencyTotal], - stats.KindTotal, - &stats.Extra{ - Help: "GET: total cumulative time (nanoseconds) to execute cold GETs and store new object versions in-cluster", - StrName: "remote_get_ns_total", - Labels: labels, - VarLabs: stats.BckVarlabs, - }, - ) - tr.RegExtMetric(snode, - b.metrics[stats.GetE2ELatencyTotal], - stats.KindTotal, - &stats.Extra{ - Help: "GET: total end-to-end time (nanoseconds) servicing remote requests; " + - "includes: receiving request, executing cold-GET, storing new object version in-cluster, and transmitting response", - StrName: "remote_e2e_get_ns_total", - Labels: labels, - VarLabs: stats.BckVarlabs, - }, - ) - tr.RegExtMetric(snode, - b.metrics[stats.GetSize], - stats.KindSize, - &stats.Extra{ - Help: "GET: total cumulative size (bytes) of all cold-GET transactions", - StrName: "remote_get_bytes_total", - Labels: labels, - VarLabs: stats.BckVarlabs, - }, - ) + if regExt { + tr.RegExtMetric(snode, + b.metrics[stats.GetCount], + stats.KindCounter, + &stats.Extra{ + Help: "GET: total number of executed remote requests (cold GETs)", + StrName: "remote_get_count", + Labels: labels, + VarLabs: stats.BckVarlabs, + }, + ) + tr.RegExtMetric(snode, + b.metrics[stats.GetLatencyTotal], + stats.KindTotal, + &stats.Extra{ + Help: "GET: total cumulative time (nanoseconds) to execute cold GETs and store new object versions in-cluster", + StrName: "remote_get_ns_total", + Labels: labels, + VarLabs: stats.BckVarlabs, + }, + ) + tr.RegExtMetric(snode, + b.metrics[stats.GetE2ELatencyTotal], + stats.KindTotal, + &stats.Extra{ + Help: "GET: total end-to-end time (nanoseconds) servicing remote requests; " + + "includes: receiving request, executing cold-GET, storing new object version in-cluster, and transmitting response", + StrName: "remote_e2e_get_ns_total", + Labels: labels, + VarLabs: stats.BckVarlabs, + }, + ) + tr.RegExtMetric(snode, + b.metrics[stats.GetSize], + stats.KindSize, + &stats.Extra{ + Help: "GET: total cumulative size (bytes) of all cold-GET transactions", + StrName: "remote_get_bytes_total", + Labels: labels, + VarLabs: stats.BckVarlabs, + }, + ) + } // PUT b.metrics[stats.PutCount] = prefix + "." + stats.PutCount @@ -87,101 +103,112 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) { b.metrics[stats.PutE2ELatencyTotal] = prefix + "." + stats.PutE2ELatencyTotal b.metrics[stats.PutSize] = prefix + "." + stats.PutSize - tr.RegExtMetric(snode, - b.metrics[stats.PutCount], - stats.KindCounter, - &stats.Extra{ - Help: "PUT: total number of executed remote requests to a given backend", - StrName: "remote_put_count", - Labels: labels, - VarLabs: stats.BckXactVarlabs, - }, - ) - tr.RegExtMetric(snode, - b.metrics[stats.PutLatencyTotal], - stats.KindTotal, - &stats.Extra{ - Help: "PUT: total cumulative time (nanoseconds) to execute remote requests and store new object versions in-cluster", - StrName: "remote_put_ns_total", - Labels: labels, - VarLabs: stats.BckXactVarlabs, - }, - ) - tr.RegExtMetric(snode, - b.metrics[stats.PutE2ELatencyTotal], - stats.KindTotal, - &stats.Extra{ - StrName: "remote_e2e_put_ns_total", - Help: "PUT: total end-to-end time (nanoseconds) servicing remote requests; " + - "includes: receiving PUT payload, storing it in-cluster, executing remote PUT, finalizing new in-cluster object", - Labels: labels, - VarLabs: stats.BckXactVarlabs, - }, - ) - tr.RegExtMetric(snode, - b.metrics[stats.PutSize], - stats.KindSize, - &stats.Extra{ - Help: "PUT: total cumulative size (bytes) of all PUTs to a given remote backend", - StrName: "remote_e2e_put_bytes_total", - Labels: labels, - VarLabs: stats.BckXactVarlabs, - }, - ) + if regExt { + tr.RegExtMetric(snode, + b.metrics[stats.PutCount], + stats.KindCounter, + &stats.Extra{ + Help: "PUT: total number of executed remote requests to a given backend", + StrName: "remote_put_count", + Labels: labels, + VarLabs: stats.BckXactVarlabs, + }, + ) + tr.RegExtMetric(snode, + b.metrics[stats.PutLatencyTotal], + stats.KindTotal, + &stats.Extra{ + Help: "PUT: total cumulative time (nanoseconds) to execute remote requests and store new object versions in-cluster", + StrName: "remote_put_ns_total", + Labels: labels, + VarLabs: stats.BckXactVarlabs, + }, + ) + tr.RegExtMetric(snode, + b.metrics[stats.PutE2ELatencyTotal], + stats.KindTotal, + &stats.Extra{ + StrName: "remote_e2e_put_ns_total", + Help: "PUT: total end-to-end time (nanoseconds) servicing remote requests; " + + "includes: receiving PUT payload, storing it in-cluster, executing remote PUT, finalizing new in-cluster object", + Labels: labels, + VarLabs: stats.BckXactVarlabs, + }, + ) + tr.RegExtMetric(snode, + b.metrics[stats.PutSize], + stats.KindSize, + &stats.Extra{ + Help: "PUT: total cumulative size (bytes) of all PUTs to a given remote backend", + StrName: "remote_e2e_put_bytes_total", + Labels: labels, + VarLabs: stats.BckXactVarlabs, + }, + ) + } // HEAD b.metrics[stats.HeadCount] = prefix + "." + stats.HeadCount b.metrics[stats.HeadLatencyTotal] = prefix + "." + stats.HeadLatencyTotal - tr.RegExtMetric(snode, - b.metrics[stats.HeadCount], - stats.KindCounter, - &stats.Extra{ - Help: "HEAD: total number of executed remote requests to a given backend", - StrName: "remote_head_count", - Labels: labels, - VarLabs: stats.BckVarlabs, - }, - ) - tr.RegExtMetric(snode, - b.metrics[stats.HeadLatencyTotal], - stats.KindTotal, - &stats.Extra{ - Help: "HEAD: total cumulative time (nanoseconds) to execute remote requests", - StrName: "remote_head_ns_total", - Labels: labels, - VarLabs: stats.BckVarlabs, - }, - ) + if regExt { + tr.RegExtMetric(snode, + b.metrics[stats.HeadCount], + stats.KindCounter, + &stats.Extra{ + Help: "HEAD: total number of executed remote requests to a given backend", + StrName: "remote_head_count", + Labels: labels, + VarLabs: stats.BckVarlabs, + }, + ) + tr.RegExtMetric(snode, + b.metrics[stats.HeadLatencyTotal], + stats.KindTotal, + &stats.Extra{ + Help: "HEAD: total cumulative time (nanoseconds) to execute remote requests", + StrName: "remote_head_ns_total", + Labels: labels, + VarLabs: stats.BckVarlabs, + }, + ) + } // version changed out-of-band b.metrics[stats.VerChangeCount] = prefix + "." + stats.VerChangeCount b.metrics[stats.VerChangeSize] = prefix + "." + stats.VerChangeSize - tr.RegExtMetric(snode, - b.metrics[stats.VerChangeCount], - stats.KindCounter, - &stats.Extra{ - Help: "number of out-of-band updates (by a 3rd party performing remote PUTs outside this cluster)", - StrName: "remote_ver_change_count", - Labels: labels, - VarLabs: stats.BckVarlabs, - }, - ) - tr.RegExtMetric(snode, - b.metrics[stats.VerChangeSize], - stats.KindSize, - &stats.Extra{ - Help: "total cumulative size of objects that were updated out-of-band", - StrName: "remote_ver_change_bytes_total", - Labels: labels, - VarLabs: stats.BckVarlabs, - }, - ) + if regExt { + tr.RegExtMetric(snode, + b.metrics[stats.VerChangeCount], + stats.KindCounter, + &stats.Extra{ + Help: "number of out-of-band updates (by a 3rd party performing remote PUTs outside this cluster)", + StrName: "remote_ver_change_count", + Labels: labels, + VarLabs: stats.BckVarlabs, + }, + ) + tr.RegExtMetric(snode, + b.metrics[stats.VerChangeSize], + stats.KindSize, + &stats.Extra{ + Help: "total cumulative size of objects that were updated out-of-band", + StrName: "remote_ver_change_bytes_total", + Labels: labels, + VarLabs: stats.BckVarlabs, + }, + ) + } } -func (b *base) Provider() string { return b.provider } -func (b *base) MetricName(name string) string { return b.metrics[name] } +func (b *base) Provider() string { return b.provider } + +func (b *base) MetricName(name string) string { + out, ok := b.metrics[name] + debug.Assert(ok && out != "", name) + return out +} func (b *base) CreateBucket(_ *meta.Bck) (int, error) { return http.StatusNotImplemented, cmn.NewErrUnsupp("create", b.provider+" bucket") diff --git a/ais/backend/gcp.go b/ais/backend/gcp.go index 219bf40bb6b..7a4890b6571 100644 --- a/ais/backend/gcp.go +++ b/ais/backend/gcp.go @@ -89,10 +89,9 @@ func NewGCP(t core.TargetPut, tstats stats.Tracker, startingUp bool) (_ core.Bac projectID: projectID, base: base{provider: apc.GCP}, } - if startingUp { - // register metrics only once - bp.base.init(t.Snode(), tstats) - } + // register metrics + bp.base.init(t.Snode(), tstats, startingUp) + gctx = context.Background() gcpClient, err = bp.createClient(gctx) diff --git a/ais/backend/ht.go b/ais/backend/ht.go index f04d482f18e..858c4cf5264 100644 --- a/ais/backend/ht.go +++ b/ais/backend/ht.go @@ -34,13 +34,13 @@ type ( // interface guard var _ core.Backend = (*htbp)(nil) -func NewHT(t core.TargetPut, config *cmn.Config, tstats stats.Tracker) (core.Backend, error) { +func NewHT(t core.TargetPut, config *cmn.Config, tstats stats.Tracker, startingUp bool) (core.Backend, error) { bp := &htbp{ t: t, base: base{provider: apc.HT}, } bp.cliH, bp.cliTLS = cmn.NewDefaultClients(config.Client.TimeoutLong.D()) - bp.init(t.Snode(), tstats) + bp.init(t.Snode(), tstats, startingUp) return bp, nil } diff --git a/ais/backend/mock_ht.go b/ais/backend/mock_ht.go index 18edc42c849..6ac2f0e3585 100644 --- a/ais/backend/mock_ht.go +++ b/ais/backend/mock_ht.go @@ -13,6 +13,6 @@ import ( "github.com/NVIDIA/aistore/stats" ) -func NewHT(core.TargetPut, *cmn.Config, stats.Tracker) (core.Backend, error) { +func NewHT(core.TargetPut, *cmn.Config, stats.Tracker, bool) (core.Backend, error) { return nil, &cmn.ErrInitBackend{Provider: apc.HT} } diff --git a/ais/backend/oci.go b/ais/backend/oci.go index 4f9755660be..d51798db898 100644 --- a/ais/backend/oci.go +++ b/ais/backend/oci.go @@ -74,6 +74,9 @@ type ocibp struct { base } +// interface guard +var _ core.Backend = (*ocibp)(nil) + func NewOCI(t core.TargetPut, tstats stats.Tracker, startingUp bool) (core.Backend, error) { bp := &ocibp{ t: t, @@ -128,10 +131,8 @@ func NewOCI(t core.TargetPut, tstats stats.Tracker, startingUp bool) (core.Backe } bp.namespace = *resp.Value - if startingUp { - // register metrics only once - bp.base.init(t.Snode(), tstats) - } + // register metrics + bp.base.init(t.Snode(), tstats, startingUp) return bp, nil } diff --git a/ais/target.go b/ais/target.go index a41220233e1..87a7838e505 100644 --- a/ais/target.go +++ b/ais/target.go @@ -103,7 +103,7 @@ func (*target) interruptedRestarted() (i, r bool) { func (t *target) initBackends(tstats *stats.Trunner) { config := cmn.GCO.Get() - aisbp := backend.NewAIS(t, tstats) + aisbp := backend.NewAIS(t, tstats, true) t.backend[apc.AIS] = aisbp // always present if aisConf := config.Backend.Get(apc.AIS); aisConf != nil { @@ -139,7 +139,7 @@ func (t *target) initBuiltTagged(tstats *stats.Trunner, config *cmn.Config, star case apc.OCI: add, err = backend.NewOCI(t, tstats, startingUp) case apc.HT: - add, err = backend.NewHT(t, config, tstats) + add, err = backend.NewHT(t, config, tstats, startingUp) case apc.AIS: continue default: diff --git a/ais/tgtcp.go b/ais/tgtcp.go index 51df8e43a70..ce04a4da32d 100644 --- a/ais/tgtcp.go +++ b/ais/tgtcp.go @@ -1217,7 +1217,7 @@ func (t *target) receiveConfig(newConfig *globalConfig, msg *actMsgExt, payload if aisConf := newConfig.Backend.Get(apc.AIS); aisConf != nil { err = t.attachDetachRemAis(newConfig, msg) } else { - t.backend[apc.AIS] = backend.NewAIS(t, t.statsT) + t.backend[apc.AIS] = backend.NewAIS(t, t.statsT, false) } } return diff --git a/stats/common.go b/stats/common.go index 0c91fdba6d9..58004a5a648 100644 --- a/stats/common.go +++ b/stats/common.go @@ -596,7 +596,7 @@ func (r *runner) ResetStats(errorsOnly bool) { } func (r *runner) GetMetricNames() cos.StrKVs { - out := make(cos.StrKVs, 32) + out := make(cos.StrKVs, 48) for name, v := range r.core.Tracker { out[name] = v.kind }