Skip to content

Commit

Permalink
observability: add prometheus variable labels; remove collector
Browse files Browse the repository at this point in the history
* amend "re-initializing backends"
* part four, prev. commit: d2ceca3

Signed-off-by: Alex Aizman <[email protected]>
  • Loading branch information
alex-aizman committed Dec 25, 2024
1 parent d2ceca3 commit 9290dc5
Show file tree
Hide file tree
Showing 11 changed files with 179 additions and 156 deletions.
4 changes: 2 additions & 2 deletions ais/backend/ais.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ var (
preg, treg *regexp.Regexp
)

func NewAIS(t core.TargetPut, tstats stats.Tracker) *AISbp {
func NewAIS(t core.TargetPut, tstats stats.Tracker, startingUp bool) *AISbp {
suff := regexp.QuoteMeta(meta.SnameSuffix)
preg = regexp.MustCompile(regexp.QuoteMeta(meta.PnamePrefix) + `\S*` + suff + ": ")
treg = regexp.MustCompile(regexp.QuoteMeta(meta.TnamePrefix) + `\S*` + suff + ": ")
Expand All @@ -74,7 +74,7 @@ func NewAIS(t core.TargetPut, tstats stats.Tracker) *AISbp {
alias: make(cos.StrKVs),
base: base{provider: apc.AIS},
}
bp.base.init(t.Snode(), tstats)
bp.base.init(t.Snode(), tstats, startingUp)
return bp
}

Expand Down
11 changes: 4 additions & 7 deletions ais/backend/aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,10 @@ func NewAWS(t core.TargetPut, tstats stats.Tracker, startingUp bool) (core.Backe
mm: t.PageMM(),
base: base{provider: apc.AWS},
}
if startingUp {
// register metrics only once
bp.base.init(t.Snode(), tstats)
} else {
// reset clients map to recreate and reload credentials
clients.Clear()
}
// register metrics
bp.base.init(t.Snode(), tstats, startingUp)
// reset clients map
clients.Clear()
return bp, nil
}

Expand Down
7 changes: 3 additions & 4 deletions ais/backend/azure.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,10 +112,9 @@ func NewAzure(t core.TargetPut, tstats stats.Tracker, startingUp bool) (core.Bac
u: blurl,
base: base{provider: apc.Azure},
}
if startingUp {
// register metrics only once
bp.base.init(t.Snode(), tstats)
}
// register metrics
bp.base.init(t.Snode(), tstats, startingUp)

return bp, nil
}

Expand Down
283 changes: 155 additions & 128 deletions ais/backend/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,176 +12,203 @@ import (
"github.com/NVIDIA/aistore/cmn"
"github.com/NVIDIA/aistore/cmn/cos"
"github.com/NVIDIA/aistore/cmn/debug"
"github.com/NVIDIA/aistore/cmn/nlog"
"github.com/NVIDIA/aistore/core"
"github.com/NVIDIA/aistore/core/meta"
"github.com/NVIDIA/aistore/fs"
"github.com/NVIDIA/aistore/stats"
)

const numBackendMetricks = 12

// base holds the state shared by the backend providers constructed in this
// package: the provider name and the metric names that init derives from it.
type base struct {
metrics cos.StrKVs // this backend's metric names: generic stats name => provider-prefixed name (populated by init)
provider string // backend provider (e.g. apc.AWS); init uses it as the metric-name prefix, substituting apc.RemAIS for apc.AIS
}

// NOTE: `stats.LatencyToCounter()` - a public helper that relies on the naming convention below
func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
prefix := b.provider
func (b *base) init(snode *meta.Snode, tr stats.Tracker, startingUp bool) {
var (
prefix = b.provider
regExt = true
)
if prefix == apc.AIS {
prefix = apc.RemAIS
}

if !startingUp {
// re-initializing or enabling at runtime
all := tr.GetMetricNames()
if _, ok := all[prefix+"."+stats.GetCount]; ok {
nlog.Infoln(prefix, "backend metrics already reg-ed")
regExt = false
}
}

labels := cos.StrKVs{"backend": prefix}
b.metrics = make(map[string]string, 12)
b.metrics = make(map[string]string, numBackendMetricks)

// GET
b.metrics[stats.GetCount] = prefix + "." + stats.GetCount
b.metrics[stats.GetLatencyTotal] = prefix + "." + stats.GetLatencyTotal
b.metrics[stats.GetE2ELatencyTotal] = prefix + "." + stats.GetE2ELatencyTotal
b.metrics[stats.GetSize] = prefix + "." + stats.GetSize

tr.RegExtMetric(snode,
b.metrics[stats.GetCount],
stats.KindCounter,
&stats.Extra{
Help: "GET: total number of executed remote requests (cold GETs)",
StrName: "remote_get_count",
Labels: labels,
VarLabs: stats.BckVarlabs,
},
)
tr.RegExtMetric(snode,
b.metrics[stats.GetLatencyTotal],
stats.KindTotal,
&stats.Extra{
Help: "GET: total cumulative time (nanoseconds) to execute cold GETs and store new object versions in-cluster",
StrName: "remote_get_ns_total",
Labels: labels,
VarLabs: stats.BckVarlabs,
},
)
tr.RegExtMetric(snode,
b.metrics[stats.GetE2ELatencyTotal],
stats.KindTotal,
&stats.Extra{
Help: "GET: total end-to-end time (nanoseconds) servicing remote requests; " +
"includes: receiving request, executing cold-GET, storing new object version in-cluster, and transmitting response",
StrName: "remote_e2e_get_ns_total",
Labels: labels,
VarLabs: stats.BckVarlabs,
},
)
tr.RegExtMetric(snode,
b.metrics[stats.GetSize],
stats.KindSize,
&stats.Extra{
Help: "GET: total cumulative size (bytes) of all cold-GET transactions",
StrName: "remote_get_bytes_total",
Labels: labels,
VarLabs: stats.BckVarlabs,
},
)
if regExt {
tr.RegExtMetric(snode,
b.metrics[stats.GetCount],
stats.KindCounter,
&stats.Extra{
Help: "GET: total number of executed remote requests (cold GETs)",
StrName: "remote_get_count",
Labels: labels,
VarLabs: stats.BckVarlabs,
},
)
tr.RegExtMetric(snode,
b.metrics[stats.GetLatencyTotal],
stats.KindTotal,
&stats.Extra{
Help: "GET: total cumulative time (nanoseconds) to execute cold GETs and store new object versions in-cluster",
StrName: "remote_get_ns_total",
Labels: labels,
VarLabs: stats.BckVarlabs,
},
)
tr.RegExtMetric(snode,
b.metrics[stats.GetE2ELatencyTotal],
stats.KindTotal,
&stats.Extra{
Help: "GET: total end-to-end time (nanoseconds) servicing remote requests; " +
"includes: receiving request, executing cold-GET, storing new object version in-cluster, and transmitting response",
StrName: "remote_e2e_get_ns_total",
Labels: labels,
VarLabs: stats.BckVarlabs,
},
)
tr.RegExtMetric(snode,
b.metrics[stats.GetSize],
stats.KindSize,
&stats.Extra{
Help: "GET: total cumulative size (bytes) of all cold-GET transactions",
StrName: "remote_get_bytes_total",
Labels: labels,
VarLabs: stats.BckVarlabs,
},
)
}

// PUT
b.metrics[stats.PutCount] = prefix + "." + stats.PutCount
b.metrics[stats.PutLatencyTotal] = prefix + "." + stats.PutLatencyTotal
b.metrics[stats.PutE2ELatencyTotal] = prefix + "." + stats.PutE2ELatencyTotal
b.metrics[stats.PutSize] = prefix + "." + stats.PutSize

tr.RegExtMetric(snode,
b.metrics[stats.PutCount],
stats.KindCounter,
&stats.Extra{
Help: "PUT: total number of executed remote requests to a given backend",
StrName: "remote_put_count",
Labels: labels,
VarLabs: stats.BckXactVarlabs,
},
)
tr.RegExtMetric(snode,
b.metrics[stats.PutLatencyTotal],
stats.KindTotal,
&stats.Extra{
Help: "PUT: total cumulative time (nanoseconds) to execute remote requests and store new object versions in-cluster",
StrName: "remote_put_ns_total",
Labels: labels,
VarLabs: stats.BckXactVarlabs,
},
)
tr.RegExtMetric(snode,
b.metrics[stats.PutE2ELatencyTotal],
stats.KindTotal,
&stats.Extra{
StrName: "remote_e2e_put_ns_total",
Help: "PUT: total end-to-end time (nanoseconds) servicing remote requests; " +
"includes: receiving PUT payload, storing it in-cluster, executing remote PUT, finalizing new in-cluster object",
Labels: labels,
VarLabs: stats.BckXactVarlabs,
},
)
tr.RegExtMetric(snode,
b.metrics[stats.PutSize],
stats.KindSize,
&stats.Extra{
Help: "PUT: total cumulative size (bytes) of all PUTs to a given remote backend",
StrName: "remote_e2e_put_bytes_total",
Labels: labels,
VarLabs: stats.BckXactVarlabs,
},
)
if regExt {
tr.RegExtMetric(snode,
b.metrics[stats.PutCount],
stats.KindCounter,
&stats.Extra{
Help: "PUT: total number of executed remote requests to a given backend",
StrName: "remote_put_count",
Labels: labels,
VarLabs: stats.BckXactVarlabs,
},
)
tr.RegExtMetric(snode,
b.metrics[stats.PutLatencyTotal],
stats.KindTotal,
&stats.Extra{
Help: "PUT: total cumulative time (nanoseconds) to execute remote requests and store new object versions in-cluster",
StrName: "remote_put_ns_total",
Labels: labels,
VarLabs: stats.BckXactVarlabs,
},
)
tr.RegExtMetric(snode,
b.metrics[stats.PutE2ELatencyTotal],
stats.KindTotal,
&stats.Extra{
StrName: "remote_e2e_put_ns_total",
Help: "PUT: total end-to-end time (nanoseconds) servicing remote requests; " +
"includes: receiving PUT payload, storing it in-cluster, executing remote PUT, finalizing new in-cluster object",
Labels: labels,
VarLabs: stats.BckXactVarlabs,
},
)
tr.RegExtMetric(snode,
b.metrics[stats.PutSize],
stats.KindSize,
&stats.Extra{
Help: "PUT: total cumulative size (bytes) of all PUTs to a given remote backend",
StrName: "remote_e2e_put_bytes_total",
Labels: labels,
VarLabs: stats.BckXactVarlabs,
},
)
}

// HEAD
b.metrics[stats.HeadCount] = prefix + "." + stats.HeadCount
b.metrics[stats.HeadLatencyTotal] = prefix + "." + stats.HeadLatencyTotal

tr.RegExtMetric(snode,
b.metrics[stats.HeadCount],
stats.KindCounter,
&stats.Extra{
Help: "HEAD: total number of executed remote requests to a given backend",
StrName: "remote_head_count",
Labels: labels,
VarLabs: stats.BckVarlabs,
},
)
tr.RegExtMetric(snode,
b.metrics[stats.HeadLatencyTotal],
stats.KindTotal,
&stats.Extra{
Help: "HEAD: total cumulative time (nanoseconds) to execute remote requests",
StrName: "remote_head_ns_total",
Labels: labels,
VarLabs: stats.BckVarlabs,
},
)
if regExt {
tr.RegExtMetric(snode,
b.metrics[stats.HeadCount],
stats.KindCounter,
&stats.Extra{
Help: "HEAD: total number of executed remote requests to a given backend",
StrName: "remote_head_count",
Labels: labels,
VarLabs: stats.BckVarlabs,
},
)
tr.RegExtMetric(snode,
b.metrics[stats.HeadLatencyTotal],
stats.KindTotal,
&stats.Extra{
Help: "HEAD: total cumulative time (nanoseconds) to execute remote requests",
StrName: "remote_head_ns_total",
Labels: labels,
VarLabs: stats.BckVarlabs,
},
)
}

// version changed out-of-band
b.metrics[stats.VerChangeCount] = prefix + "." + stats.VerChangeCount
b.metrics[stats.VerChangeSize] = prefix + "." + stats.VerChangeSize

tr.RegExtMetric(snode,
b.metrics[stats.VerChangeCount],
stats.KindCounter,
&stats.Extra{
Help: "number of out-of-band updates (by a 3rd party performing remote PUTs outside this cluster)",
StrName: "remote_ver_change_count",
Labels: labels,
VarLabs: stats.BckVarlabs,
},
)
tr.RegExtMetric(snode,
b.metrics[stats.VerChangeSize],
stats.KindSize,
&stats.Extra{
Help: "total cumulative size of objects that were updated out-of-band",
StrName: "remote_ver_change_bytes_total",
Labels: labels,
VarLabs: stats.BckVarlabs,
},
)
if regExt {
tr.RegExtMetric(snode,
b.metrics[stats.VerChangeCount],
stats.KindCounter,
&stats.Extra{
Help: "number of out-of-band updates (by a 3rd party performing remote PUTs outside this cluster)",
StrName: "remote_ver_change_count",
Labels: labels,
VarLabs: stats.BckVarlabs,
},
)
tr.RegExtMetric(snode,
b.metrics[stats.VerChangeSize],
stats.KindSize,
&stats.Extra{
Help: "total cumulative size of objects that were updated out-of-band",
StrName: "remote_ver_change_bytes_total",
Labels: labels,
VarLabs: stats.BckVarlabs,
},
)
}
}

func (b *base) Provider() string { return b.provider }
func (b *base) MetricName(name string) string { return b.metrics[name] }
// Provider reports the backend provider name stored in this base.
func (b *base) Provider() string {
	return b.provider
}

// MetricName maps a generic stats metric name to this backend's own
// (provider-prefixed) metric name, as recorded in b.metrics by init.
//
// The lookup result is checked via debug.Assert: a name that is missing from
// the map, or mapped to an empty string, fails the assertion (with the
// requested name as the assertion argument). The looked-up value is then
// returned as-is.
// NOTE(review): presumably debug.Assert is a no-op in non-debug builds, in
// which case an unknown name yields "" - confirm against cmn/debug.
func (b *base) MetricName(name string) string {
	fullName, found := b.metrics[name]
	debug.Assert(found && fullName != "", name)
	return fullName
}

func (b *base) CreateBucket(_ *meta.Bck) (int, error) {
return http.StatusNotImplemented, cmn.NewErrUnsupp("create", b.provider+" bucket")
Expand Down
7 changes: 3 additions & 4 deletions ais/backend/gcp.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,9 @@ func NewGCP(t core.TargetPut, tstats stats.Tracker, startingUp bool) (_ core.Bac
projectID: projectID,
base: base{provider: apc.GCP},
}
if startingUp {
// register metrics only once
bp.base.init(t.Snode(), tstats)
}
// register metrics
bp.base.init(t.Snode(), tstats, startingUp)

gctx = context.Background()
gcpClient, err = bp.createClient(gctx)

Expand Down
Loading

0 comments on commit 9290dc5

Please sign in to comment.