Skip to content

Commit

Permalink
high number of goroutines: revise/amend; add node-state alert
Browse files Browse the repository at this point in the history
Signed-off-by: Alex Aizman <[email protected]>
  • Loading branch information
alex-aizman committed Aug 21, 2024
1 parent 49df738 commit 7324344
Show file tree
Hide file tree
Showing 7 changed files with 51 additions and 40 deletions.
2 changes: 1 addition & 1 deletion ais/htrun.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ func (h *htrun) ByteMM() *memsys.MMSA { return h.smm }
// smapUpdatedCB reacts to a change in this node's flags, given the new (nfl)
// and old (ofl) values: when ofl had any of the meta.SnodeMaintDecomm bits set
// and nfl has none of them, it clears cos.MaintenanceMode from the node-state
// flags and sends kaResumeMsg to keepalive.
// NOTE: currently, only 'resume' (see also: kaSuspendMsg)
func (h *htrun) smapUpdatedCB(_, _ *smapX, nfl, ofl cos.BitFlags) {
if ofl.IsAnySet(meta.SnodeMaintDecomm) && !nfl.IsAnySet(meta.SnodeMaintDecomm) {
// NOTE(review): the next two calls appear to be the removed and the added form
// of the same "clear MaintenanceMode" step, shown together by the diff view
// (this file is listed with 1 addition & 1 deletion).
h.statsT.SetClrFlag(stats.NodeStateFlags, 0, cos.MaintenanceMode)
h.statsT.ClrFlag(stats.NodeStateFlags, cos.MaintenanceMode)
h.keepalive.ctrl(kaResumeMsg)
}
}
Expand Down
8 changes: 6 additions & 2 deletions cmn/cos/node_state_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ const (
LowCapacity // (used > high); warning: OOS possible soon..
LowMemory // ditto OOM
DiskFault // red
NoMountpaths // red
NoMountpaths // red (TODO: reserved, not used)
NumGoroutines // red
)

func (f NodeStateFlags) IsOK() bool { return f == NodeStarted|ClusterStarted }
Expand Down Expand Up @@ -94,9 +95,12 @@ func (f NodeStateFlags) String() string {
if f&DiskFault == DiskFault {
sb.WriteString("disk-fault,")
}
if f&DiskFault == NoMountpaths {
if f&NoMountpaths == NoMountpaths {
sb.WriteString("no-mountpaths,")
}
if f&NumGoroutines == NumGoroutines {
sb.WriteString("high-number-of-goroutines,")
}
s := sb.String()
if s == "" {
err := fmt.Errorf("unknown flag %b", int64(f))
Expand Down
1 change: 1 addition & 0 deletions cmn/cos/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ type (
Inc(name string)
Add(name string, val int64)
SetFlag(name string, set NodeStateFlags)
ClrFlag(name string, clr NodeStateFlags)
SetClrFlag(name string, set, clr NodeStateFlags)
Get(name string) int64
AddMany(namedVal64 ...NamedVal64)
Expand Down
1 change: 1 addition & 0 deletions core/mock/stats_mock.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ func (*StatsTracker) IncErr(string)
func (*StatsTracker) Inc(string) {}
func (*StatsTracker) Add(string, int64) {}
func (*StatsTracker) SetFlag(string, cos.NodeStateFlags) {}
func (*StatsTracker) ClrFlag(string, cos.NodeStateFlags) {}
func (*StatsTracker) SetClrFlag(string, cos.NodeStateFlags, cos.NodeStateFlags) {}
func (*StatsTracker) AddMany(...cos.NamedVal64) {}
func (*StatsTracker) RegExtMetric(*meta.Snode, string, string, *stats.Extra) {}
Expand Down
2 changes: 1 addition & 1 deletion reb/globrun.go
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ func (reb *Reb) RunRebalance(smap *meta.Smap, id int64, notif *xact.NotifXact, t
}

reb.fini(rargs, logHdr, err)
tstats.SetClrFlag(stats.NodeStateFlags, 0, cos.Rebalancing)
tstats.ClrFlag(stats.NodeStateFlags, cos.Rebalancing)

offGFN()
}
Expand Down
76 changes: 40 additions & 36 deletions stats/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,12 @@ const (
const (
maxLogSizeCheckTime = time.Hour // periodically check the logs for max accumulated size
startupSleep = 300 * time.Millisecond // periodically poll ClusterStarted()
numGorHighCheckTime = 10 * time.Minute // periodically log a warning if the number of goroutines remains high
)

// number-of-goroutines watermarks expressed as multipliers over the number of CPUs (GOMAXPROCS)
const (
numGorHigh = 100
numGorExtreme = 1000
ngrHighTime = 10 * time.Minute // log a warning if the number of goroutines remains high
ngrExtremeTime = time.Minute // (when extreme)
lshiftGorHigh = 5 // max num expressed as left shift on the num CPUs: (number-of-CPUs * 2^^lshift...)
)

// [naming convention] error counter prefixes
Expand Down Expand Up @@ -316,20 +315,27 @@ func (r *runner) AddMany(nvs ...cos.NamedVal64) {

// SetFlag ORs the bits given in `set` into the value of the tracker entry `name`.
//
// NOTE(review): this span appears to show both sides of the diff - the first
// oval/nval/Store triplet works in cos.BitFlags, the second in plain int64.
// NOTE(review): the load and the store are two separate operations; presumably
// a concurrent update that lands between them would be overwritten - confirm
// whether callers can race.
func (r *runner) SetFlag(name string, set cos.NodeStateFlags) {
v := r.core.Tracker[name]
oval := cos.BitFlags(ratomic.LoadInt64(&v.Value))
nval := oval | cos.BitFlags(set)
ratomic.StoreInt64(&v.Value, int64(nval))
oval := ratomic.LoadInt64(&v.Value)
nval := oval | int64(set)
ratomic.StoreInt64(&v.Value, nval)
}

// ClrFlag clears the bits given in clr from the value of the tracker entry name.
//
// The update is performed as a compare-and-swap retry loop rather than a plain
// load followed by a store, so that a concurrent change to the same value made
// between the read and the write is not silently overwritten.
func (r *runner) ClrFlag(name string, clr cos.NodeStateFlags) {
	v := r.core.Tracker[name]
	for {
		oval := ratomic.LoadInt64(&v.Value)
		nval := oval &^ int64(clr)
		// nothing to clear, or we won the swap: done; otherwise the value
		// changed under us - reload and retry
		if nval == oval || ratomic.CompareAndSwapInt64(&v.Value, oval, nval) {
			return
		}
	}
}

// SetClrFlag updates the value of the tracker entry `name` in one call:
// it ORs in the `set` bits and then removes the `clr` bits.
//
// If the state is OK (see NodeStateFlags.IsOK) both before and after applying
// `set`, the function returns early and stores nothing - `clr` is not applied.
//
// NOTE(review): this span appears to show both sides of the diff - the
// cos.BitFlags-typed lines and their int64-typed replacements are interleaved.
func (r *runner) SetClrFlag(name string, set, clr cos.NodeStateFlags) {
v := r.core.Tracker[name]
oval := cos.BitFlags(ratomic.LoadInt64(&v.Value))
nval := oval | cos.BitFlags(set)
oval := ratomic.LoadInt64(&v.Value)
nval := oval | int64(set)
// already OK and staying OK: nothing to record
if cos.NodeStateFlags(nval).IsOK() && cos.NodeStateFlags(oval).IsOK() {
return
}
nval &^= cos.BitFlags(clr)
ratomic.StoreInt64(&v.Value, int64(nval))
nval &^= int64(clr)
ratomic.StoreInt64(&v.Value, nval)
}

func (r *runner) Name() string { return r.name }
Expand Down Expand Up @@ -414,7 +420,7 @@ waitStartup:
r.core.initStatsdOrProm(r.node.Snode(), r)

var (
checkNumGorHigh int64
lastNgr int64
startTime = mono.NanoTime() // uptime henceforth
lastDateTimestamp = startTime
)
Expand All @@ -424,7 +430,7 @@ waitStartup:
now := mono.NanoTime()
config = cmn.GCO.Get()
logger.log(now, time.Duration(now-startTime) /*uptime*/, config)
checkNumGorHigh = _checkGor(now, checkNumGorHigh, goMaxProcs)
lastNgr = r.checkNgr(now, lastNgr, goMaxProcs)

if statsTime != config.Periodic.StatsTime.D() {
statsTime = config.Periodic.StatsTime.D()
Expand Down Expand Up @@ -502,6 +508,27 @@ func (r *runner) GetMetricNames() cos.StrKVs {
return out
}

// checkNgr samples the current number of goroutines and keeps the
// cos.NumGoroutines node-state flag in sync with it.
//
// The threshold is goMaxProcs << lshiftGorHigh:
//   - below the threshold: clear the flag (only if lastNgr is non-zero) and return 0;
//   - at or above it with lastNgr == 0: set the flag and log a warning;
//   - at or above it with lastNgr != 0: log again once ngrHighTime has passed since
//     lastNgr, or once ngrExtremeTime has passed when the count exceeds twice
//     the threshold.
//
// now is the caller-provided current timestamp and lastNgr is the value this
// function returned on the previous call. Returns the updated lastNgr.
func (r *runner) checkNgr(now, lastNgr int64, goMaxProcs int) int64 {
	var (
		count = runtime.NumGoroutine()
		high  = goMaxProcs << lshiftGorHigh
	)
	if count < high {
		if lastNgr != 0 {
			r.ClrFlag(NodeStateFlags, cos.NumGoroutines)
		}
		return 0
	}

	if lastNgr == 0 {
		// first time at or above the threshold
		r.SetFlag(NodeStateFlags, cos.NumGoroutines)
		lastNgr = now
	} else {
		var (
			elapsed = time.Duration(now - lastNgr)
			extreme = count > high<<1
		)
		if elapsed >= ngrHighTime || (extreme && elapsed >= ngrExtremeTime) {
			lastNgr = now
		}
	}

	// lastNgr was (re)stamped above: time to warn
	if lastNgr == now {
		nlog.Warningln("High number of goroutines:", count)
	}
	return lastNgr
}

////////////////
// statsValue //
////////////////
Expand Down Expand Up @@ -687,26 +714,3 @@ func roundMBs(val int64) (mbs float64) {
mbs = float64(num) / 100
return
}

// _checkGor samples the current number of goroutines and logs when it is high.
//
// Below goMaxProcs*numGorHigh it returns 0. At or above goMaxProcs*numGorExtreme
// it logs an error on every call. Otherwise: when checkNumGorHigh is 0 the
// function returns now (starting the interval); once more than
// numGorHighCheckTime has passed since checkNumGorHigh it logs a warning
// (unless the extreme error was already logged) and returns 0; in between, it
// returns checkNumGorHigh unchanged.
func _checkGor(now, checkNumGorHigh int64, goMaxProcs int) int64 {
	ngr := runtime.NumGoroutine()
	if ngr < goMaxProcs*numGorHigh {
		return 0
	}

	extreme := ngr >= goMaxProcs*numGorExtreme
	if extreme {
		nlog.Errorf("Extremely high number of goroutines: %d", ngr)
	}

	switch {
	case checkNumGorHigh == 0:
		// interval starts now
		return now
	case time.Duration(now-checkNumGorHigh) <= numGorHighCheckTime:
		// still within the interval
		return checkNumGorHigh
	}

	// interval expired: warn (once) and reset
	if !extreme {
		nlog.Warningf("High number of goroutines: %d", ngr)
	}
	return 0
}
1 change: 1 addition & 0 deletions transport/obj_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ func (*dummyStatsTracker) Add(string, int64)
func (*dummyStatsTracker) Inc(string) {}
func (*dummyStatsTracker) Get(string) int64 { return 0 }
func (*dummyStatsTracker) AddMany(...cos.NamedVal64) {}
func (*dummyStatsTracker) ClrFlag(string, cos.NodeStateFlags) {}
func (*dummyStatsTracker) SetFlag(string, cos.NodeStateFlags) {}
func (*dummyStatsTracker) SetClrFlag(string, cos.NodeStateFlags, cos.NodeStateFlags) {}

Expand Down

0 comments on commit 7324344

Please sign in to comment.