diff --git a/ais/htrun.go b/ais/htrun.go index 6ad836491cf..7be3ce8bc8e 100644 --- a/ais/htrun.go +++ b/ais/htrun.go @@ -103,7 +103,7 @@ func (h *htrun) ByteMM() *memsys.MMSA { return h.smm } // NOTE: currently, only 'resume' (see also: kaSuspendMsg) func (h *htrun) smapUpdatedCB(_, _ *smapX, nfl, ofl cos.BitFlags) { if ofl.IsAnySet(meta.SnodeMaintDecomm) && !nfl.IsAnySet(meta.SnodeMaintDecomm) { - h.statsT.SetClrFlag(stats.NodeStateFlags, 0, cos.MaintenanceMode) + h.statsT.ClrFlag(stats.NodeStateFlags, cos.MaintenanceMode) h.keepalive.ctrl(kaResumeMsg) } } diff --git a/cmn/cos/node_state_info.go b/cmn/cos/node_state_info.go index a2d220b5df1..b73e462156e 100644 --- a/cmn/cos/node_state_info.go +++ b/cmn/cos/node_state_info.go @@ -29,7 +29,8 @@ const ( LowCapacity // (used > high); warning: OOS possible soon.. LowMemory // ditto OOM DiskFault // red - NoMountpaths // red + NoMountpaths // red (TODO: reserved, not used) + NumGoroutines // red ) func (f NodeStateFlags) IsOK() bool { return f == NodeStarted|ClusterStarted } @@ -94,9 +95,12 @@ func (f NodeStateFlags) String() string { if f&DiskFault == DiskFault { sb.WriteString("disk-fault,") } - if f&DiskFault == NoMountpaths { + if f&NoMountpaths == NoMountpaths { sb.WriteString("no-mountpaths,") } + if f&NumGoroutines == NumGoroutines { + sb.WriteString("high-number-of-goroutines,") + } s := sb.String() if s == "" { err := fmt.Errorf("unknown flag %b", int64(f)) diff --git a/cmn/cos/stats.go b/cmn/cos/stats.go index 29d56feec8b..81893c3765f 100644 --- a/cmn/cos/stats.go +++ b/cmn/cos/stats.go @@ -19,6 +19,7 @@ type ( Inc(name string) Add(name string, val int64) SetFlag(name string, set NodeStateFlags) + ClrFlag(name string, clr NodeStateFlags) SetClrFlag(name string, set, clr NodeStateFlags) Get(name string) int64 AddMany(namedVal64 ...NamedVal64) diff --git a/core/mock/stats_mock.go b/core/mock/stats_mock.go index e8aaa20259a..79c0b0ceeba 100644 --- a/core/mock/stats_mock.go +++ b/core/mock/stats_mock.go @@ -27,6 +27,7 @@ func (*StatsTracker) IncErr(string) func (*StatsTracker) Inc(string) {} func (*StatsTracker) Add(string, int64) {} func (*StatsTracker) SetFlag(string, cos.NodeStateFlags) {} +func (*StatsTracker) ClrFlag(string, cos.NodeStateFlags) {} func (*StatsTracker) SetClrFlag(string, cos.NodeStateFlags, cos.NodeStateFlags) {} func (*StatsTracker) AddMany(...cos.NamedVal64) {} func (*StatsTracker) RegExtMetric(*meta.Snode, string, string, *stats.Extra) {} diff --git a/reb/globrun.go b/reb/globrun.go index 34aa8c28b8b..067c762940d 100644 --- a/reb/globrun.go +++ b/reb/globrun.go @@ -240,7 +240,7 @@ func (reb *Reb) RunRebalance(smap *meta.Smap, id int64, notif *xact.NotifXact, t } reb.fini(rargs, logHdr, err) - tstats.SetClrFlag(stats.NodeStateFlags, 0, cos.Rebalancing) + tstats.ClrFlag(stats.NodeStateFlags, cos.Rebalancing) offGFN() } diff --git a/stats/common.go b/stats/common.go index d9e73e35cad..e2504cfc8eb 100644 --- a/stats/common.go +++ b/stats/common.go @@ -48,13 +48,12 @@ const ( const ( maxLogSizeCheckTime = time.Hour // periodically check the logs for max accumulated size startupSleep = 300 * time.Millisecond // periodically poll ClusterStarted() - numGorHighCheckTime = 10 * time.Minute // periodically log a warning if the number of goroutines remains high ) -// number-of-goroutines watermarks expressed as multipliers over the number of CPUs (GOMAXPROCS) const ( - numGorHigh = 100 - numGorExtreme = 1000 + ngrHighTime = 10 * time.Minute // log a warning if the number of goroutines remains high + ngrExtremeTime = time.Minute // (when extreme) + lshiftGorHigh = 5 // max num expressed as left shift on the num CPUs: (number-of-CPUs * 2^^lshift...) ) // [naming convention] error counter prefixes @@ -316,20 +315,27 @@ func (r *runner) AddMany(nvs ...cos.NamedVal64) { func (r *runner) SetFlag(name string, set cos.NodeStateFlags) { v := r.core.Tracker[name] - oval := cos.BitFlags(ratomic.LoadInt64(&v.Value)) - nval := oval | cos.BitFlags(set) - ratomic.StoreInt64(&v.Value, int64(nval)) + oval := ratomic.LoadInt64(&v.Value) + nval := oval | int64(set) + ratomic.StoreInt64(&v.Value, nval) +} + +func (r *runner) ClrFlag(name string, clr cos.NodeStateFlags) { + v := r.core.Tracker[name] + oval := ratomic.LoadInt64(&v.Value) + nval := oval &^ int64(clr) + ratomic.StoreInt64(&v.Value, nval) } func (r *runner) SetClrFlag(name string, set, clr cos.NodeStateFlags) { v := r.core.Tracker[name] - oval := cos.BitFlags(ratomic.LoadInt64(&v.Value)) - nval := oval | cos.BitFlags(set) + oval := ratomic.LoadInt64(&v.Value) + nval := oval | int64(set) if cos.NodeStateFlags(nval).IsOK() && cos.NodeStateFlags(oval).IsOK() { return } - nval &^= cos.BitFlags(clr) - ratomic.StoreInt64(&v.Value, int64(nval)) + nval &^= int64(clr) + ratomic.StoreInt64(&v.Value, nval) } func (r *runner) Name() string { return r.name } @@ -414,7 +420,7 @@ waitStartup: r.core.initStatsdOrProm(r.node.Snode(), r) var ( - checkNumGorHigh int64 + lastNgr int64 startTime = mono.NanoTime() // uptime henceforth lastDateTimestamp = startTime ) @@ -424,7 +430,7 @@ waitStartup: now := mono.NanoTime() config = cmn.GCO.Get() logger.log(now, time.Duration(now-startTime) /*uptime*/, config) - checkNumGorHigh = _checkGor(now, checkNumGorHigh, goMaxProcs) + lastNgr = r.checkNgr(now, lastNgr, goMaxProcs) if statsTime != config.Periodic.StatsTime.D() { statsTime = config.Periodic.StatsTime.D() @@ -502,6 +508,27 @@ func (r *runner) GetMetricNames() cos.StrKVs { return out } +func (r *runner) checkNgr(now, lastNgr int64, goMaxProcs int) int64 { + lim := goMaxProcs << lshiftGorHigh + ngr := runtime.NumGoroutine() + if ngr < lim { + if lastNgr != 0 { + r.ClrFlag(NodeStateFlags, cos.NumGoroutines) + } + return 0 + } + if lastNgr == 0 { + r.SetFlag(NodeStateFlags, cos.NumGoroutines) + lastNgr = now + } else if d := time.Duration(now - lastNgr); (d >= ngrHighTime) || (ngr > lim<<1 && d >= ngrExtremeTime) { + lastNgr = now + } + if lastNgr == now { + nlog.Warningln("High number of goroutines:", ngr) + } + return lastNgr +} + //////////////// // statsValue // //////////////// @@ -687,26 +714,3 @@ func roundMBs(val int64) (mbs float64) { mbs = float64(num) / 100 return } - -func _checkGor(now, checkNumGorHigh int64, goMaxProcs int) int64 { - var ( - ngr = runtime.NumGoroutine() - extreme bool - ) - if ngr < goMaxProcs*numGorHigh { - return 0 - } - if ngr >= goMaxProcs*numGorExtreme { - extreme = true - nlog.Errorf("Extremely high number of goroutines: %d", ngr) - } - if checkNumGorHigh == 0 { - checkNumGorHigh = now - } else if time.Duration(now-checkNumGorHigh) > numGorHighCheckTime { - if !extreme { - nlog.Warningf("High number of goroutines: %d", ngr) - } - checkNumGorHigh = 0 - } - return checkNumGorHigh -} diff --git a/transport/obj_test.go b/transport/obj_test.go index 1c4d5d6caea..3f6952ea626 100644 --- a/transport/obj_test.go +++ b/transport/obj_test.go @@ -65,6 +65,7 @@ func (*dummyStatsTracker) Add(string, int64) func (*dummyStatsTracker) Inc(string) {} func (*dummyStatsTracker) Get(string) int64 { return 0 } func (*dummyStatsTracker) AddMany(...cos.NamedVal64) {} +func (*dummyStatsTracker) ClrFlag(string, cos.NodeStateFlags) {} func (*dummyStatsTracker) SetFlag(string, cos.NodeStateFlags) {} func (*dummyStatsTracker) SetClrFlag(string, cos.NodeStateFlags, cos.NodeStateFlags) {}