// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package runtime

import "runtime/internal/atomic"

// gcCPULimiter is a mechanism to limit GC CPU utilization in situations
// where it might become excessive and inhibit application progress (e.g.
// a death spiral).
//
// The core of the limiter is a leaky bucket mechanism that fills with GC
// CPU time and drains with mutator time. Because the bucket fills and
// drains with time directly (i.e. without any weighting), this effectively
// sets a very conservative limit of 50%. This limit could be enforced directly,
// but the purpose of the bucket is to accommodate spikes in GC CPU
// utilization without hurting throughput.
//
// Note that the bucket in the leaky bucket mechanism can never go negative,
// so the GC never gets credit for a lot of CPU time spent without the GC
// running. This is intentional, as an application that stays idle for, say,
// an entire day, could build up enough credit that the limiter would then
// fail to prevent a death spiral the following day. The bucket's capacity
// is the GC's only leeway.
//
// The capacity thus also sets the window the limiter considers. For example,
// if the capacity of the bucket is 1 cpu-second, then the limiter will not
// kick in until at least 1 full cpu-second in the last 2 cpu-second window
// is spent on GC CPU time.
var gcCPULimiter gcCPULimiterState

type gcCPULimiterState struct {
	lock atomic.Uint32

	enabled atomic.Bool

	bucket struct {
		// Invariants:
		// - fill >= 0
		// - capacity >= 0
		// - fill <= capacity
		fill, capacity uint64
	}

	// overflow is the cumulative amount of GC CPU time that we tried to fill the
	// bucket with but exceeded its capacity.
	overflow uint64

	// gcEnabled is an internal copy of gcBlackenEnabled that determines
	// whether the limiter tracks total assist time.
	//
	// gcBlackenEnabled isn't used directly so as to keep this structure
	// unit-testable.
	gcEnabled bool

	// transitioning is true when the GC is in a STW and transitioning between
	// the mark and sweep phases.
	transitioning bool

	// assistTimePool is the accumulated assist time since the last update.
	assistTimePool atomic.Int64

	// idleMarkTimePool is the accumulated idle mark time since the last update.
	idleMarkTimePool atomic.Int64

	// idleTimePool is the accumulated time Ps spent on the idle list since the last update.
	idleTimePool atomic.Int64

	// lastUpdate is the nanotime timestamp of the last time update was called.
	//
	// Updated under lock, but may be read concurrently.
	lastUpdate atomic.Int64

	// lastEnabledCycle is the GC cycle that last had the limiter enabled.
	lastEnabledCycle atomic.Uint32

	// nprocs is an internal copy of gomaxprocs, used to determine total available
	// CPU time.
	//
	// gomaxprocs isn't used directly so as to keep this structure unit-testable.
	nprocs int32

	// test indicates whether this instance of the struct was made for testing purposes.
	test bool
}

// limiting returns true if the CPU limiter is currently enabled, meaning the Go GC
// should take action to limit CPU utilization.
//
// It is safe to call concurrently with other operations.
func (l *gcCPULimiterState) limiting() bool {
	return l.enabled.Load()
}

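// The following is an illustrative sketch (not used by the runtime) of the
// leaky bucket arithmetic described in the comment at the top of this file.
// The values are hypothetical: a single P with a 1 cpu-second capacity. GC
// CPU time fills the bucket, mutator CPU time drains it, and the limiter
// engages only once the bucket is full, so brief spikes of GC work are
// absorbed while sustained >50% GC utilization eventually trips the limit.
//
//	const capacity uint64 = 1e9 // 1 cpu-second of leeway for a single P
//	var fill uint64
//
//	// Each window, GC CPU time fills the bucket and mutator CPU time drains it.
//	window := func(mutatorTime, gcTime uint64) {
//		if gcTime >= mutatorTime {
//			fill += gcTime - mutatorTime
//			if fill > capacity {
//				fill = capacity // full: the limiter turns on
//			}
//		} else if drain := mutatorTime - gcTime; fill <= drain {
//			fill = 0 // the bucket never goes negative
//		} else {
//			fill -= drain
//		}
//	}
//
//	// At a sustained 60% GC share of each 100ms window the bucket gains 20ms
//	// per window and is full (limiting) after 50 windows; at exactly 50% it
//	// holds level, and below 50% it drains back toward zero.
//	for i := 0; i < 50; i++ {
//		window(40e6, 60e6)
//	}
//	limiting := fill == capacity // true
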
// startGCTransition notifies the limiter of a GC transition.
//
// This call takes ownership of the limiter and disables all other means of
// updating the limiter. Release ownership by calling finishGCTransition.
//
// It is safe to call concurrently with other operations.
func (l *gcCPULimiterState) startGCTransition(enableGC bool, now int64) {
	if !l.tryLock() {
		// This must happen during a STW, so we can't fail to acquire the lock.
		// If we did, something went wrong. Throw.
		throw("failed to acquire lock to start a GC transition")
	}
	if l.gcEnabled == enableGC {
		throw("transitioning GC to the same state as before?")
	}
	// Flush whatever was left between the last update and now.
	l.updateLocked(now)
	l.gcEnabled = enableGC
	l.transitioning = true
	// N.B. finishGCTransition releases the lock.
	//
	// We don't release here to increase the chance that if there's a failure
	// to finish the transition, that we throw on failing to acquire the lock.
}

// finishGCTransition notifies the limiter that the GC transition is complete
// and releases ownership of it. It also accumulates STW time in the bucket.
//
// now must be the timestamp from the end of the STW pause.
func (l *gcCPULimiterState) finishGCTransition(now int64) {
	if !l.transitioning {
		throw("finishGCTransition called without starting one?")
	}
	// Count the full nprocs set of CPU time because the world is stopped
	// between startGCTransition and finishGCTransition. Even though the GC
	// isn't running on all CPUs, it is preventing user code from doing so,
	// so it might as well be.
	if lastUpdate := l.lastUpdate.Load(); now >= lastUpdate {
		l.accumulate(0, (now-lastUpdate)*int64(l.nprocs))
	}
	l.lastUpdate.Store(now)
	l.transitioning = false
	l.unlock()
}

// gcCPULimiterUpdatePeriod dictates the maximum amount of wall-clock time
// we can go before updating the limiter.
const gcCPULimiterUpdatePeriod = 10e6 // 10ms

// needUpdate returns true if the limiter's maximum update period has been
// exceeded, and so would benefit from an update.
func (l *gcCPULimiterState) needUpdate(now int64) bool {
	return now-l.lastUpdate.Load() > gcCPULimiterUpdatePeriod
}

// addAssistTime notifies the limiter of additional assist time. It will be
// included in the next update.
func (l *gcCPULimiterState) addAssistTime(t int64) {
	l.assistTimePool.Add(t)
}

// addIdleTime notifies the limiter of additional time a P spent on the idle list. It will be
// subtracted from the total CPU time in the next update.
func (l *gcCPULimiterState) addIdleTime(t int64) {
	l.idleTimePool.Add(t)
}

// update updates the bucket given runtime-specific information. now is the
// current monotonic time in nanoseconds.
//
// This is safe to call concurrently with other operations, except *GCTransition.
func (l *gcCPULimiterState) update(now int64) {
	if !l.tryLock() {
		// We failed to acquire the lock, which means something else is currently
		// updating. Just drop our update, the next one to update will include
		// our total assist time.
		return
	}
	if l.transitioning {
		throw("update during transition")
	}
	l.updateLocked(now)
	l.unlock()
}

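// A condensed sketch of the window accounting that updateLocked (below)
// performs each period. The figures are hypothetical: a 4-P configuration,
// a 10ms window, 2ms of pooled assist time, 4ms of idle time, and the GC in
// the mark phase (gcBackgroundUtilization is 0.25):
//
//	windowTotalTime := (now - lastUpdate) * nprocs // 10ms * 4 = 40ms of CPU time
//	windowGCTime := assistTime                     // 2ms of mark/scavenge assists
//	if gcEnabled {
//		// 25% of the window goes to background mark workers: +10ms.
//		windowGCTime += int64(float64(windowTotalTime) * gcBackgroundUtilization)
//	}
//	windowTotalTime -= idleTime // drop 4ms of idle Ps: 36ms remain
//	// mutator: 36ms - 12ms = 24ms, GC: 12ms -> the bucket drains by 12ms.
//	l.accumulate(windowTotalTime-windowGCTime, windowGCTime)
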
// updateLocked is the implementation of update. l.lock must be held.
func (l *gcCPULimiterState) updateLocked(now int64) {
	lastUpdate := l.lastUpdate.Load()
	if now < lastUpdate {
		// Defensively avoid overflow. This isn't even the latest update anyway.
		return
	}
	windowTotalTime := (now - lastUpdate) * int64(l.nprocs)
	l.lastUpdate.Store(now)

	// Drain the pool of assist time.
	assistTime := l.assistTimePool.Load()
	if assistTime != 0 {
		l.assistTimePool.Add(-assistTime)
	}

	// Drain the pool of idle time.
	idleTime := l.idleTimePool.Load()
	if idleTime != 0 {
		l.idleTimePool.Add(-idleTime)
	}

	if !l.test {
		// Consume time from in-flight events. Make sure we're not preemptible so allp can't change.
		//
		// The reason we do this instead of just waiting for those events to finish and push updates
		// is to ensure that all the time we're accounting for happened sometime between lastUpdate
		// and now. This dramatically simplifies reasoning about the limiter because we're not at
		// risk of accounting for more time in this window than actually happened in it, leading
		// to all sorts of weird transient behavior.
		mp := acquirem()
		for _, pp := range allp {
			typ, duration := pp.limiterEvent.consume(now)
			switch typ {
			case limiterEventIdleMarkWork:
				fallthrough
			case limiterEventIdle:
				idleTime += duration
				sched.idleTime.Add(duration)
			case limiterEventMarkAssist:
				fallthrough
			case limiterEventScavengeAssist:
				assistTime += duration
			case limiterEventNone:
				break
			default:
				throw("invalid limiter event type found")
			}
		}
		releasem(mp)
	}

	// Compute total GC time.
	windowGCTime := assistTime
	if l.gcEnabled {
		windowGCTime += int64(float64(windowTotalTime) * gcBackgroundUtilization)
	}

	// Subtract out all idle time from the total time. Do this after computing
	// GC time, because the background utilization is dependent on the *real*
	// total time, not the total time after idle time is subtracted.
	//
	// Idle time is counted as any time that a P is on the P idle list plus idle mark
	// time. Idle mark workers soak up time that the application spends idle.
	//
	// On a heavily undersubscribed system, any additional idle time can skew GC CPU
	// utilization, because the GC might be executing continuously and thrashing, yet
	// the CPU utilization with respect to GOMAXPROCS will be quite low, so the
	// limiter fails to turn on. By subtracting idle time, we're removing time that
	// we know the application was idle, giving a more accurate picture of whether
	// the GC is thrashing.
	//
	// Note that this can cause the limiter to turn on even if it's not needed. For
	// instance, on a system with 32 Ps but only 1 running goroutine, each GC will have
	// 8 dedicated GC workers. Assuming the GC cycle is half mark phase and half sweep
	// phase, then the GC CPU utilization over that cycle, with idle time removed, will
	// be 8/(8+2) = 80%. Even though the limiter turns on, though, assist should be
	// unnecessary, as the GC has way more CPU time to outpace the 1 goroutine that's
	// running.
	windowTotalTime -= idleTime

	l.accumulate(windowTotalTime-windowGCTime, windowGCTime)
}

// accumulate adds time to the bucket and signals whether the limiter is enabled.
//
// This is an internal function that deals just with the bucket. Prefer update.
// l.lock must be held.
func (l *gcCPULimiterState) accumulate(mutatorTime, gcTime int64) {
	headroom := l.bucket.capacity - l.bucket.fill
	enabled := headroom == 0

	// Let's be careful about three things here:
	// 1. The addition and subtraction, for the invariants.
	// 2. Overflow.
	// 3. Excessive mutation of l.enabled, which is accessed
	//    by all assists, potentially more than once.
	change := gcTime - mutatorTime

	// Handle limiting case.
	if change > 0 && headroom <= uint64(change) {
		l.overflow += uint64(change) - headroom
		l.bucket.fill = l.bucket.capacity
		if !enabled {
			l.enabled.Store(true)
			l.lastEnabledCycle.Store(memstats.numgc + 1)
		}
		return
	}

	// Handle non-limiting cases.
	if change < 0 && l.bucket.fill <= uint64(-change) {
		// Bucket emptied.
		l.bucket.fill = 0
	} else {
		// All other cases.
		l.bucket.fill -= uint64(-change)
	}
	if change != 0 && enabled {
		l.enabled.Store(false)
	}
}

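// A worked illustration of accumulate with hypothetical values (1 cpu-second
// capacity). A window where GC time exceeds mutator time by more than the
// remaining headroom fills the bucket, records the excess in overflow, and
// turns the limiter on; a later mutator-heavy window drains the bucket and
// turns it off again:
//
//	// capacity = 1e9, fill = 9.5e8, so headroom = 5e7.
//	l.accumulate(2e7, 9e7) // change = +7e7 >= headroom:
//	                       //   fill = capacity, overflow += 2e7, enabled -> true
//	l.accumulate(3e8, 1e8) // change = -2e8:
//	                       //   fill = 8e8, enabled -> false
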
// tryLock attempts to lock l. Returns true on success.
func (l *gcCPULimiterState) tryLock() bool {
	return l.lock.CompareAndSwap(0, 1)
}

// unlock releases the lock on l. Must be called if tryLock returns true.
func (l *gcCPULimiterState) unlock() {
	old := l.lock.Swap(0)
	if old != 1 {
		throw("double unlock")
	}
}

// capacityPerProc is the limiter's bucket capacity for each P in GOMAXPROCS.
const capacityPerProc = 1e9 // 1 second in nanoseconds

// resetCapacity updates the capacity based on GOMAXPROCS. Must not be called
// while the GC is enabled.
//
// It is safe to call concurrently with other operations.
func (l *gcCPULimiterState) resetCapacity(now int64, nprocs int32) {
	if !l.tryLock() {
		// This must happen during a STW, so we can't fail to acquire the lock.
		// If we did, something went wrong. Throw.
		throw("failed to acquire lock to reset capacity")
	}
	// Flush the rest of the time for this period.
	l.updateLocked(now)
	l.nprocs = nprocs
	l.bucket.capacity = uint64(nprocs) * capacityPerProc
	if l.bucket.fill > l.bucket.capacity {
		l.bucket.fill = l.bucket.capacity
		l.enabled.Store(true)
		l.lastEnabledCycle.Store(memstats.numgc + 1)
	} else if l.bucket.fill < l.bucket.capacity {
		l.enabled.Store(false)
	}
	l.unlock()
}

// limiterEventType indicates the type of an event occurring on some P.
//
// These events represent the full set of events that the GC CPU limiter tracks
// to execute its function.
//
// This type may use no more than limiterEventBits bits of information.
type limiterEventType uint8

const (
	limiterEventNone           limiterEventType = iota // None of the following events.
	limiterEventIdleMarkWork                           // Refers to an idle mark worker (see gcMarkWorkerMode).
	limiterEventMarkAssist                             // Refers to mark assist (see gcAssistAlloc).
	limiterEventScavengeAssist                         // Refers to a scavenge assist (see allocSpan).
	limiterEventIdle                                   // Refers to time a P spent on the idle list.

	limiterEventBits = 3
)

// limiterEventTypeMask is a mask for the bits in p.limiterEventStart that represent
// the event type. The rest of the bits of that field represent a timestamp.
const (
	limiterEventTypeMask  = uint64((1<<limiterEventBits)-1) << (64 - limiterEventBits)
	limiterEventStampNone = limiterEventStamp(0)
)

// limiterEventStamp is a nanotime timestamp packed together with a limiterEventType.
// The event type lives in the top limiterEventBits bits and the timestamp in the
// remaining low-order bits.
type limiterEventStamp uint64

// makeLimiterEventStamp creates a new stamp from the event type and the current timestamp.
func makeLimiterEventStamp(typ limiterEventType, now int64) limiterEventStamp {
	return limiterEventStamp(uint64(typ)<<(64-limiterEventBits) | (uint64(now) &^ limiterEventTypeMask))
}

// duration computes the difference between now and the start time stored in the stamp.
//
// Returns 0 if the difference is negative, which may happen if now is stale or if the
// clock readings crossed a 2^(64-limiterEventBits) boundary between the start and now.
func (s limiterEventStamp) duration(now int64) int64 {
	// The top limiterEventBits bits of the timestamp are derived from the current time
	// when computing a duration.
	start := int64((uint64(now) & limiterEventTypeMask) | (uint64(s) &^ limiterEventTypeMask))
	if now <= start {
		return 0
	}
	return now - start
}

// typ extracts the event type from the stamp.
func (s limiterEventStamp) typ() limiterEventType {
	return limiterEventType(s >> (64 - limiterEventBits))
}

// limiterEvent represents tracking state for an event tracked by the GC CPU limiter.
type limiterEvent struct {
	stamp atomic.Uint64 // Stores a limiterEventStamp.
}

// start begins tracking a new limiter event of the current type. If an event
// is already in flight, then a new event cannot begin because the current time is
// already being attributed to that event. In this case, this function returns false.
// Otherwise, it returns true.
//
// The caller must be non-preemptible until at least stop is called or this function
// returns false. Because this is trying to measure "on-CPU" time of some event, getting
// scheduled away during it can mean that whatever we're measuring isn't a reflection
// of "on-CPU" time. The OS could deschedule us at any time, but we want to maintain as
// close of an approximation as we can.
func (e *limiterEvent) start(typ limiterEventType, now int64) bool {
	if limiterEventStamp(e.stamp.Load()).typ() != limiterEventNone {
		return false
	}
	e.stamp.Store(uint64(makeLimiterEventStamp(typ, now)))
	return true
}

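// To make the packing concrete, here is a small hypothetical trace (the
// timestamps are made up). With limiterEventBits = 3, the top 3 bits of a
// stamp hold the event type and the low 61 bits hold the nanotime at which
// the event started; duration reconstructs the start time by borrowing the
// top bits from now, which is why readings that cross a 2^61 nanosecond
// boundary are dropped elsewhere in this file.
//
//	s := makeLimiterEventStamp(limiterEventMarkAssist, 1e6) // assist began at t=1ms
//	_ = s.typ()         // limiterEventMarkAssist, from the top 3 bits
//	_ = s.duration(4e6) // 3e6ns: the start time is rebuilt from the low bits
//	_ = s.duration(5e5) // 0: now is stale (earlier than the stored start)
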
// consume acquires the partial event CPU time from any in-flight event.
// It achieves this by storing the current time as the new event time.
//
// Returns the type of the in-flight event, as well as how long it's currently been
// executing for. Returns limiterEventNone if no event is active.
func (e *limiterEvent) consume(now int64) (typ limiterEventType, duration int64) {
	// Read the limiter event timestamp and update it to now.
	for {
		old := limiterEventStamp(e.stamp.Load())
		typ = old.typ()
		if typ == limiterEventNone {
			// There's no in-flight event, so just push that up.
			return
		}
		duration = old.duration(now)
		if duration == 0 {
			// We might have a stale now value, or this crossed the
			// 2^(64-limiterEventBits) boundary in the clock readings.
			// Just ignore it.
			return limiterEventNone, 0
		}
		new := makeLimiterEventStamp(typ, now)
		if e.stamp.CompareAndSwap(uint64(old), uint64(new)) {
			break
		}
	}
	return
}

// stop stops the active limiter event. Throws if the type of the active event
// does not match typ.
//
// The caller must be non-preemptible across the event. See start as to why.
func (e *limiterEvent) stop(typ limiterEventType, now int64) {
	var stamp limiterEventStamp
	for {
		stamp = limiterEventStamp(e.stamp.Load())
		if stamp.typ() != typ {
			print("runtime: want=", typ, " got=", stamp.typ(), "\n")
			throw("limiterEvent.stop: found wrong event in p's limiter event slot")
		}
		if e.stamp.CompareAndSwap(uint64(stamp), uint64(limiterEventStampNone)) {
			break
		}
	}
	duration := stamp.duration(now)
	if duration == 0 {
		// It's possible that we're missing time because we crossed a
		// 2^(64-limiterEventBits) boundary between the start and end.
		// In this case, we're dropping that information. This is OK because
		// at worst it'll cause a transient hiccup that will quickly resolve
		// itself as all new timestamps begin on the other side of the boundary.
		// Such a hiccup should be incredibly rare.
		return
	}
	// Account for the event.
	switch typ {
	case limiterEventIdleMarkWork:
		gcCPULimiter.addIdleTime(duration)
	case limiterEventIdle:
		gcCPULimiter.addIdleTime(duration)
		sched.idleTime.Add(duration)
	case limiterEventMarkAssist:
		fallthrough
	case limiterEventScavengeAssist:
		gcCPULimiter.addAssistTime(duration)
	default:
		throw("limiterEvent.stop: invalid limiter event type found")
	}
}
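
// A sketch of the intended call pattern for limiterEvent (hypothetical and
// condensed from the descriptions above; the real call sites live in the
// scheduler and GC):
//
//	// On a P, while non-preemptible:
//	if pp.limiterEvent.start(limiterEventMarkAssist, nanotime()) {
//		// ... perform the assist ...
//		pp.limiterEvent.stop(limiterEventMarkAssist, nanotime())
//	}
//	// Concurrently, gcCPULimiter.update may call pp.limiterEvent.consume to
//	// attribute the in-flight portion of the event to the current window.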