Source file src/runtime/mgcscavenge.go
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Scavenging free pages.
//
// This file implements scavenging (the release of physical pages backing mapped
// memory) of free and unused pages in the heap as a way to deal with page-level
// fragmentation and reduce the RSS of Go applications.
//
// Scavenging in Go happens on two fronts: there's the background
// (asynchronous) scavenger and the heap-growth (synchronous) scavenger.
//
// The former happens on a goroutine much like the background sweeper which is
// soft-capped at using scavengePercent of the mutator's time, based on
// order-of-magnitude estimates of the costs of scavenging. The background
// scavenger's primary goal is to bring the estimated heap RSS of the
// application down to a goal.
//
// Before we consider what this looks like, we need to split the world into two
// halves: one in which a memory limit is not set, and one in which it is.
//
// For the former, the goal is defined as:
//
//	(retainExtraPercent+100) / 100 * (heapGoal / lastHeapGoal) * lastHeapInUse
//
// Essentially, we wish to have the application's RSS track the heap goal, but
// the heap goal is defined in terms of bytes of objects, rather than pages like
// RSS. As a result, we need to account for fragmentation internal to spans.
// heapGoal / lastHeapGoal defines the ratio between the current heap goal and
// the last heap goal, which tells us by how much the heap is growing or
// shrinking. We estimate what the heap will grow to in terms of pages by taking
// this ratio and multiplying it by heapInUse at the end of the last GC, which
// allows us to account for this additional fragmentation. Note that this
// procedure makes the assumption that the degree of fragmentation won't change
// dramatically over the next GC cycle. Overestimating the amount of
// fragmentation simply results in higher memory use, which will be accounted
// for by the next pacing update. Underestimating the fragmentation however
// could lead to performance degradation. Handling this case is not within the
// scope of the scavenger. Situations where the amount of fragmentation balloons
// over the course of a single GC cycle should be considered pathologies,
// flagged as bugs, and fixed appropriately.
//
// An additional factor of retainExtraPercent is added as a buffer to help ensure
// that there's more unscavenged memory to allocate out of, since each allocation
// out of scavenged memory incurs a potentially expensive page fault.
//
// If a memory limit is set, then we wish to pick a scavenge goal that maintains
// that memory limit. For that, we look at total memory that has been committed
// (memstats.mappedReady) and try to bring that down below the limit. In this case,
// we want to give buffer space in the *opposite* direction. When the application
// is close to the limit, we want to make sure we push harder to keep it under, so
// if we target below the memory limit, we ensure that the background scavenger is
// giving the situation the urgency it deserves.
//
// In this case, the goal is defined as:
//
//	(100-reduceExtraPercent) / 100 * memoryLimit
//
// We compute both of these goals, and check whether either of them has been met.
// The background scavenger continues operating as long as either one of the goals
// has not been met.
//
// The goals are updated after each GC.
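//
// As a rough illustration of these goals (the numbers below are made up, not
// taken from any real workload): with retainExtraPercent = 10, a heap goal that
// grew from 100 MiB to 150 MiB, and 120 MiB of heap in use at the end of the
// last GC, the first goal works out to 110/100 * 150/100 * 120 MiB, or roughly
// 198 MiB of retained heap. With reduceExtraPercent = 5 and a 1 GiB memory
// limit, the second goal is 95/100 * 1 GiB, or roughly 972 MiB of total mapped
// memory.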
//
// The synchronous heap-growth scavenging happens whenever the heap grows in
// size, for some definition of heap-growth. The intuition behind this is that
// the application had to grow the heap because existing fragments were
// not sufficiently large to satisfy a page-level memory allocation, so we
// scavenge those fragments eagerly to offset the growth in RSS that results.

package runtime

import (
	"internal/goos"
	"runtime/internal/atomic"
	"runtime/internal/sys"
	"unsafe"
)

const (
	// The background scavenger is paced according to these parameters.
	//
	// scavengePercent represents the portion of mutator time we're willing
	// to spend on scavenging in percent.
	scavengePercent = 1 // 1%

	// retainExtraPercent represents the amount of memory over the heap goal
	// that the scavenger should keep as a buffer space for the allocator.
	// This constant is used when we do not have a memory limit set.
	//
	// The purpose of maintaining this overhead is to have a greater pool of
	// unscavenged memory available for allocation (since using scavenged memory
	// incurs an additional cost), to account for heap fragmentation and
	// the ever-changing layout of the heap.
	retainExtraPercent = 10

	// reduceExtraPercent represents the amount of memory under the limit
	// that the scavenger should target. For example, 5 means we target 95%
	// of the limit.
	//
	// The purpose of shooting lower than the limit is to ensure that, once
	// close to the limit, the scavenger is working hard to maintain it. If
	// we have a memory limit set but are far away from it, there's no harm
	// in leaving up to 100-retainExtraPercent live, and it's more efficient
	// anyway, for the same reasons that retainExtraPercent exists.
	reduceExtraPercent = 5

	// maxPagesPerPhysPage is the maximum number of supported runtime pages per
	// physical page, based on maxPhysPageSize.
	maxPagesPerPhysPage = maxPhysPageSize / pageSize

	// scavengeCostRatio is the approximate ratio between the costs of using previously
	// scavenged memory and scavenging memory.
	//
	// For most systems the cost of scavenging greatly outweighs the costs
	// associated with using scavenged memory, making this constant 0. On other systems
	// (especially ones where "sysUsed" is not just a no-op) this cost is non-trivial.
	//
	// This ratio is used as part of a multiplicative factor to help the scavenger account
	// for the additional costs of using scavenged memory in its pacing.
	scavengeCostRatio = 0.7 * (goos.IsDarwin + goos.IsIos)
)

// heapRetained returns an estimate of the current heap RSS.
func heapRetained() uint64 {
	return gcController.heapInUse.load() + gcController.heapFree.load()
}

// gcPaceScavenger updates the scavenger's pacing, particularly
// its rate and RSS goal. For this, it requires the current heapGoal,
// and the heapGoal for the previous GC cycle.
//
// The RSS goal is based on the current heap goal with a small overhead
// to accommodate non-determinism in the allocator.
//
// The pacing is based on scavengePageRate, which applies to both regular and
// huge pages. See that constant for more information.
//
// Must be called whenever GC pacing is updated.
//
// mheap_.lock must be held or the world must be stopped.
func gcPaceScavenger(memoryLimit int64, heapGoal, lastHeapGoal uint64) {
	assertWorldStoppedOrLockHeld(&mheap_.lock)

	// As described at the top of this file, there are two scavenge goals here: one
	// for gcPercent and one for memoryLimit. Let's handle the latter first because
	// it's simpler.

	// We want to target retaining (100-reduceExtraPercent)% of the heap.
	memoryLimitGoal := uint64(float64(memoryLimit) * (100.0 - reduceExtraPercent) / 100.0)

	// mappedReady is comparable to memoryLimit, and represents how much total memory
	// the Go runtime has committed now (estimated).
	mappedReady := gcController.mappedReady.Load()

	// If we're below the goal already, indicate that we don't need the background
	// scavenger for the memory limit. This may seem worrisome at first, but note
	// that the allocator will assist the background scavenger in the face of a memory
	// limit, so we'll be safe even if we stop the scavenger when we shouldn't have.
	if mappedReady <= memoryLimitGoal {
		scavenge.memoryLimitGoal.Store(^uint64(0))
	} else {
		scavenge.memoryLimitGoal.Store(memoryLimitGoal)
	}

	// Now handle the gcPercent goal.

	// If we're called before the first GC completed, disable scavenging.
	// We never scavenge before the 2nd GC cycle anyway (we don't have enough
	// information about the heap yet) so this is fine, and avoids a fault
	// or garbage data later.
	if lastHeapGoal == 0 {
		scavenge.gcPercentGoal.Store(^uint64(0))
		return
	}
	// Compute our scavenging goal.
	goalRatio := float64(heapGoal) / float64(lastHeapGoal)
	gcPercentGoal := uint64(float64(memstats.lastHeapInUse) * goalRatio)
	// Add retainExtraPercent overhead to gcPercentGoal. This calculation
	// looks strange but the purpose is to arrive at an integer division
	// (e.g. if retainExtraPercent = 12.5, then we get a divisor of 8)
	// that also avoids the overflow from a multiplication.
	gcPercentGoal += gcPercentGoal / (1.0 / (retainExtraPercent / 100.0))
	// Align it to a physical page boundary to make the following calculations
	// a bit more exact.
	gcPercentGoal = (gcPercentGoal + uint64(physPageSize) - 1) &^ (uint64(physPageSize) - 1)

	// Represents where we are now in the heap's contribution to RSS in bytes.
	//
	// Guaranteed to always be a multiple of physPageSize on systems where
	// physPageSize <= pageSize since we map new heap memory at a size larger than
	// any physPageSize and release memory in multiples of the physPageSize.
	//
	// However, certain functions recategorize heap memory as other stats (e.g.
	// stacks) and this happens in multiples of pageSize, so on systems
	// where physPageSize > pageSize the calculations below will not be exact.
	// Generally this is OK since we'll be off by at most one regular
	// physical page.
	heapRetainedNow := heapRetained()

	// If we're already below our goal, or within one page of our goal, then indicate
	// that we don't need the background scavenger for maintaining a memory overhead
	// proportional to the heap goal.
	if heapRetainedNow <= gcPercentGoal || heapRetainedNow-gcPercentGoal < uint64(physPageSize) {
		scavenge.gcPercentGoal.Store(^uint64(0))
	} else {
		scavenge.gcPercentGoal.Store(gcPercentGoal)
	}
}
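
// A note on the buffer arithmetic in gcPaceScavenger above (this just restates
// the existing expression with concrete numbers; it adds no behavior): with
// retainExtraPercent = 10, the constant divisor 1.0/(retainExtraPercent/100.0)
// evaluates to 10, just as the 12.5 example in the comment gives 8. The line
// therefore reduces to gcPercentGoal += gcPercentGoal/10, a 10% buffer computed
// without a multiplication that could overflow a uint64.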

var scavenge struct {
	// gcPercentGoal is the amount of retained heap memory (measured by
	// heapRetained) that the runtime will try to maintain by returning
	// memory to the OS. This goal is derived from gcController.gcPercent
	// by choosing to retain enough memory to allocate heap memory up to
	// the heap goal.
	gcPercentGoal atomic.Uint64

	// memoryLimitGoal is the amount of memory retained by the runtime
	// (measured by gcController.mappedReady) that the runtime will try to
	// maintain by returning memory to the OS. This goal is derived from
	// gcController.memoryLimit by choosing to target the memory limit or
	// some lower target to keep the scavenger working.
	memoryLimitGoal atomic.Uint64

	// assistTime is the time spent by the allocator scavenging in the last GC cycle.
	//
	// This is reset once a GC cycle ends.
	assistTime atomic.Int64

	// backgroundTime is the time spent by the background scavenger in the last GC cycle.
	//
	// This is reset once a GC cycle ends.
	backgroundTime atomic.Int64
}

const (
	// It doesn't really matter what value we start at, but we can't be zero, because
	// that'll cause divide-by-zero issues. Pick something conservative which we'll
	// also use as a fallback.
	startingScavSleepRatio = 0.001

	// Spend at least 1 ms scavenging, otherwise the corresponding
	// sleep time to maintain our desired utilization is too low to
	// be reliable.
	minScavWorkTime = 1e6
)

// Sleep/wait state of the background scavenger.
var scavenger scavengerState

type scavengerState struct {
	// lock protects all fields below.
	lock mutex

	// g is the goroutine the scavenger is bound to.
	g *g

	// parked is whether or not the scavenger is parked.
	parked bool

	// timer is the timer used for the scavenger to sleep.
	timer *timer

	// sysmonWake signals to sysmon that it should wake the scavenger.
	sysmonWake atomic.Uint32

	// targetCPUFraction is the target CPU overhead for the scavenger.
	targetCPUFraction float64

	// sleepRatio is the ratio of time spent doing scavenging work to
	// time spent sleeping. This is used to decide how long the scavenger
	// should sleep for in between batches of work. It is set by
	// sleepController in order to maintain a CPU overhead of
	// targetCPUFraction.
	//
	// Lower means more sleep, higher means more aggressive scavenging.
	sleepRatio float64

	// sleepController controls sleepRatio.
	//
	// See sleepRatio for more details.
	sleepController piController

	// controllerCooldown is the time left in nanoseconds during which we avoid
	// using the controller and we hold sleepRatio at a conservative
	// value. Used if the controller's assumptions fail to hold.
	controllerCooldown int64

	// printControllerReset instructs printScavTrace to signal that
	// the controller was reset.
	printControllerReset bool

	// sleepStub is a stub used for testing to avoid actually having
	// the scavenger sleep.
	//
	// Unlike the other stubs, this is not populated if left nil.
	// Instead, it is called when non-nil because any valid implementation
	// of this function basically requires closing over this scavenger
	// state, and allocating a closure is not allowed in the runtime as
	// a matter of policy.
	sleepStub func(n int64) int64

	// scavenge is a function that scavenges n bytes of memory.
	// Returns how many bytes of memory it actually scavenged, as
	// well as the time it took in nanoseconds. Usually mheap.pages.scavenge
	// with nanotime called around it, but stubbed out for testing.
	// Like mheap.pages.scavenge, if it scavenges less than n bytes of
	// memory, the caller may assume the heap is exhausted of scavengable
	// memory for now.
	//
	// If this is nil, it is populated with the real thing in init.
	scavenge func(n uintptr) (uintptr, int64)

	// shouldStop is a callback called in the work loop and provides a
	// point that can force the scavenger to stop early, for example because
	// the scavenge policy dictates too much has been scavenged already.
	//
	// If this is nil, it is populated with the real thing in init.
	shouldStop func() bool

	// gomaxprocs returns the current value of gomaxprocs. Stub for testing.
	//
	// If this is nil, it is populated with the real thing in init.
	gomaxprocs func() int32
}

// init initializes a scavenger state and wires to the current G.
//
// Must be called from a regular goroutine that can allocate.
func (s *scavengerState) init() {
	if s.g != nil {
		throw("scavenger state is already wired")
	}
	lockInit(&s.lock, lockRankScavenge)
	s.g = getg()

	s.timer = new(timer)
	s.timer.arg = s
	s.timer.f = func(s any, _ uintptr) {
		s.(*scavengerState).wake()
	}

	// input: fraction of CPU time actually used.
	// setpoint: ideal CPU fraction.
	// output: ratio of time worked to time slept (determines sleep time).
	//
	// The output of this controller is somewhat indirect to what we actually
	// want to achieve: how much time to sleep for. The reason for this definition
	// is to ensure that the controller's outputs have a direct relationship with
	// its inputs (as opposed to an inverse relationship), making it somewhat
	// easier to reason about for tuning purposes.
	s.sleepController = piController{
		// Tuned loosely via Ziegler-Nichols process.
		kp: 0.3375,
		ti: 3.2e6,
		tt: 1e9, // 1 second reset time.

		// These ranges seem wide, but we want to give the controller plenty of
		// room to hunt for the optimal value.
		min: 0.001,  // 1:1000
		max: 1000.0, // 1000:1
	}
	s.sleepRatio = startingScavSleepRatio

	// Install real functions if stubs aren't present.
	if s.scavenge == nil {
		s.scavenge = func(n uintptr) (uintptr, int64) {
			start := nanotime()
			r := mheap_.pages.scavenge(n, nil)
			end := nanotime()
			if start >= end {
				return r, 0
			}
			scavenge.backgroundTime.Add(end - start)
			return r, end - start
		}
	}
	if s.shouldStop == nil {
		s.shouldStop = func() bool {
			// If background scavenging is disabled or if there's no work to do, just stop.
			return heapRetained() <= scavenge.gcPercentGoal.Load() &&
				(!go119MemoryLimitSupport ||
					gcController.mappedReady.Load() <= scavenge.memoryLimitGoal.Load())
		}
	}
	if s.gomaxprocs == nil {
		s.gomaxprocs = func() int32 {
			return gomaxprocs
		}
	}
}
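
// For illustration only: the stub fields above are what tests can use to drive
// the scavenger deterministically. A hypothetical setup might look like the
// following (the concrete values are invented for the example):
//
//	s := &scavengerState{
//		sleepStub:  func(n int64) int64 { return n },                      // pretend we slept exactly as long as requested
//		scavenge:   func(n uintptr) (uintptr, int64) { return n, 10_000 }, // pretend every request succeeds in 10µs
//		shouldStop: func() bool { return false },
//		gomaxprocs: func() int32 { return 1 },
//	}
//	s.init() // installs the real scavenge/shouldStop/gomaxprocs only where the stubs are nil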

// park parks the scavenger goroutine.
func (s *scavengerState) park() {
	lock(&s.lock)
	if getg() != s.g {
		throw("tried to park scavenger from another goroutine")
	}
	s.parked = true
	goparkunlock(&s.lock, waitReasonGCScavengeWait, traceEvGoBlock, 2)
}

// ready signals to sysmon that the scavenger should be awoken.
func (s *scavengerState) ready() {
	s.sysmonWake.Store(1)
}

// wake immediately unparks the scavenger if necessary.
//
// Safe to run without a P.
func (s *scavengerState) wake() {
	lock(&s.lock)
	if s.parked {
		// Unset sysmonWake, since the scavenger is now being awoken.
		s.sysmonWake.Store(0)

		// s.parked is unset to prevent a double wake-up.
		s.parked = false

		// Ready the goroutine by injecting it. We use injectglist instead
		// of ready or goready in order to allow us to run this function
		// without a P. injectglist also avoids placing the goroutine in
		// the current P's runnext slot, which is desirable to prevent
		// the scavenger from interfering with user goroutine scheduling
		// too much.
		var list gList
		list.push(s.g)
		injectglist(&list)
	}
	unlock(&s.lock)
}

// sleep puts the scavenger to sleep based on the amount of time that it worked
// in nanoseconds.
//
// Note that this function should only be called by the scavenger.
//
// The scavenger may be woken up earlier by a pacing change, and it may not go
// to sleep at all if there's a pending pacing change.
func (s *scavengerState) sleep(worked float64) {
	lock(&s.lock)
	if getg() != s.g {
		throw("tried to sleep scavenger from another goroutine")
	}

	if worked < minScavWorkTime {
		// This means there wasn't enough work to actually fill up minScavWorkTime.
		// That's fine; we shouldn't try to do anything with this information
		// because it's going to result in a short enough sleep request that things
		// will get messy. Just assume we did at least this much work.
		// All this means is that we'll sleep longer than we otherwise would have.
		worked = minScavWorkTime
	}

	// Multiply the critical time by 1 + the ratio of the costs of using
	// scavenged memory vs. scavenging memory. This forces us to pay down
	// the cost of reusing this memory eagerly by sleeping for a longer period
	// of time and scavenging less frequently. More concretely, we avoid situations
	// where we end up scavenging so often that we hurt allocation performance
	// because of the additional overheads of using scavenged memory.
	worked *= 1 + scavengeCostRatio

	// sleepTime is the amount of time we're going to sleep, based on the amount
	// of time we worked, and the sleepRatio.
	sleepTime := int64(worked / s.sleepRatio)

	var slept int64
	if s.sleepStub == nil {
		// Set the timer.
		//
		// This must happen here instead of inside gopark
		// because we can't close over any variables without
		// failing escape analysis.
		start := nanotime()
		resetTimer(s.timer, start+sleepTime)

		// Mark ourselves as asleep and go to sleep.
		s.parked = true
		goparkunlock(&s.lock, waitReasonSleep, traceEvGoSleep, 2)

		// How long we actually slept for.
		slept = nanotime() - start

		lock(&s.lock)
		// Stop the timer here because s.wake is unable to do it for us.
		// We don't really care if we succeed in stopping the timer. One
		// reason we might fail is that we've already woken up, but the timer
		// might be in the process of firing on some other P; essentially we're
		// racing with it. That's totally OK. Double wake-ups are perfectly safe.
		stopTimer(s.timer)
		unlock(&s.lock)
	} else {
		unlock(&s.lock)
		slept = s.sleepStub(sleepTime)
	}

	// Stop here if we're cooling down from the controller.
	if s.controllerCooldown > 0 {
		// worked and slept aren't exact measures of time, but it's OK to be a bit
		// sloppy here. We're just hoping we're avoiding some transient bad behavior.
		t := slept + int64(worked)
		if t > s.controllerCooldown {
			s.controllerCooldown = 0
		} else {
			s.controllerCooldown -= t
		}
		return
	}

	// idealFraction is the ideal % of overall application CPU time that we
	// spend scavenging.
	idealFraction := float64(scavengePercent) / 100.0

	// Calculate the CPU time spent.
	//
	// This may be slightly inaccurate with respect to GOMAXPROCS, but we're
	// recomputing this often enough relative to GOMAXPROCS changes in general
	// (it only changes when the world is stopped, and not during a GC) that
	// that small inaccuracy is in the noise.
	cpuFraction := worked / ((float64(slept) + worked) * float64(s.gomaxprocs()))

	// Update sleepRatio, adjusting until we reach our ideal fraction.
	var ok bool
	s.sleepRatio, ok = s.sleepController.next(cpuFraction, idealFraction, float64(slept)+worked)
	if !ok {
		// The core assumption of the controller, that we can get a proportional
		// response, broke down. This may be transient, so temporarily switch to
		// sleeping a fixed, conservative amount.
		s.sleepRatio = startingScavSleepRatio
		s.controllerCooldown = 5e9 // 5 seconds.

		// Signal the scav trace printer to output this.
		s.controllerFailed()
	}
}
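
// A worked example of the pacing above (the numbers are illustrative only): if
// the scavenger worked for 2 ms on a system where scavengeCostRatio is 0 and
// sleepRatio is currently 0.001, it asks to sleep for roughly 2 ms / 0.001, or
// about 2 seconds. If it then actually sleeps that long with GOMAXPROCS = 4,
// the measured CPU fraction fed to the controller is about
// 2 / ((2000 + 2) * 4) ≈ 0.025%, well under the 1% scavengePercent target, so
// the controller will raise sleepRatio to make the scavenger work more often.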

// controllerFailed indicates that the scavenger's scheduling
// controller failed.
func (s *scavengerState) controllerFailed() {
	lock(&s.lock)
	s.printControllerReset = true
	unlock(&s.lock)
}

// run is the body of the main scavenging loop.
//
// Returns the number of bytes released and the estimated time spent
// releasing those bytes.
//
// Must be run on the scavenger goroutine.
func (s *scavengerState) run() (released uintptr, worked float64) {
	lock(&s.lock)
	if getg() != s.g {
		throw("tried to run scavenger from another goroutine")
	}
	unlock(&s.lock)

	for worked < minScavWorkTime {
		// If something from outside tells us to stop early, stop.
		if s.shouldStop() {
			break
		}

		// scavengeQuantum is the amount of memory we try to scavenge
		// in one go. A smaller value means the scavenger is more responsive
		// to the scheduler in case of e.g. preemption. A larger value means
		// that the overheads of scavenging are better amortized, so better
		// scavenging throughput.
		//
		// The current value is chosen assuming a cost of ~10µs/physical page
		// (this is somewhat pessimistic), which implies a worst-case latency of
		// about 160µs for 4 KiB physical pages. The current value is biased
		// toward latency over throughput.
		const scavengeQuantum = 64 << 10

		// Accumulate the amount of time spent scavenging.
		r, duration := s.scavenge(scavengeQuantum)

		// On some platforms we may see start >= end if the time it takes to scavenge
		// memory is less than the minimum granularity of its clock (e.g. Windows) or
		// due to clock bugs.
		//
		// In this case, just assume scavenging takes 10 µs per regular physical page
		// (determined empirically), and conservatively ignore the impact of huge pages
		// on timing.
		const approxWorkedNSPerPhysicalPage = 10e3
		if duration == 0 {
			worked += approxWorkedNSPerPhysicalPage * float64(r/physPageSize)
		} else {
			// TODO(mknyszek): If duration is small compared to worked, it could be
			// rounded down to zero. Probably not a problem in practice because the
			// values are all within a few orders of magnitude of each other but maybe
			// worth worrying about.
			worked += float64(duration)
		}
		released += r

		// scavenge does not return until it either finds the requisite amount of
		// memory to scavenge, or exhausts the heap. If we haven't found enough
		// to scavenge, then the heap must be exhausted.
		if r < scavengeQuantum {
			break
		}
		// When using fake time just do one loop.
		if faketime != 0 {
			break
		}
	}
	if released > 0 && released < physPageSize {
		// If this happens, it means that we may have attempted to release part
		// of a physical page, but the likely effect of that is that it released
		// the whole physical page, some of which may have still been in-use.
		// This could lead to memory corruption. Throw.
		throw("released less than one physical page of memory")
	}
	return
}
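
// To make the fallback accounting in run concrete (this simply restates the
// constants already used above): if a single s.scavenge call releases the full
// 64 KiB quantum but the clock reports a zero duration, the loop charges
// 16 pages * 10µs = 160µs of work on a system with 4 KiB physical pages, which
// is where the worst-case latency figure in the scavengeQuantum comment comes
// from.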

// Background scavenger.
//
// The background scavenger maintains the RSS of the application below
// the line described by the proportional scavenging statistics in
// the mheap struct.
func bgscavenge(c chan int) {
	scavenger.init()

	c <- 1
	scavenger.park()

	for {
		released, workTime := scavenger.run()
		if released == 0 {
			scavenger.park()
			continue
		}
		atomic.Xadduintptr(&mheap_.pages.scav.released, released)
		scavenger.sleep(workTime)
	}
}

// scavenge scavenges nbytes worth of free pages, starting with the
// highest address first. Successive calls continue from where it left
// off until the heap is exhausted. Call scavengeStartGen to bring it
// back to the top of the heap.
//
// Returns the amount of memory scavenged in bytes.
//
// scavenge always tries to scavenge nbytes worth of memory, and will
// only fail to do so if the heap is exhausted for now.
func (p *pageAlloc) scavenge(nbytes uintptr, shouldStop func() bool) uintptr {
	released := uintptr(0)
	for released < nbytes {
		ci, pageIdx := p.scav.index.find()
		if ci == 0 {
			break
		}
		systemstack(func() {
			released += p.scavengeOne(ci, pageIdx, nbytes-released)
		})
		if shouldStop != nil && shouldStop() {
			break
		}
	}
	return released
}

// printScavTrace prints a scavenge trace line to standard error.
//
// released should be the amount of memory released since the last time this
// was called, and forced indicates whether the scavenge was forced by the
// application.
//
// scavenger.lock must be held.
func printScavTrace(released uintptr, forced bool) {
	assertLockHeld(&scavenger.lock)

	printlock()
	print("scav ",
		released>>10, " KiB work, ",
		gcController.heapReleased.load()>>10, " KiB total, ",
		(gcController.heapInUse.load()*100)/heapRetained(), "% util",
	)
	if forced {
		print(" (forced)")
	} else if scavenger.printControllerReset {
		print(" [controller reset]")
		scavenger.printControllerReset = false
	}
	println()
	printunlock()
}

// scavengeOne walks over the chunk at chunk index ci and searches for
// a contiguous run of pages to scavenge. It will try to scavenge
// at most max bytes at once, but may scavenge more to avoid
// breaking huge pages. Once it scavenges some memory it returns
// how much it scavenged in bytes.
//
// searchIdx is the page index to start searching from in ci.
//
// Returns the number of bytes scavenged.
//
// Must run on the systemstack because it acquires p.mheapLock.
//
//go:systemstack
func (p *pageAlloc) scavengeOne(ci chunkIdx, searchIdx uint, max uintptr) uintptr {
	// Calculate the maximum number of pages to scavenge.
	//
	// This should be alignUp(max, pageSize) / pageSize but max can and will
	// be ^uintptr(0), so we need to be very careful not to overflow here.
	// Rather than use alignUp, calculate the number of pages rounded down
	// first, then add back one if necessary.
	maxPages := max / pageSize
	if max%pageSize != 0 {
		maxPages++
	}

	// Calculate the minimum number of pages we can scavenge.
	//
	// Because we can only scavenge whole physical pages, we must
	// ensure that we scavenge at least minPages each time, aligned
	// to minPages*pageSize.
	minPages := physPageSize / pageSize
	if minPages < 1 {
		minPages = 1
	}

	lock(p.mheapLock)
	if p.summary[len(p.summary)-1][ci].max() >= uint(minPages) {
		// We only bother looking for a candidate if there are at least
		// minPages free pages at all.
		base, npages := p.chunkOf(ci).findScavengeCandidate(searchIdx, minPages, maxPages)

		// If we found something, scavenge it and return!
		if npages != 0 {
			// Compute the full address for the start of the range.
			addr := chunkBase(ci) + uintptr(base)*pageSize

			// Mark the range we're about to scavenge as allocated, because
			// we don't want any allocating goroutines to grab it while
			// the scavenging is in progress.
			if scav := p.allocRange(addr, uintptr(npages)); scav != 0 {
				throw("double scavenge")
			}

			// With that done, it's safe to unlock.
			unlock(p.mheapLock)

			if !p.test {
				pageTraceScav(getg().m.p.ptr(), 0, addr, uintptr(npages))

				// Only perform the actual scavenging if we're not in a test.
				// It's dangerous to do so otherwise.
				sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize)

				// Update global accounting only when not in test, otherwise
				// the runtime's accounting will be wrong.
				nbytes := int64(npages) * pageSize
				gcController.heapReleased.add(nbytes)
				gcController.heapFree.add(-nbytes)

				stats := memstats.heapStats.acquire()
				atomic.Xaddint64(&stats.committed, -nbytes)
				atomic.Xaddint64(&stats.released, nbytes)
				memstats.heapStats.release()
			}

			// Relock the heap, because now we need to make these pages
			// available for allocation. Free them back to the page allocator.
			lock(p.mheapLock)
			p.free(addr, uintptr(npages), true)

			// Mark the range as scavenged.
			p.chunkOf(ci).scavenged.setRange(base, npages)
			unlock(p.mheapLock)

			return uintptr(npages) * pageSize
		}
	}
	// Mark this chunk as having no free pages.
	p.scav.index.clear(ci)
	unlock(p.mheapLock)

	return 0
}

// fillAligned returns x but with all zeroes in m-aligned
// groups of m bits set to 1 if any bit in the group is non-zero.
//
// For example, fillAligned(0x0100a3, 8) == 0xff00ff.
//
// Note that if m == 1, this is a no-op.
//
// m must be a power of 2 <= maxPagesPerPhysPage.
func fillAligned(x uint64, m uint) uint64 {
	apply := func(x uint64, c uint64) uint64 {
		// The technique used here is derived from
		// https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
		// and extended for more than just bytes (like nibbles
		// and uint16s) by using an appropriate constant.
		//
		// To summarize the technique, quoting from that page:
		// "[It] works by first zeroing the high bits of the [8]
		// bytes in the word. Subsequently, it adds a number that
		// will result in an overflow to the high bit of a byte if
		// any of the low bits were initially set. Next the high
		// bits of the original word are ORed with these values;
		// thus, the high bit of a byte is set iff any bit in the
		// byte was set. Finally, we determine if any of these high
		// bits are zero by ORing with ones everywhere except the
		// high bits and inverting the result."
		return ^((((x & c) + c) | x) | c)
	}
	// Transform x to contain a 1 bit at the top of each m-aligned
	// group of m zero bits.
	switch m {
	case 1:
		return x
	case 2:
		x = apply(x, 0x5555555555555555)
	case 4:
		x = apply(x, 0x7777777777777777)
	case 8:
		x = apply(x, 0x7f7f7f7f7f7f7f7f)
	case 16:
		x = apply(x, 0x7fff7fff7fff7fff)
	case 32:
		x = apply(x, 0x7fffffff7fffffff)
	case 64: // == maxPagesPerPhysPage
		x = apply(x, 0x7fffffffffffffff)
	default:
		throw("bad m value")
	}
	// Now, the top bit of each m-aligned group in x is set
	// iff that group was all zero in the original x.

	// From each group of m bits subtract 1.
	// Because we know only the top bits of each
	// m-aligned group are set, we know this will
	// set each group to have all the bits set except
	// the top bit, so just OR with the original
	// result to set all the bits.
	return ^((x - (x >> (m - 1))) | x)
}
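
// A second worked example for fillAligned, derived by hand from the definition
// above (so treat it as illustrative rather than authoritative):
// fillAligned(0x0100a3, 4) == 0x0f00ff, because of the six low nibbles
// 0, 1, 0, 0, a, 3 only the three non-zero ones are widened to all ones.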

// findScavengeCandidate returns a start index and a size for this pallocData
// segment which represents a contiguous region of free and unscavenged memory.
//
// searchIdx indicates the page index within this chunk to start the search, but
// note that findScavengeCandidate searches backwards through the pallocData. As
// a result, it will return the highest scavenge candidate in address order.
//
// min indicates a hard minimum size and alignment for runs of pages. That is,
// findScavengeCandidate will not return a region smaller than min pages in size,
// or that is min pages or greater in size but not aligned to min. min must be
// a non-zero power of 2 <= maxPagesPerPhysPage.
//
// max is a hint for how big of a region is desired. If max >= pallocChunkPages, then
// findScavengeCandidate effectively returns entire free and unscavenged regions.
// If max < pallocChunkPages, it may truncate the returned region such that size is
// max. However, findScavengeCandidate may still return a larger region if, for
// example, it chooses to preserve huge pages, or if max is not aligned to min (it
// will round up). That is, even if max is small, the returned size is not guaranteed
// to be equal to max. max is allowed to be less than min, in which case it is as if
// max == min.
func (m *pallocData) findScavengeCandidate(searchIdx uint, min, max uintptr) (uint, uint) {
	if min&(min-1) != 0 || min == 0 {
		print("runtime: min = ", min, "\n")
		throw("min must be a non-zero power of 2")
	} else if min > maxPagesPerPhysPage {
		print("runtime: min = ", min, "\n")
		throw("min too large")
	}
	// max may not be min-aligned, so we might accidentally truncate to
	// a max value which causes us to return a non-min-aligned value.
	// To prevent this, align max up to a multiple of min (which is always
	// a power of 2). This also prevents max from ever being less than
	// min, unless it's zero, so handle that explicitly.
	if max == 0 {
		max = min
	} else {
		max = alignUp(max, min)
	}

	i := int(searchIdx / 64)
	// Start by quickly skipping over blocks of non-free or scavenged pages.
	for ; i >= 0; i-- {
		// 1s are scavenged OR non-free => 0s are unscavenged AND free
		x := fillAligned(m.scavenged[i]|m.pallocBits[i], uint(min))
		if x != ^uint64(0) {
			break
		}
	}
	if i < 0 {
		// Failed to find any free/unscavenged pages.
		return 0, 0
	}
	// We have something in the 64-bit chunk at i, but it could
	// extend further. Loop until we find the extent of it.

	// 1s are scavenged OR non-free => 0s are unscavenged AND free
	x := fillAligned(m.scavenged[i]|m.pallocBits[i], uint(min))
	z1 := uint(sys.LeadingZeros64(^x))
	run, end := uint(0), uint(i)*64+(64-z1)
	if x<<z1 != 0 {
		// After shifting out z1 bits, we still have 1s,
		// so the run ends inside this word.
		run = uint(sys.LeadingZeros64(x << z1))
	} else {
		// After shifting out z1 bits, we have no more 1s.
		// This means the run extends to the bottom of the
		// word so it may extend into further words.
		run = 64 - z1
		for j := i - 1; j >= 0; j-- {
			x := fillAligned(m.scavenged[j]|m.pallocBits[j], uint(min))
			run += uint(sys.LeadingZeros64(x))
			if x != 0 {
				// The run stopped in this word.
				break
			}
		}
	}

	// Split the run we found if it's larger than max but hold on to
	// our original length, since we may need it later.
	size := run
	if size > uint(max) {
		size = uint(max)
	}
	start := end - size

	// Each huge page is guaranteed to fit in a single palloc chunk.
	//
	// TODO(mknyszek): Support larger huge page sizes.
	// TODO(mknyszek): Consider taking pages-per-huge-page as a parameter
	// so we can write tests for this.
	if physHugePageSize > pageSize && physHugePageSize > physPageSize {
		// We have huge pages, so let's ensure we don't break one by scavenging
		// over a huge page boundary. If the range [start, start+size) overlaps with
		// a free-and-unscavenged huge page, we want to grow the region we scavenge
		// to include that huge page.

		// Compute the huge page boundary above our candidate.
		pagesPerHugePage := uintptr(physHugePageSize / pageSize)
		hugePageAbove := uint(alignUp(uintptr(start), pagesPerHugePage))

		// If that boundary is within our current candidate, then we may be breaking
		// a huge page.
		if hugePageAbove <= end {
			// Compute the huge page boundary below our candidate.
			hugePageBelow := uint(alignDown(uintptr(start), pagesPerHugePage))

			if hugePageBelow >= end-run {
				// We're in danger of breaking apart a huge page since start+size crosses
				// a huge page boundary and rounding down start to the nearest huge
				// page boundary is included in the full run we found. Include the entire
				// huge page in the bound by rounding down to the huge page size.
				size = size + (start - hugePageBelow)
				start = hugePageBelow
			}
		}
	}
	return start, size
}

// scavengeIndex is a structure for efficiently managing which pageAlloc chunks have
// memory available to scavenge.
type scavengeIndex struct {
	// chunks is a bitmap representing the entire address space. Each bit represents
	// a single chunk, and a 1 value indicates the presence of pages available for
	// scavenging. Updates to the bitmap are serialized by the pageAlloc lock.
	//
	// The underlying storage of chunks is platform dependent and may not even be
	// totally mapped read/write. min and max reflect the extent that is safe to access.
	// min is inclusive, max is exclusive.
	//
	// searchAddr is the maximum address (in the offset address space, so we have a linear
	// view of the address space; see mranges.go:offAddr) containing memory available to
	// scavenge. It is a hint to the find operation to avoid O(n^2) behavior in repeated lookups.
	//
	// searchAddr is always inclusive and should be the base address of the highest runtime
	// page available for scavenging.
	//
	// searchAddr is managed by both find and mark.
	//
	// Normally, find monotonically decreases searchAddr as it finds no more free pages to
	// scavenge. However, mark, when marking a new chunk at an index greater than the current
	// searchAddr, sets searchAddr to the *negative* index into chunks of that page. The trick here
	// is that concurrent calls to find will fail to monotonically decrease searchAddr, and so they
	// won't barge over new memory becoming available to scavenge. Furthermore, this ensures
	// that some future caller of find *must* observe the new high index. That caller
	// (or any other racing with it), then makes searchAddr positive before continuing, bringing
	// us back to our monotonically decreasing steady-state.
	//
	// A pageAlloc lock serializes updates between min, max, and searchAddr, so abs(searchAddr)
	// is always guaranteed to be >= min and < max (converted to heap addresses).
	//
	// TODO(mknyszek): Ideally we would use something bigger than a uint8 for faster
	// iteration like uint32, but we lack the bit twiddling intrinsics. We'd need to either
	// copy them from math/bits or fix the fact that we can't import math/bits' code from
	// the runtime due to compiler instrumentation.
	searchAddr atomicOffAddr
	chunks     []atomic.Uint8
	minHeapIdx atomic.Int32
	min, max   atomic.Int32
}

// find returns the highest chunk index that may contain pages available to scavenge.
// It also returns an offset to start searching in the highest chunk.
func (s *scavengeIndex) find() (chunkIdx, uint) {
	searchAddr, marked := s.searchAddr.Load()
	if searchAddr == minOffAddr.addr() {
		// We got a cleared search addr.
		return 0, 0
	}

	// Starting from searchAddr's chunk, and moving down to minHeapIdx,
	// iterate until we find a chunk with pages to scavenge.
	min := s.minHeapIdx.Load()
	searchChunk := chunkIndex(uintptr(searchAddr))
	start := int32(searchChunk / 8)
	for i := start; i >= min; i-- {
		// Skip over irrelevant address space.
		chunks := s.chunks[i].Load()
		if chunks == 0 {
			continue
		}
		// Note that we can't have 8 leading zeroes here because
		// we necessarily skipped that case. So, what's left is
		// an index. If there are no zeroes, we want the 7th
		// index, if 1 zero, the 6th, and so on.
		n := 7 - sys.LeadingZeros8(chunks)
		ci := chunkIdx(uint(i)*8 + uint(n))
		if searchChunk == ci {
			return ci, chunkPageIndex(uintptr(searchAddr))
		}
		// Try to reduce searchAddr to newSearchAddr.
		newSearchAddr := chunkBase(ci) + pallocChunkBytes - pageSize
		if marked {
			// Attempt to be the first one to decrease the searchAddr
			// after an increase. If we fail, that means there was another
			// increase, or somebody else got to it before us. Either way,
			// it doesn't matter. We may lose some performance having an
			// incorrect search address, but it's far more important that
			// we don't miss updates.
			s.searchAddr.StoreUnmark(searchAddr, newSearchAddr)
		} else {
			// Decrease searchAddr.
			s.searchAddr.StoreMin(newSearchAddr)
		}
		return ci, pallocChunkPages - 1
	}
	// Clear searchAddr, because we've exhausted the heap.
	s.searchAddr.Clear()
	return 0, 0
}

// mark sets the inclusive range of chunks between indices start and end as
// containing pages available to scavenge.
//
// Must be serialized with other mark, markRange, and clear calls.
func (s *scavengeIndex) mark(base, limit uintptr) {
	start, end := chunkIndex(base), chunkIndex(limit-pageSize)
	if start == end {
		// Within a chunk.
		mask := uint8(1 << (start % 8))
		s.chunks[start/8].Or(mask)
	} else if start/8 == end/8 {
		// Within the same byte in the index.
		mask := uint8(uint16(1<<(end-start+1))-1) << (start % 8)
		s.chunks[start/8].Or(mask)
	} else {
		// Crosses multiple bytes in the index.
		startAligned := chunkIdx(alignUp(uintptr(start), 8))
		endAligned := chunkIdx(alignDown(uintptr(end), 8))

		// Do the end of the first byte first.
		if width := startAligned - start; width > 0 {
			mask := uint8(uint16(1<<width)-1) << (start % 8)
			s.chunks[start/8].Or(mask)
		}
		// Do the middle aligned sections that take up a whole
		// byte.
		for ci := startAligned; ci < endAligned; ci += 8 {
			s.chunks[ci/8].Store(^uint8(0))
		}
		// Do the end of the last byte.
		//
		// This width check doesn't match the one above
		// for start because aligning down into the endAligned
		// block means we always have at least one chunk in this
		// block (note that end is *inclusive*). This also means
		// that if end == endAligned+n, then what we really want
		// is to fill n+1 chunks, i.e. width n+1. By induction,
		// this is true for all n.
		if width := end - endAligned + 1; width > 0 {
			mask := uint8(uint16(1<<width) - 1)
			s.chunks[end/8].Or(mask)
		}
	}
	newSearchAddr := limit - pageSize
	searchAddr, _ := s.searchAddr.Load()
	// N.B. Because mark is serialized, it's not necessary to do a
	// full CAS here. mark only ever increases searchAddr, while
	// find only ever decreases it. Since we only ever race with
	// decreases, even if the value we loaded is stale, the actual
	// value will never be larger.
	if (offAddr{searchAddr}).lessThan(offAddr{newSearchAddr}) {
		s.searchAddr.StoreMarked(newSearchAddr)
	}
}
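
// A small worked example for mark above (the chunk indices are hypothetical):
// if base and limit correspond to chunks 10 through 13, then start/8 == end/8 == 1,
// so mark takes the "same byte" path and ORs
// uint8(uint16(1<<(13-10+1))-1) << (10%8) == 0b1111 << 2 == 0x3c into s.chunks[1].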

// clear sets the chunk at index ci as not containing pages available to scavenge.
//
// Must be serialized with other mark, markRange, and clear calls.
func (s *scavengeIndex) clear(ci chunkIdx) {
	s.chunks[ci/8].And(^uint8(1 << (ci % 8)))
}

type piController struct {
	kp float64 // Proportional constant.
	ti float64 // Integral time constant.
	tt float64 // Reset time.

	min, max float64 // Output boundaries.

	// PI controller state.

	errIntegral float64 // Integral of the error from t=0 to now.

	// Error flags.
	errOverflow   bool // Set if errIntegral ever overflowed.
	inputOverflow bool // Set if an operation with the input overflowed.
}

// next provides a new sample to the controller.
//
// input is the sample, setpoint is the desired point, and period is how much
// time (in whatever unit makes the most sense) has passed since the last sample.
//
// Returns a new value for the variable it's controlling, and whether the operation
// completed successfully. One reason this might fail is if error has been growing
// in an unbounded manner, to the point of overflow.
//
// In the specific case where an error overflow occurs, the errOverflow field will be
// set and the rest of the controller's internal state will be fully reset.
func (c *piController) next(input, setpoint, period float64) (float64, bool) {
	// Compute the raw output value.
	prop := c.kp * (setpoint - input)
	rawOutput := prop + c.errIntegral

	// Clamp rawOutput into output.
	output := rawOutput
	if isInf(output) || isNaN(output) {
		// The input had a large enough magnitude that either it was already
		// overflowed, or some operation with it overflowed.
		// Set a flag and reset. That's the safest thing to do.
		c.reset()
		c.inputOverflow = true
		return c.min, false
	}
	if output < c.min {
		output = c.min
	} else if output > c.max {
		output = c.max
	}

	// Update the controller's state.
	if c.ti != 0 && c.tt != 0 {
		c.errIntegral += (c.kp*period/c.ti)*(setpoint-input) + (period/c.tt)*(output-rawOutput)
		if isInf(c.errIntegral) || isNaN(c.errIntegral) {
			// So much error has accumulated that we managed to overflow.
			// The assumptions around the controller have likely broken down.
			// Set a flag and reset. That's the safest thing to do.
			c.reset()
			c.errOverflow = true
			return c.min, false
		}
	}
	return output, true
}

// reset resets the controller state, except for controller error flags.
func (c *piController) reset() {
	c.errIntegral = 0
}
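
// A short worked example of piController.next (purely illustrative, using the
// scavenger's own tuning): with kp = 0.3375, an input of 0.00025 (0.025% CPU)
// and a setpoint of 0.01 (1%), the proportional term is 0.3375 * 0.00975 ≈ 0.0033.
// On the first sample errIntegral is zero, so the raw output is about 0.0033,
// which lies inside [min, max] and is returned unchanged as the new sleepRatio
// while the integral term begins to accumulate.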