// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Scavenging free pages.
//
// This file implements scavenging (the release of physical pages backing mapped
// memory) of free and unused pages in the heap as a way to deal with page-level
// fragmentation and reduce the RSS of Go applications.
//
// Scavenging in Go happens on two fronts: there's the background
// (asynchronous) scavenger and the heap-growth (synchronous) scavenger.
//
// The former happens on a goroutine much like the background sweeper which is
// soft-capped at using scavengePercent of the mutator's time, based on
// order-of-magnitude estimates of the costs of scavenging. The background
// scavenger's primary goal is to bring the estimated heap RSS of the
// application down to a goal.
//
// Before we consider what this looks like, we need to split the world into two
// halves. One in which a memory limit is not set, and one in which it is.
//
// For the former, the goal is defined as:
//	(retainExtraPercent+100) / 100 * (heapGoal / lastHeapGoal) * lastHeapInUse
//
// Essentially, we wish to have the application's RSS track the heap goal, but
// the heap goal is defined in terms of bytes of objects, rather than pages like
// RSS. As a result, we need to account for fragmentation internal to
// spans. heapGoal / lastHeapGoal defines the ratio between the current heap goal
// and the last heap goal, which tells us by how much the heap is growing and
// shrinking. We estimate what the heap will grow to in terms of pages by taking
// this ratio and multiplying it by heapInUse at the end of the last GC, which
// allows us to account for this additional fragmentation. Note that this
// procedure makes the assumption that the degree of fragmentation won't change
// dramatically over the next GC cycle. Overestimating the amount of
// fragmentation simply results in higher memory use, which will be accounted
// for by the next pacing update. Underestimating the fragmentation, however,
// could lead to performance degradation. Handling this case is not within the
// scope of the scavenger. Situations where the amount of fragmentation balloons
// over the course of a single GC cycle should be considered pathologies,
// flagged as bugs, and fixed appropriately.
//
// An additional factor of retainExtraPercent is added as a buffer to help ensure
// that there's more unscavenged memory to allocate out of, since each allocation
// out of scavenged memory incurs a potentially expensive page fault.
//
// If a memory limit is set, then we wish to pick a scavenge goal that maintains
// that memory limit. For that, we look at total memory that has been committed
// (memstats.mappedReady) and try to bring that down below the limit. In this case,
// we want to give buffer space in the *opposite* direction. When the application
// is close to the limit, we want to make sure we push harder to keep it under, so
// if we target below the memory limit, we ensure that the background scavenger is
// giving the situation the urgency it deserves.
//
// In this case, the goal is defined as:
//	(100-reduceExtraPercent) / 100 * memoryLimit
//
// We compute both of these goals, and check whether either of them has been met.
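//
// As a rough worked example (illustrative numbers only, not measurements): with
// lastHeapInUse = 120 MiB, heapGoal = 110 MiB, lastHeapGoal = 100 MiB, and
// retainExtraPercent = 10, the gcPercent goal comes out to roughly
// 1.10 * 1.10 * 120 MiB ≈ 145 MiB of retained heap memory. With
// memoryLimit = 1 GiB and reduceExtraPercent = 5, the memory-limit goal is
// 0.95 * 1024 MiB ≈ 972 MiB of mapped memory.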
// The background scavenger continues operating as long as either one of the goals
// has not been met.
//
// The goals are updated after each GC.
//
// The synchronous heap-growth scavenging happens whenever the heap grows in
// size, for some definition of heap-growth. The intuition behind this is that
// the application had to grow the heap because existing fragments were
// not sufficiently large to satisfy a page-level memory allocation, so we
// scavenge those fragments eagerly to offset the growth in RSS that results.

package runtime

import (
	"internal/goos"
	"runtime/internal/atomic"
	"runtime/internal/sys"
	"unsafe"
)

const (
	// The background scavenger is paced according to these parameters.
	//
	// scavengePercent represents the portion of mutator time we're willing
	// to spend on scavenging in percent.
	scavengePercent = 1 // 1%

	// retainExtraPercent represents the amount of memory over the heap goal
	// that the scavenger should keep as a buffer space for the allocator.
	// This constant is used when we do not have a memory limit set.
	//
	// The purpose of maintaining this overhead is to have a greater pool of
	// unscavenged memory available for allocation (since using scavenged memory
	// incurs an additional cost), to account for heap fragmentation and
	// the ever-changing layout of the heap.
	retainExtraPercent = 10

	// reduceExtraPercent represents the amount of memory under the limit
	// that the scavenger should target. For example, 5 means we target 95%
	// of the limit.
	//
	// The purpose of shooting lower than the limit is to ensure that, once
	// close to the limit, the scavenger is working hard to maintain it. If
	// we have a memory limit set but are far away from it, there's no harm
	// in leaving up to 100-retainExtraPercent live, and it's more efficient
	// anyway, for the same reasons that retainExtraPercent exists.
	reduceExtraPercent = 5

	// maxPagesPerPhysPage is the maximum number of supported runtime pages per
	// physical page, based on maxPhysPageSize.
	maxPagesPerPhysPage = maxPhysPageSize / pageSize

	// scavengeCostRatio is the approximate ratio between the costs of using previously
	// scavenged memory and scavenging memory.
	//
	// For most systems the cost of scavenging greatly outweighs the costs
	// associated with using scavenged memory, making this constant 0. On other systems
	// (especially ones where "sysUsed" is not just a no-op) this cost is non-trivial.
	//
	// This ratio is used as part of a multiplicative factor to help the scavenger account
	// for the additional costs of using scavenged memory in its pacing.
	scavengeCostRatio = 0.7 * (goos.IsDarwin + goos.IsIos)
)

// heapRetained returns an estimate of the current heap RSS.
func heapRetained() uint64 {
	return gcController.heapInUse.load() + gcController.heapFree.load()
}

// gcPaceScavenger updates the scavenger's pacing, particularly
// its rate and RSS goal. For this, it requires the current heapGoal,
// and the heapGoal for the previous GC cycle.
//
// The RSS goal is based on the current heap goal with a small overhead
// to accommodate non-determinism in the allocator.
//
// The pacing is based on scavengePageRate, which applies to both regular and
// huge pages. See that constant for more information.
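//
// A goal that is already satisfied or that does not apply (for example, before
// the first GC completes, or when mapped memory is already under the limit) is
// stored as ^uint64(0), which the background scavenger treats as "nothing to do"
// for that goal.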
//
// Must be called whenever GC pacing is updated.
//
// mheap_.lock must be held or the world must be stopped.
func gcPaceScavenger(memoryLimit int64, heapGoal, lastHeapGoal uint64) {
	assertWorldStoppedOrLockHeld(&mheap_.lock)

	// As described at the top of this file, there are two scavenge goals here: one
	// for gcPercent and one for memoryLimit. Let's handle the latter first because
	// it's simpler.

	// We want to target retaining (100-reduceExtraPercent)% of the heap.
	memoryLimitGoal := uint64(float64(memoryLimit) * (100.0 - reduceExtraPercent) / 100.0)

	// mappedReady is comparable to memoryLimit, and represents how much total memory
	// the Go runtime has committed now (estimated).
	mappedReady := gcController.mappedReady.Load()

	// If we're below the goal already, indicate that we don't need the background
	// scavenger for the memory limit. This may seem worrisome at first, but note
	// that the allocator will assist the background scavenger in the face of a memory
	// limit, so we'll be safe even if we stop the scavenger when we shouldn't have.
	if mappedReady <= memoryLimitGoal {
		scavenge.memoryLimitGoal.Store(^uint64(0))
	} else {
		scavenge.memoryLimitGoal.Store(memoryLimitGoal)
	}

	// Now handle the gcPercent goal.

	// If we're called before the first GC completed, disable scavenging.
	// We never scavenge before the 2nd GC cycle anyway (we don't have enough
	// information about the heap yet) so this is fine, and avoids a fault
	// or garbage data later.
	if lastHeapGoal == 0 {
		scavenge.gcPercentGoal.Store(^uint64(0))
		return
	}
	// Compute our scavenging goal.
	goalRatio := float64(heapGoal) / float64(lastHeapGoal)
	gcPercentGoal := uint64(float64(memstats.lastHeapInUse) * goalRatio)
	// Add retainExtraPercent overhead to retainedGoal. This calculation
	// looks strange but the purpose is to arrive at an integer division
	// (e.g. if retainExtraPercent = 12.5, then we get a divisor of 8)
	// that also avoids the overflow from a multiplication.
	gcPercentGoal += gcPercentGoal / (1.0 / (retainExtraPercent / 100.0))
	// Align it to a physical page boundary to make the following calculations
	// a bit more exact.
	gcPercentGoal = (gcPercentGoal + uint64(physPageSize) - 1) &^ (uint64(physPageSize) - 1)

	// Represents where we are now in the heap's contribution to RSS in bytes.
	//
	// Guaranteed to always be a multiple of physPageSize on systems where
	// physPageSize <= pageSize, since we map new heap memory at a size larger than
	// any physPageSize and release memory in multiples of the physPageSize.
	//
	// However, certain functions recategorize heap memory as other stats (e.g.
	// stacks) and this happens in multiples of pageSize, so on systems
	// where physPageSize > pageSize the calculations below will not be exact.
	// Generally this is OK since we'll be off by at most one regular
	// physical page.
	heapRetainedNow := heapRetained()

	// If we're already below our goal, or within one page of our goal, then indicate
	// that we don't need the background scavenger for maintaining a memory overhead
	// proportional to the heap goal.
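	//
	// (A shortfall smaller than one physical page is treated as met because
	// memory can only be returned to the OS in whole physical pages, and the
	// goal above was already rounded up to a physical page boundary.)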
	if heapRetainedNow <= gcPercentGoal || heapRetainedNow-gcPercentGoal < uint64(physPageSize) {
		scavenge.gcPercentGoal.Store(^uint64(0))
	} else {
		scavenge.gcPercentGoal.Store(gcPercentGoal)
	}
}

var scavenge struct {
	// gcPercentGoal is the amount of retained heap memory (measured by
	// heapRetained) that the runtime will try to maintain by returning
	// memory to the OS. This goal is derived from gcController.gcPercent
	// by choosing to retain enough memory to allocate heap memory up to
	// the heap goal.
	gcPercentGoal atomic.Uint64

	// memoryLimitGoal is the amount of memory retained by the runtime
	// (measured by gcController.mappedReady) that the runtime will try to
	// maintain by returning memory to the OS. This goal is derived from
	// gcController.memoryLimit by choosing to target the memory limit or
	// some lower target to keep the scavenger working.
	memoryLimitGoal atomic.Uint64
}

const (
	// It doesn't really matter what value we start at, but we can't be zero, because
	// that'll cause divide-by-zero issues. Pick something conservative which we'll
	// also use as a fallback.
	startingScavSleepRatio = 0.001

	// Spend at least 1 ms scavenging, otherwise the corresponding
	// sleep time to maintain our desired utilization is too low to
	// be reliable.
	minScavWorkTime = 1e6
)

// Sleep/wait state of the background scavenger.
var scavenger scavengerState

type scavengerState struct {
	// lock protects all fields below.
	lock mutex

	// g is the goroutine the scavenger is bound to.
	g *g

	// parked is whether or not the scavenger is parked.
	parked bool

	// timer is the timer used for the scavenger to sleep.
	timer *timer

	// sysmonWake signals to sysmon that it should wake the scavenger.
	sysmonWake atomic.Uint32

	// targetCPUFraction is the target CPU overhead for the scavenger.
	targetCPUFraction float64

	// sleepRatio is the ratio of time spent doing scavenging work to
	// time spent sleeping. This is used to decide how long the scavenger
	// should sleep for in between batches of work. It is set by
	// sleepController in order to maintain a CPU overhead of
	// targetCPUFraction.
	//
	// Lower means more sleep, higher means more aggressive scavenging.
	sleepRatio float64

	// sleepController controls sleepRatio.
	//
	// See sleepRatio for more details.
	sleepController piController

	// controllerCooldown is the time left in nanoseconds during which we avoid
	// using the controller and we hold sleepRatio at a conservative
	// value. Used if the controller's assumptions fail to hold.
	controllerCooldown int64

	// printControllerReset instructs printScavTrace to signal that
	// the controller was reset.
	printControllerReset bool

	// sleepStub is a stub used for testing to avoid actually having
	// the scavenger sleep.
	//
	// Unlike the other stubs, this is not populated if left nil.
	// Instead, it is called when non-nil because any valid implementation
	// of this function basically requires closing over this scavenger
	// state, and allocating a closure is not allowed in the runtime as
	// a matter of policy.
	sleepStub func(n int64) int64

	// scavenge is a function that scavenges n bytes of memory.
	// Returns how many bytes of memory it actually scavenged, as
	// well as the time it took in nanoseconds. Usually mheap.pages.scavenge
	// with nanotime called around it, but stubbed out for testing.
	// Like mheap.pages.scavenge, if it scavenges less than n bytes of
	// memory, the caller may assume the heap is exhausted of scavengable
	// memory for now.
	//
	// If this is nil, it is populated with the real thing in init.
	scavenge func(n uintptr) (uintptr, int64)

	// shouldStop is a callback called in the work loop and provides a
	// point that can force the scavenger to stop early, for example because
	// the scavenge policy dictates too much has been scavenged already.
	//
	// If this is nil, it is populated with the real thing in init.
	shouldStop func() bool

	// gomaxprocs returns the current value of gomaxprocs. Stub for testing.
	//
	// If this is nil, it is populated with the real thing in init.
	gomaxprocs func() int32
}

// init initializes a scavenger state and wires it up to the current G.
//
// Must be called from a regular goroutine that can allocate.
func (s *scavengerState) init() {
	if s.g != nil {
		throw("scavenger state is already wired")
	}
	lockInit(&s.lock, lockRankScavenge)
	s.g = getg()

	s.timer = new(timer)
	s.timer.arg = s
	s.timer.f = func(s any, _ uintptr) {
		s.(*scavengerState).wake()
	}

	// input: fraction of CPU time actually used.
	// setpoint: ideal CPU fraction.
	// output: ratio of time worked to time slept (determines sleep time).
	//
	// The output of this controller is somewhat indirect to what we actually
	// want to achieve: how much time to sleep for. The reason for this definition
	// is to ensure that the controller's outputs have a direct relationship with
	// its inputs (as opposed to an inverse relationship), making it somewhat
	// easier to reason about for tuning purposes.
	s.sleepController = piController{
		// Tuned loosely via Ziegler-Nichols process.
		kp: 0.3375,
		ti: 3.2e6,
		tt: 1e9, // 1 second reset time.

		// These ranges seem wide, but we want to give the controller plenty of
		// room to hunt for the optimal value.
		min: 0.001,  // 1:1000
		max: 1000.0, // 1000:1
	}
	s.sleepRatio = startingScavSleepRatio

	// Install real functions if stubs aren't present.
	if s.scavenge == nil {
		s.scavenge = func(n uintptr) (uintptr, int64) {
			start := nanotime()
			r := mheap_.pages.scavenge(n, nil)
			end := nanotime()
			if start >= end {
				return r, 0
			}
			return r, end - start
		}
	}
	if s.shouldStop == nil {
		s.shouldStop = func() bool {
			// If background scavenging is disabled or if there's no work to do, just stop.
			return heapRetained() <= scavenge.gcPercentGoal.Load() &&
				(!go119MemoryLimitSupport ||
					gcController.mappedReady.Load() <= scavenge.memoryLimitGoal.Load())
		}
	}
	if s.gomaxprocs == nil {
		s.gomaxprocs = func() int32 {
			return gomaxprocs
		}
	}
}

// park parks the scavenger goroutine.
func (s *scavengerState) park() {
	lock(&s.lock)
	if getg() != s.g {
		throw("tried to park scavenger from another goroutine")
	}
	s.parked = true
	goparkunlock(&s.lock, waitReasonGCScavengeWait, traceEvGoBlock, 2)
}

// ready signals to sysmon that the scavenger should be awoken.
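//
// sysmon observes the sysmonWake flag on its periodic checks and calls wake
// on the scavenger's behalf (see the sysmon loop in proc.go).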
func (s *scavengerState) ready() {
	s.sysmonWake.Store(1)
}

// wake immediately unparks the scavenger if necessary.
//
// Safe to run without a P.
func (s *scavengerState) wake() {
	lock(&s.lock)
	if s.parked {
		// Unset sysmonWake, since the scavenger is now being awoken.
		s.sysmonWake.Store(0)

		// s.parked is unset to prevent a double wake-up.
		s.parked = false

		// Ready the goroutine by injecting it. We use injectglist instead
		// of ready or goready in order to allow us to run this function
		// without a P. injectglist also avoids placing the goroutine in
		// the current P's runnext slot, which is desirable to prevent
		// the scavenger from interfering with user goroutine scheduling
		// too much.
		var list gList
		list.push(s.g)
		injectglist(&list)
	}
	unlock(&s.lock)
}

// sleep puts the scavenger to sleep based on the amount of time that it worked
// in nanoseconds.
//
// Note that this function should only be called by the scavenger.
//
// The scavenger may be woken up earlier by a pacing change, and it may not go
// to sleep at all if there's a pending pacing change.
func (s *scavengerState) sleep(worked float64) {
	lock(&s.lock)
	if getg() != s.g {
		throw("tried to sleep scavenger from another goroutine")
	}

	if worked < minScavWorkTime {
		// This means there wasn't enough work to actually fill up minScavWorkTime.
		// That's fine; we shouldn't try to do anything with this information
		// because it's going to result in a short enough sleep request that things
		// will get messy. Just assume we did at least this much work.
		// All this means is that we'll sleep longer than we otherwise would have.
		worked = minScavWorkTime
	}

	// Multiply the critical time by 1 + the ratio of the costs of using
	// scavenged memory vs. scavenging memory. This forces us to pay down
	// the cost of reusing this memory eagerly by sleeping for a longer period
	// of time and scavenging less frequently. More concretely, we avoid situations
	// where we end up scavenging so often that we hurt allocation performance
	// because of the additional overheads of using scavenged memory.
	worked *= 1 + scavengeCostRatio

	// sleepTime is the amount of time we're going to sleep, based on the amount
	// of time we worked, and the sleepRatio.
	sleepTime := int64(worked / s.sleepRatio)

	var slept int64
	if s.sleepStub == nil {
		// Set the timer.
		//
		// This must happen here instead of inside gopark
		// because we can't close over any variables without
		// failing escape analysis.
		start := nanotime()
		resetTimer(s.timer, start+sleepTime)

		// Mark ourselves as asleep and go to sleep.
		s.parked = true
		goparkunlock(&s.lock, waitReasonSleep, traceEvGoSleep, 2)

		// How long we actually slept for.
		slept = nanotime() - start

		lock(&s.lock)
		// Stop the timer here because s.wake is unable to do it for us.
		// We don't really care if we succeed in stopping the timer. One
		// reason we might fail is that we've already woken up, but the timer
		// might be in the process of firing on some other P; essentially we're
		// racing with it. That's totally OK. Double wake-ups are perfectly safe.
		stopTimer(s.timer)
		unlock(&s.lock)
	} else {
		unlock(&s.lock)
		slept = s.sleepStub(sleepTime)
	}

	// Stop here if we're cooling down from the controller.
	if s.controllerCooldown > 0 {
		// worked and slept aren't exact measures of time, but it's OK to be a bit
		// sloppy here. We're just hoping we're avoiding some transient bad behavior.
		t := slept + int64(worked)
		if t > s.controllerCooldown {
			s.controllerCooldown = 0
		} else {
			s.controllerCooldown -= t
		}
		return
	}

	// idealFraction is the ideal % of overall application CPU time that we
	// spend scavenging.
	idealFraction := float64(scavengePercent) / 100.0

	// Calculate the CPU time spent.
	//
	// This may be slightly inaccurate with respect to GOMAXPROCS, but we're
	// recomputing this often enough relative to GOMAXPROCS changes in general
	// (it only changes when the world is stopped, and not during a GC) that
	// that small inaccuracy is in the noise.
	cpuFraction := worked / ((float64(slept) + worked) * float64(s.gomaxprocs()))

	// Update the sleepRatio, adjusting until we reach our ideal fraction.
	var ok bool
	s.sleepRatio, ok = s.sleepController.next(cpuFraction, idealFraction, float64(slept)+worked)
	if !ok {
		// The core assumption of the controller, that we can get a proportional
		// response, broke down. This may be transient, so temporarily switch to
		// sleeping a fixed, conservative amount.
		s.sleepRatio = startingScavSleepRatio
		s.controllerCooldown = 5e9 // 5 seconds.

		// Signal the scav trace printer to output this.
		s.controllerFailed()
	}
}

// controllerFailed indicates that the scavenger's scheduling
// controller failed.
func (s *scavengerState) controllerFailed() {
	lock(&s.lock)
	s.printControllerReset = true
	unlock(&s.lock)
}

// run is the body of the main scavenging loop.
//
// Returns the number of bytes released and the estimated time spent
// releasing those bytes.
//
// Must be run on the scavenger goroutine.
func (s *scavengerState) run() (released uintptr, worked float64) {
	lock(&s.lock)
	if getg() != s.g {
		throw("tried to run scavenger from another goroutine")
	}
	unlock(&s.lock)

	for worked < minScavWorkTime {
		// If something from outside tells us to stop early, stop.
		if s.shouldStop() {
			break
		}

		// scavengeQuantum is the amount of memory we try to scavenge
		// in one go. A smaller value means the scavenger is more responsive
		// to the scheduler in case of e.g. preemption. A larger value means
		// that the overheads of scavenging are better amortized, so better
		// scavenging throughput.
		//
		// The current value is chosen assuming a cost of ~10µs/physical page
		// (this is somewhat pessimistic), which implies a worst-case latency of
		// about 160µs for 4 KiB physical pages. The current value is biased
		// toward latency over throughput.
		const scavengeQuantum = 64 << 10

		// Accumulate the amount of time spent scavenging.
		r, duration := s.scavenge(scavengeQuantum)

		// On some platforms we may see start >= end if the time it takes to scavenge
		// memory is less than the minimum granularity of its clock (e.g. Windows) or
		// due to clock bugs.
		//
		// In this case, just assume scavenging takes 10 µs per regular physical page
		// (determined empirically), and conservatively ignore the impact of huge pages
		// on timing.
		const approxWorkedNSPerPhysicalPage = 10e3
		if duration == 0 {
			worked += approxWorkedNSPerPhysicalPage * float64(r/physPageSize)
		} else {
			// TODO(mknyszek): If duration is small compared to worked, it could be
			// rounded down to zero. Probably not a problem in practice because the
			// values are all within a few orders of magnitude of each other but maybe
			// worth worrying about.
			worked += float64(duration)
		}
		released += r

		// scavenge does not return until it either finds the requisite amount of
		// memory to scavenge, or exhausts the heap. If we haven't found enough
		// to scavenge, then the heap must be exhausted.
		if r < scavengeQuantum {
			break
		}
		// When using fake time just do one loop.
		if faketime != 0 {
			break
		}
	}
	if released > 0 && released < physPageSize {
		// If this happens, it means that we may have attempted to release part
		// of a physical page, but the likely effect of that is that it released
		// the whole physical page, some of which may have still been in-use.
		// This could lead to memory corruption. Throw.
		throw("released less than one physical page of memory")
	}
	return
}

// Background scavenger.
//
// The background scavenger maintains the RSS of the application below
// the line described by the proportional scavenging statistics in
// the mheap struct.
func bgscavenge(c chan int) {
	scavenger.init()

	c <- 1
	scavenger.park()

	for {
		released, workTime := scavenger.run()
		if released == 0 {
			scavenger.park()
			continue
		}
		atomic.Xadduintptr(&mheap_.pages.scav.released, released)
		scavenger.sleep(workTime)
	}
}

// scavenge scavenges nbytes worth of free pages, starting with the
// highest address first. Successive calls continue from where it left
// off until the heap is exhausted. Call scavengeStartGen to bring it
// back to the top of the heap.
//
// Returns the amount of memory scavenged in bytes.
//
// scavenge always tries to scavenge nbytes worth of memory, and will
// only fail to do so if the heap is exhausted for now.
func (p *pageAlloc) scavenge(nbytes uintptr, shouldStop func() bool) uintptr {
	released := uintptr(0)
	for released < nbytes {
		ci, pageIdx := p.scav.index.find()
		if ci == 0 {
			break
		}
		systemstack(func() {
			released += p.scavengeOne(ci, pageIdx, nbytes-released)
		})
		if shouldStop != nil && shouldStop() {
			break
		}
	}
	return released
}

// printScavTrace prints a scavenge trace line to standard error.
//
// released should be the amount of memory released since the last time this
// was called, and forced indicates whether the scavenge was forced by the
// application.
//
// scavenger.lock must be held.
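//
// For reference, a trace line (emitted when the scavtrace GODEBUG flag is set)
// looks roughly like:
//
//	scav 128 KiB work, 4096 KiB total, 87% util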
func printScavTrace(released uintptr, forced bool) {
	assertLockHeld(&scavenger.lock)

	printlock()
	print("scav ",
		released>>10, " KiB work, ",
		gcController.heapReleased.load()>>10, " KiB total, ",
		(gcController.heapInUse.load()*100)/heapRetained(), "% util",
	)
	if forced {
		print(" (forced)")
	} else if scavenger.printControllerReset {
		print(" [controller reset]")
		scavenger.printControllerReset = false
	}
	println()
	printunlock()
}

// scavengeOne walks over the chunk at chunk index ci and searches for
// a contiguous run of pages to scavenge. It will try to scavenge
// at most max bytes at once, but may scavenge more to avoid
// breaking huge pages. Once it scavenges some memory it returns
// how much it scavenged in bytes.
//
// searchIdx is the page index to start searching from in ci.
//
// Returns the number of bytes scavenged.
//
// Must run on the systemstack because it acquires p.mheapLock.
//
//go:systemstack
func (p *pageAlloc) scavengeOne(ci chunkIdx, searchIdx uint, max uintptr) uintptr {
	// Calculate the maximum number of pages to scavenge.
	//
	// This should be alignUp(max, pageSize) / pageSize but max can and will
	// be ^uintptr(0), so we need to be very careful not to overflow here.
	// Rather than use alignUp, calculate the number of pages rounded down
	// first, then add back one if necessary.
	maxPages := max / pageSize
	if max%pageSize != 0 {
		maxPages++
	}

	// Calculate the minimum number of pages we can scavenge.
	//
	// Because we can only scavenge whole physical pages, we must
	// ensure that we scavenge at least minPages each time, aligned
	// to minPages*pageSize.
	minPages := physPageSize / pageSize
	if minPages < 1 {
		minPages = 1
	}

	lock(p.mheapLock)
	if p.summary[len(p.summary)-1][ci].max() >= uint(minPages) {
		// We only bother looking for a candidate if there are at least
		// minPages free pages at all.
		base, npages := p.chunkOf(ci).findScavengeCandidate(pallocChunkPages-1, minPages, maxPages)

		// If we found something, scavenge it and return!
		if npages != 0 {
			// Compute the full address for the start of the range.
			addr := chunkBase(ci) + uintptr(base)*pageSize

			// Mark the range we're about to scavenge as allocated, because
			// we don't want any allocating goroutines to grab it while
			// the scavenging is in progress.
			if scav := p.allocRange(addr, uintptr(npages)); scav != 0 {
				throw("double scavenge")
			}

			// With that done, it's safe to unlock.
			unlock(p.mheapLock)

			if !p.test {
				// Only perform the actual scavenging if we're not in a test.
				// It's dangerous to do so otherwise.
				sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize)

				// Update global accounting only when not in test, otherwise
				// the runtime's accounting will be wrong.
				nbytes := int64(npages) * pageSize
				gcController.heapReleased.add(nbytes)
				gcController.heapFree.add(-nbytes)

				stats := memstats.heapStats.acquire()
				atomic.Xaddint64(&stats.committed, -nbytes)
				atomic.Xaddint64(&stats.released, nbytes)
				memstats.heapStats.release()
			}

			// Relock the heap, because now we need to make these pages
			// available for allocation. Free them back to the page allocator.
			lock(p.mheapLock)
			p.free(addr, uintptr(npages), true)

			// Mark the range as scavenged.
			p.chunkOf(ci).scavenged.setRange(base, npages)
			unlock(p.mheapLock)

			return uintptr(npages) * pageSize
		}
	}
	// Mark this chunk as having no free pages.
	p.scav.index.clear(ci)
	unlock(p.mheapLock)

	return 0
}

// fillAligned returns x but with all zeroes in m-aligned
// groups of m bits set to 1 if any bit in the group is non-zero.
//
// For example, fillAligned(0x0100a3, 8) == 0xff00ff.
//
// Note that if m == 1, this is a no-op.
//
// m must be a power of 2 <= maxPagesPerPhysPage.
func fillAligned(x uint64, m uint) uint64 {
	apply := func(x uint64, c uint64) uint64 {
		// The technique used here is derived from
		// https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
		// and extended for more than just bytes (like nibbles
		// and uint16s) by using an appropriate constant.
		//
		// To summarize the technique, quoting from that page:
		// "[It] works by first zeroing the high bits of the [8]
		// bytes in the word. Subsequently, it adds a number that
		// will result in an overflow to the high bit of a byte if
		// any of the low bits were initially set. Next the high
		// bits of the original word are ORed with these values;
		// thus, the high bit of a byte is set iff any bit in the
		// byte was set. Finally, we determine if any of these high
		// bits are zero by ORing with ones everywhere except the
		// high bits and inverting the result."
		return ^((((x & c) + c) | x) | c)
	}
	// Transform x to contain a 1 bit at the top of each m-aligned
	// group of m zero bits.
	switch m {
	case 1:
		return x
	case 2:
		x = apply(x, 0x5555555555555555)
	case 4:
		x = apply(x, 0x7777777777777777)
	case 8:
		x = apply(x, 0x7f7f7f7f7f7f7f7f)
	case 16:
		x = apply(x, 0x7fff7fff7fff7fff)
	case 32:
		x = apply(x, 0x7fffffff7fffffff)
	case 64: // == maxPagesPerPhysPage
		x = apply(x, 0x7fffffffffffffff)
	default:
		throw("bad m value")
	}
	// Now, the top bit of each m-aligned group in x is set
	// iff that group was all zero in the original x.

	// From each group of m bits subtract 1.
	// Because we know only the top bits of each
	// m-aligned group are set, we know this will
	// set each group to have all the bits set except
	// the top bit, so just OR with the original
	// result to set all the bits.
	return ^((x - (x >> (m - 1))) | x)
}

// findScavengeCandidate returns a start index and a size for this pallocData
// segment which represents a contiguous region of free and unscavenged memory.
//
// searchIdx indicates the page index within this chunk to start the search, but
// note that findScavengeCandidate searches backwards through the pallocData. As
// a result, it will return the highest scavenge candidate in address order.
//
// min indicates a hard minimum size and alignment for runs of pages. That is,
// findScavengeCandidate will not return a region smaller than min pages in size,
// or that is min pages or greater in size but not aligned to min. min must be
// a non-zero power of 2 <= maxPagesPerPhysPage.
//
// max is a hint for how big of a region is desired. If max >= pallocChunkPages, then
// findScavengeCandidate effectively returns entire free and unscavenged regions.
// If max < pallocChunkPages, it may truncate the returned region such that size is
// max.
// However, findScavengeCandidate may still return a larger region if, for
// example, it chooses to preserve huge pages, or if max is not aligned to min (it
// will round up). That is, even if max is small, the returned size is not guaranteed
// to be equal to max. max is allowed to be less than min, in which case it is as if
// max == min.
func (m *pallocData) findScavengeCandidate(searchIdx uint, min, max uintptr) (uint, uint) {
	if min&(min-1) != 0 || min == 0 {
		print("runtime: min = ", min, "\n")
		throw("min must be a non-zero power of 2")
	} else if min > maxPagesPerPhysPage {
		print("runtime: min = ", min, "\n")
		throw("min too large")
	}
	// max may not be min-aligned, so we might accidentally truncate to
	// a max value which causes us to return a non-min-aligned value.
	// To prevent this, align max up to a multiple of min (which is always
	// a power of 2). This also prevents max from ever being less than
	// min, unless it's zero, so handle that explicitly.
	if max == 0 {
		max = min
	} else {
		max = alignUp(max, min)
	}

	i := int(searchIdx / 64)
	// Start by quickly skipping over blocks of non-free or scavenged pages.
	for ; i >= 0; i-- {
		// 1s are scavenged OR non-free => 0s are unscavenged AND free
		x := fillAligned(m.scavenged[i]|m.pallocBits[i], uint(min))
		if x != ^uint64(0) {
			break
		}
	}
	if i < 0 {
		// Failed to find any free/unscavenged pages.
		return 0, 0
	}
	// We have something in the 64-bit chunk at i, but it could
	// extend further. Loop until we find the extent of it.

	// 1s are scavenged OR non-free => 0s are unscavenged AND free
	x := fillAligned(m.scavenged[i]|m.pallocBits[i], uint(min))
	z1 := uint(sys.LeadingZeros64(^x))
	run, end := uint(0), uint(i)*64+(64-z1)
	if x<<z1 != 0 {
		// After shifting out z1 bits, we still have 1s,
		// so the run ends inside this word.
		run = uint(sys.LeadingZeros64(x << z1))
	} else {
		// After shifting out z1 bits, we have no more 1s.
		// This means the run extends to the bottom of the
		// word so it may extend into further words.
		run = 64 - z1
		for j := i - 1; j >= 0; j-- {
			x := fillAligned(m.scavenged[j]|m.pallocBits[j], uint(min))
			run += uint(sys.LeadingZeros64(x))
			if x != 0 {
				// The run stopped in this word.
				break
			}
		}
	}

	// Split the run we found if it's larger than max but hold on to
	// our original length, since we may need it later.
	size := run
	if size > uint(max) {
		size = uint(max)
	}
	start := end - size

	// Each huge page is guaranteed to fit in a single palloc chunk.
	//
	// TODO(mknyszek): Support larger huge page sizes.
	// TODO(mknyszek): Consider taking pages-per-huge-page as a parameter
	// so we can write tests for this.
	if physHugePageSize > pageSize && physHugePageSize > physPageSize {
		// We have huge pages, so let's ensure we don't break one by scavenging
		// over a huge page boundary. If the range [start, start+size) overlaps with
		// a free-and-unscavenged huge page, we want to grow the region we scavenge
		// to include that huge page.

		// Compute the huge page boundary above our candidate.
		pagesPerHugePage := uintptr(physHugePageSize / pageSize)
		hugePageAbove := uint(alignUp(uintptr(start), pagesPerHugePage))

		// If that boundary is within our current candidate, then we may be breaking
		// a huge page.
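		//
		// A rough illustration (hypothetical numbers, not taken from a real heap):
		// with pagesPerHugePage = 512, suppose the full free-and-unscavenged run
		// covers pages [0, 700) but max truncated the candidate to [450, 700). Then
		// hugePageAbove = 512 <= end, and hugePageBelow = 0 >= end-run, so the
		// candidate is widened back to [0, 700) below rather than splitting the
		// huge page occupying pages [0, 512).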
		if hugePageAbove <= end {
			// Compute the huge page boundary below our candidate.
			hugePageBelow := uint(alignDown(uintptr(start), pagesPerHugePage))

			if hugePageBelow >= end-run {
				// We're in danger of breaking apart a huge page since start+size crosses
				// a huge page boundary and rounding down start to the nearest huge
				// page boundary is included in the full run we found. Include the entire
				// huge page in the bound by rounding down to the huge page size.
				size = size + (start - hugePageBelow)
				start = hugePageBelow
			}
		}
	}
	return start, size
}

// scavengeIndex is a structure for efficiently managing which pageAlloc chunks have
// memory available to scavenge.
type scavengeIndex struct {
	// chunks is a bitmap representing the entire address space. Each bit represents
	// a single chunk, and a 1 value indicates the presence of pages available for
	// scavenging. Updates to the bitmap are serialized by the pageAlloc lock.
	//
	// The underlying storage of chunks is platform dependent and may not even be
	// totally mapped read/write. min and max reflect the extent that is safe to access.
	// min is inclusive, max is exclusive.
	//
	// searchAddr is the maximum address (in the offset address space, so we have a linear
	// view of the address space; see mranges.go:offAddr) containing memory available to
	// scavenge. It is a hint to the find operation to avoid O(n^2) behavior in repeated lookups.
	//
	// searchAddr is always inclusive and should be the base address of the highest runtime
	// page available for scavenging.
	//
	// searchAddr is managed by both find and mark.
	//
	// Normally, find monotonically decreases searchAddr as it finds no more free pages to
	// scavenge. However, mark, when marking a new chunk at an index greater than the current
	// searchAddr, sets searchAddr to the *negative* index into chunks of that page. The trick here
	// is that concurrent calls to find will fail to monotonically decrease searchAddr, and so they
	// won't barge over new memory becoming available to scavenge. Furthermore, this ensures
	// that some future caller of find *must* observe the new high index. That caller
	// (or any other racing with it), then makes searchAddr positive before continuing, bringing
	// us back to our monotonically decreasing steady-state.
	//
	// A pageAlloc lock serializes updates between min, max, and searchAddr, so abs(searchAddr)
	// is always guaranteed to be >= min and < max (converted to heap addresses).
	//
	// TODO(mknyszek): Ideally we would use something bigger than a uint8 for faster
	// iteration like uint32, but we lack the bit twiddling intrinsics. We'd need to either
	// copy them from math/bits or fix the fact that we can't import math/bits' code from
	// the runtime due to compiler instrumentation.
	searchAddr atomicOffAddr
	chunks     []atomic.Uint8
	minHeapIdx atomic.Int32
	min, max   atomic.Int32
}

// find returns the highest chunk index that may contain pages available to scavenge.
// It also returns an offset to start searching in the highest chunk.
func (s *scavengeIndex) find() (chunkIdx, uint) {
	searchAddr, marked := s.searchAddr.Load()
	if searchAddr == minOffAddr.addr() {
		// We got a cleared search addr.
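		// That means nothing has been marked as scavengable since the index was
		// last exhausted (or nothing has been marked at all), so there's nothing
		// to do.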
		return 0, 0
	}

	// Starting from searchAddr's chunk, and moving down to minHeapIdx,
	// iterate until we find a chunk with pages to scavenge.
	min := s.minHeapIdx.Load()
	searchChunk := chunkIndex(uintptr(searchAddr))
	start := int32(searchChunk / 8)
	for i := start; i >= min; i-- {
		// Skip over irrelevant address space.
		chunks := s.chunks[i].Load()
		if chunks == 0 {
			continue
		}
		// Note that we can't have 8 leading zeroes here because
		// we necessarily skipped that case. So, what's left is
		// an index. If there are no zeroes, we want the 7th
		// index, if 1 zero, the 6th, and so on.
		n := 7 - sys.LeadingZeros8(chunks)
		ci := chunkIdx(uint(i)*8 + uint(n))
		if searchChunk == ci {
			return ci, chunkPageIndex(uintptr(searchAddr))
		}
		// Try to reduce searchAddr to newSearchAddr.
		newSearchAddr := chunkBase(ci) + pallocChunkBytes - pageSize
		if marked {
			// Attempt to be the first one to decrease the searchAddr
			// after an increase. If we fail, that means there was another
			// increase, or somebody else got to it before us. Either way,
			// it doesn't matter. We may lose some performance having an
			// incorrect search address, but it's far more important that
			// we don't miss updates.
			s.searchAddr.StoreUnmark(searchAddr, newSearchAddr)
		} else {
			// Decrease searchAddr.
			s.searchAddr.StoreMin(newSearchAddr)
		}
		return ci, pallocChunkPages - 1
	}
	// Clear searchAddr, because we've exhausted the heap.
	s.searchAddr.Clear()
	return 0, 0
}

// mark sets the inclusive range of chunks between indices start and end as
// containing pages available to scavenge.
//
// Must be serialized with other mark, markRange, and clear calls.
func (s *scavengeIndex) mark(base, limit uintptr) {
	start, end := chunkIndex(base), chunkIndex(limit-pageSize)
	if start == end {
		// Within a chunk.
		mask := uint8(1 << (start % 8))
		s.chunks[start/8].Or(mask)
	} else if start/8 == end/8 {
		// Within the same byte in the index.
		mask := uint8(uint16(1<<(end-start+1))-1) << (start % 8)
		s.chunks[start/8].Or(mask)
	} else {
		// Crosses multiple bytes in the index.
		startAligned := chunkIdx(alignUp(uintptr(start), 8))
		endAligned := chunkIdx(alignDown(uintptr(end), 8))

		// Do the end of the first byte first.
		if width := startAligned - start; width > 0 {
			mask := uint8(uint16(1<<width)-1) << (start % 8)
			s.chunks[start/8].Or(mask)
		}
		// Do the middle aligned sections that take up a whole
		// byte.
		for ci := startAligned; ci < endAligned; ci += 8 {
			s.chunks[ci/8].Store(^uint8(0))
		}
		// Do the end of the last byte.
		//
		// This width check doesn't match the one above
		// for start because aligning down into the endAligned
		// block means we always have at least one chunk in this
		// block (note that end is *inclusive*). This also means
		// that if end == endAligned+n, then what we really want
		// is to fill n+1 chunks, i.e. width n+1. By induction,
		// this is true for all n.
		if width := end - endAligned + 1; width > 0 {
			mask := uint8(uint16(1<<width) - 1)
			s.chunks[end/8].Or(mask)
		}
	}
	newSearchAddr := limit - pageSize
	searchAddr, _ := s.searchAddr.Load()
	// N.B. Because mark is serialized, it's not necessary to do a full CAS
	// here.
	// mark only ever increases searchAddr, while find only ever decreases it.
	// Since we only ever race with decreases, even if the value we loaded is
	// stale, the actual value will never be larger.
	if (offAddr{searchAddr}).lessThan(offAddr{newSearchAddr}) {
		s.searchAddr.StoreMarked(newSearchAddr)
	}
}

// clear sets the chunk at index ci as not containing pages available to scavenge.
//
// Must be serialized with other mark, markRange, and clear calls.
func (s *scavengeIndex) clear(ci chunkIdx) {
	s.chunks[ci/8].And(^uint8(1 << (ci % 8)))
}
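
// A worked example of mark's index math (illustrative numbers only): suppose
// chunkIndex(base) = 3 and chunkIndex(limit-pageSize) = 20, so chunks 3 through
// 20 inclusive must be marked. That range crosses byte boundaries, so the third
// case above decomposes it with startAligned = 8 and endAligned = 16: the first
// byte gets mask (1<<5 - 1) << 3 = 0xf8 (chunks 3-7), the middle byte (chunks
// 8-15) is stored as 0xff wholesale, and the last byte gets mask
// 1<<(20-16+1) - 1 = 0x1f (chunks 16-20).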