metrics_test.go

     1  // Copyright 2020 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package runtime_test
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"internal/abi"
    11  	"internal/goexperiment"
    12  	"internal/profile"
    13  	"internal/testenv"
    14  	"os"
    15  	"reflect"
    16  	"runtime"
    17  	"runtime/debug"
    18  	"runtime/metrics"
    19  	"runtime/pprof"
    20  	"runtime/trace"
    21  	"slices"
    22  	"sort"
    23  	"strings"
    24  	"sync"
    25  	"sync/atomic"
    26  	"testing"
    27  	"time"
    28  	"unsafe"
    29  )
    30  
    31  func prepareAllMetricsSamples() (map[string]metrics.Description, []metrics.Sample) {
    32  	all := metrics.All()
    33  	samples := make([]metrics.Sample, len(all))
    34  	descs := make(map[string]metrics.Description)
    35  	for i := range all {
    36  		samples[i].Name = all[i].Name
    37  		descs[all[i].Name] = all[i]
    38  	}
    39  	return descs, samples
    40  }
    41  
    42  func TestReadMetrics(t *testing.T) {
    43  	// Run a GC cycle to get some of the stats to be non-zero.
    44  	runtime.GC()
    45  
    46  	// Set an arbitrary memory limit to check the metric for it
    47  	limit := int64(512 * 1024 * 1024)
    48  	oldLimit := debug.SetMemoryLimit(limit)
    49  	defer debug.SetMemoryLimit(oldLimit)
    50  
    51  	// Set a GC percent to check the metric for it
    52  	gcPercent := 99
    53  	oldGCPercent := debug.SetGCPercent(gcPercent)
    54  	defer debug.SetGCPercent(oldGCPercent)
    55  
    56  	// Tests whether readMetrics produces values aligning
    57  	// with ReadMemStats while the world is stopped.
    58  	var mstats runtime.MemStats
    59  	_, samples := prepareAllMetricsSamples()
    60  	runtime.ReadMetricsSlow(&mstats, unsafe.Pointer(&samples[0]), len(samples), cap(samples))
    61  
    62  	checkUint64 := func(t *testing.T, m string, got, want uint64) {
    63  		t.Helper()
    64  		if got != want {
    65  			t.Errorf("metric %q: got %d, want %d", m, got, want)
    66  		}
    67  	}
    68  
    69  	// Check to make sure the values we read line up with other values we read.
    70  	var allocsBySize, gcPauses, schedPausesTotalGC *metrics.Float64Histogram
    71  	var tinyAllocs uint64
    72  	var mallocs, frees uint64
    73  	for i := range samples {
    74  		switch name := samples[i].Name; name {
    75  		case "/cgo/go-to-c-calls:calls":
    76  			checkUint64(t, name, samples[i].Value.Uint64(), uint64(runtime.NumCgoCall()))
    77  		case "/memory/classes/heap/free:bytes":
    78  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapIdle-mstats.HeapReleased)
    79  		case "/memory/classes/heap/released:bytes":
    80  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapReleased)
    81  		case "/memory/classes/heap/objects:bytes":
    82  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapAlloc)
    83  		case "/memory/classes/heap/unused:bytes":
    84  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapInuse-mstats.HeapAlloc)
    85  		case "/memory/classes/heap/stacks:bytes":
    86  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.StackInuse)
    87  		case "/memory/classes/metadata/mcache/free:bytes":
    88  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.MCacheSys-mstats.MCacheInuse)
    89  		case "/memory/classes/metadata/mcache/inuse:bytes":
    90  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.MCacheInuse)
    91  		case "/memory/classes/metadata/mspan/free:bytes":
    92  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.MSpanSys-mstats.MSpanInuse)
    93  		case "/memory/classes/metadata/mspan/inuse:bytes":
    94  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.MSpanInuse)
    95  		case "/memory/classes/metadata/other:bytes":
    96  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.GCSys)
    97  		case "/memory/classes/os-stacks:bytes":
    98  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.StackSys-mstats.StackInuse)
    99  		case "/memory/classes/other:bytes":
   100  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.OtherSys)
   101  		case "/memory/classes/profiling/buckets:bytes":
   102  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.BuckHashSys)
   103  		case "/memory/classes/total:bytes":
   104  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.Sys)
   105  		case "/gc/heap/allocs-by-size:bytes":
   106  			hist := samples[i].Value.Float64Histogram()
   107  			// Skip size class 0 in BySize, because it's always empty and not represented
   108  			// in the histogram.
   109  			for i, sc := range mstats.BySize[1:] {
   110  				if b, s := hist.Buckets[i+1], float64(sc.Size+1); b != s {
   111  					t.Errorf("bucket does not match size class: got %f, want %f", b, s)
   112  					// The rest of the checks aren't expected to work anyway.
   113  					continue
   114  				}
   115  				if c, m := hist.Counts[i], sc.Mallocs; c != m {
   116  					t.Errorf("histogram counts do not much BySize for class %d: got %d, want %d", i, c, m)
   117  				}
   118  			}
   119  			allocsBySize = hist
   120  		case "/gc/heap/allocs:bytes":
   121  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.TotalAlloc)
   122  		case "/gc/heap/frees-by-size:bytes":
   123  			hist := samples[i].Value.Float64Histogram()
   124  			// Skip size class 0 in BySize, because it's always empty and not represented
   125  			// in the histogram.
   126  			for i, sc := range mstats.BySize[1:] {
   127  				if b, s := hist.Buckets[i+1], float64(sc.Size+1); b != s {
   128  					t.Errorf("bucket does not match size class: got %f, want %f", b, s)
   129  					// The rest of the checks aren't expected to work anyway.
   130  					continue
   131  				}
   132  				if c, f := hist.Counts[i], sc.Frees; c != f {
   133  					t.Errorf("histogram counts do not match BySize for class %d: got %d, want %d", i, c, f)
   134  				}
   135  			}
   136  		case "/gc/heap/frees:bytes":
   137  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.TotalAlloc-mstats.HeapAlloc)
   138  		case "/gc/heap/tiny/allocs:objects":
   139  			// Currently, MemStats adds tiny alloc count to both Mallocs AND Frees.
   140  			// The reason for this is because MemStats couldn't be extended at the time
   141  			// but there was a desire to have Mallocs at least be a little more representative,
   142  			// while having Mallocs - Frees still represent a live object count.
   143  			// Unfortunately, MemStats doesn't actually export a large allocation count,
   144  			// so it's impossible to pull this number out directly.
   145  			//
   146  			// Check tiny allocation count outside of this loop, by using the allocs-by-size
   147  			// histogram in order to figure out how many large objects there are.
   148  			tinyAllocs = samples[i].Value.Uint64()
   149  			// Because the next two metrics tests are checking against Mallocs and Frees,
   150  			// we can't check them directly for the same reason: we need to account for tiny
   151  			// allocations included in Mallocs and Frees.
   152  		case "/gc/heap/allocs:objects":
   153  			mallocs = samples[i].Value.Uint64()
   154  		case "/gc/heap/frees:objects":
   155  			frees = samples[i].Value.Uint64()
   156  		case "/gc/heap/live:bytes":
   157  			// Check for "obviously wrong" values. We can't check a stronger invariant,
   158  			// such as live <= HeapAlloc, because live is not 100% accurate. It's computed
   159  			// under racy conditions, and some objects may be double-counted (this is
   160  			// intentional and necessary for GC performance).
   161  			//
   162  			// Instead, check against a much more reasonable upper-bound: the amount of
   163  			// mapped heap memory. We can't possibly overcount to the point of exceeding
   164  			// total mapped heap memory, except if there's an accounting bug.
   165  			if live := samples[i].Value.Uint64(); live > mstats.HeapSys {
   166  				t.Errorf("live bytes: %d > heap sys: %d", live, mstats.HeapSys)
   167  			} else if live == 0 {
   168  				// Might happen if we don't call runtime.GC() above.
   169  				t.Error("live bytes is 0")
   170  			}
   171  		case "/gc/gomemlimit:bytes":
   172  			checkUint64(t, name, samples[i].Value.Uint64(), uint64(limit))
   173  		case "/gc/heap/objects:objects":
   174  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapObjects)
   175  		case "/gc/heap/goal:bytes":
   176  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.NextGC)
   177  		case "/gc/gogc:percent":
   178  			checkUint64(t, name, samples[i].Value.Uint64(), uint64(gcPercent))
   179  		case "/gc/cycles/automatic:gc-cycles":
   180  			checkUint64(t, name, samples[i].Value.Uint64(), uint64(mstats.NumGC-mstats.NumForcedGC))
   181  		case "/gc/cycles/forced:gc-cycles":
   182  			checkUint64(t, name, samples[i].Value.Uint64(), uint64(mstats.NumForcedGC))
   183  		case "/gc/cycles/total:gc-cycles":
   184  			checkUint64(t, name, samples[i].Value.Uint64(), uint64(mstats.NumGC))
   185  		case "/gc/pauses:seconds":
   186  			gcPauses = samples[i].Value.Float64Histogram()
   187  		case "/sched/pauses/total/gc:seconds":
   188  			schedPausesTotalGC = samples[i].Value.Float64Histogram()
   189  		}
   190  	}
   191  
   192  	// Check tinyAllocs.
   193  	nonTinyAllocs := uint64(0)
   194  	for _, c := range allocsBySize.Counts {
   195  		nonTinyAllocs += c
   196  	}
   197  	checkUint64(t, "/gc/heap/tiny/allocs:objects", tinyAllocs, mstats.Mallocs-nonTinyAllocs)
   198  
   199  	// Check allocation and free counts.
   200  	checkUint64(t, "/gc/heap/allocs:objects", mallocs, mstats.Mallocs-tinyAllocs)
   201  	checkUint64(t, "/gc/heap/frees:objects", frees, mstats.Frees-tinyAllocs)
   202  
   203  	// Verify that /gc/pauses:seconds is a copy of /sched/pauses/total/gc:seconds
   204  	if !slices.Equal(gcPauses.Buckets, schedPausesTotalGC.Buckets) {
   205  		t.Errorf("/gc/pauses:seconds buckets %v do not match /sched/pauses/total/gc:seconds buckets %v", gcPauses.Buckets, schedPausesTotalGC.Counts)
   206  	}
   207  	if !slices.Equal(gcPauses.Counts, schedPausesTotalGC.Counts) {
   208  		t.Errorf("/gc/pauses:seconds counts %v do not match /sched/pauses/total/gc:seconds counts %v", gcPauses.Counts, schedPausesTotalGC.Counts)
   209  	}
   210  }
   211  
   212  func TestReadMetricsConsistency(t *testing.T) {
   213  	// Tests whether readMetrics produces consistent, sensible values.
   214  	// The values are read concurrently with the runtime doing other
   215  	// things (e.g. allocating) so what we read can't reasonably compared
   216  	// to other runtime values (e.g. MemStats).
   217  
   218  	// Run a few GC cycles to get some of the stats to be non-zero.
   219  	runtime.GC()
   220  	runtime.GC()
   221  	runtime.GC()
   222  
   223  	// Set GOMAXPROCS high then sleep briefly to ensure we generate
   224  	// some idle time.
   225  	oldmaxprocs := runtime.GOMAXPROCS(10)
   226  	time.Sleep(time.Millisecond)
   227  	runtime.GOMAXPROCS(oldmaxprocs)
   228  
   229  	// Read all the supported metrics through the metrics package.
   230  	descs, samples := prepareAllMetricsSamples()
   231  	metrics.Read(samples)
   232  
   233  	// Check to make sure the values we read make sense.
   234  	var totalVirtual struct {
   235  		got, want uint64
   236  	}
   237  	var objects struct {
   238  		alloc, free             *metrics.Float64Histogram
   239  		allocs, frees           uint64
   240  		allocdBytes, freedBytes uint64
   241  		total, totalBytes       uint64
   242  	}
   243  	var gc struct {
   244  		numGC  uint64
   245  		pauses uint64
   246  	}
   247  	var totalScan struct {
   248  		got, want uint64
   249  	}
   250  	var cpu struct {
   251  		gcAssist    float64
   252  		gcDedicated float64
   253  		gcIdle      float64
   254  		gcPause     float64
   255  		gcTotal     float64
   256  
   257  		idle float64
   258  		user float64
   259  
   260  		scavengeAssist float64
   261  		scavengeBg     float64
   262  		scavengeTotal  float64
   263  
   264  		total float64
   265  	}
   266  	for i := range samples {
   267  		kind := samples[i].Value.Kind()
   268  		if want := descs[samples[i].Name].Kind; kind != want {
   269  			t.Errorf("supported metric %q has unexpected kind: got %d, want %d", samples[i].Name, kind, want)
   270  			continue
   271  		}
   272  		if samples[i].Name != "/memory/classes/total:bytes" && strings.HasPrefix(samples[i].Name, "/memory/classes") {
   273  			v := samples[i].Value.Uint64()
   274  			totalVirtual.want += v
   275  
   276  			// None of these stats should ever get this big.
   277  			// If they do, there's probably overflow involved,
   278  			// usually due to bad accounting.
   279  			if int64(v) < 0 {
   280  				t.Errorf("%q has high/negative value: %d", samples[i].Name, v)
   281  			}
   282  		}
   283  		switch samples[i].Name {
   284  		case "/cpu/classes/gc/mark/assist:cpu-seconds":
   285  			cpu.gcAssist = samples[i].Value.Float64()
   286  		case "/cpu/classes/gc/mark/dedicated:cpu-seconds":
   287  			cpu.gcDedicated = samples[i].Value.Float64()
   288  		case "/cpu/classes/gc/mark/idle:cpu-seconds":
   289  			cpu.gcIdle = samples[i].Value.Float64()
   290  		case "/cpu/classes/gc/pause:cpu-seconds":
   291  			cpu.gcPause = samples[i].Value.Float64()
   292  		case "/cpu/classes/gc/total:cpu-seconds":
   293  			cpu.gcTotal = samples[i].Value.Float64()
   294  		case "/cpu/classes/idle:cpu-seconds":
   295  			cpu.idle = samples[i].Value.Float64()
   296  		case "/cpu/classes/scavenge/assist:cpu-seconds":
   297  			cpu.scavengeAssist = samples[i].Value.Float64()
   298  		case "/cpu/classes/scavenge/background:cpu-seconds":
   299  			cpu.scavengeBg = samples[i].Value.Float64()
   300  		case "/cpu/classes/scavenge/total:cpu-seconds":
   301  			cpu.scavengeTotal = samples[i].Value.Float64()
   302  		case "/cpu/classes/total:cpu-seconds":
   303  			cpu.total = samples[i].Value.Float64()
   304  		case "/cpu/classes/user:cpu-seconds":
   305  			cpu.user = samples[i].Value.Float64()
   306  		case "/memory/classes/total:bytes":
   307  			totalVirtual.got = samples[i].Value.Uint64()
   308  		case "/memory/classes/heap/objects:bytes":
   309  			objects.totalBytes = samples[i].Value.Uint64()
   310  		case "/gc/heap/objects:objects":
   311  			objects.total = samples[i].Value.Uint64()
   312  		case "/gc/heap/allocs:bytes":
   313  			objects.allocdBytes = samples[i].Value.Uint64()
   314  		case "/gc/heap/allocs:objects":
   315  			objects.allocs = samples[i].Value.Uint64()
   316  		case "/gc/heap/allocs-by-size:bytes":
   317  			objects.alloc = samples[i].Value.Float64Histogram()
   318  		case "/gc/heap/frees:bytes":
   319  			objects.freedBytes = samples[i].Value.Uint64()
   320  		case "/gc/heap/frees:objects":
   321  			objects.frees = samples[i].Value.Uint64()
   322  		case "/gc/heap/frees-by-size:bytes":
   323  			objects.free = samples[i].Value.Float64Histogram()
   324  		case "/gc/cycles:gc-cycles":
   325  			gc.numGC = samples[i].Value.Uint64()
   326  		case "/gc/pauses:seconds":
   327  			h := samples[i].Value.Float64Histogram()
   328  			gc.pauses = 0
   329  			for i := range h.Counts {
   330  				gc.pauses += h.Counts[i]
   331  			}
   332  		case "/gc/scan/heap:bytes":
   333  			totalScan.want += samples[i].Value.Uint64()
   334  		case "/gc/scan/globals:bytes":
   335  			totalScan.want += samples[i].Value.Uint64()
   336  		case "/gc/scan/stack:bytes":
   337  			totalScan.want += samples[i].Value.Uint64()
   338  		case "/gc/scan/total:bytes":
   339  			totalScan.got = samples[i].Value.Uint64()
   340  		case "/sched/gomaxprocs:threads":
   341  			if got, want := samples[i].Value.Uint64(), uint64(runtime.GOMAXPROCS(-1)); got != want {
   342  				t.Errorf("gomaxprocs doesn't match runtime.GOMAXPROCS: got %d, want %d", got, want)
   343  			}
   344  		case "/sched/goroutines:goroutines":
   345  			if samples[i].Value.Uint64() < 1 {
   346  				t.Error("number of goroutines is less than one")
   347  			}
   348  		}
   349  	}
   350  	// Only check this on Linux where we can be reasonably sure we have a high-resolution timer.
   351  	if runtime.GOOS == "linux" {
   352  		if cpu.gcDedicated <= 0 && cpu.gcAssist <= 0 && cpu.gcIdle <= 0 {
   353  			t.Errorf("found no time spent on GC work: %#v", cpu)
   354  		}
   355  		if cpu.gcPause <= 0 {
   356  			t.Errorf("found no GC pauses: %f", cpu.gcPause)
   357  		}
   358  		if cpu.idle <= 0 {
   359  			t.Errorf("found no idle time: %f", cpu.idle)
   360  		}
   361  		if total := cpu.gcDedicated + cpu.gcAssist + cpu.gcIdle + cpu.gcPause; !withinEpsilon(cpu.gcTotal, total, 0.001) {
   362  			t.Errorf("calculated total GC CPU time not within %%0.1 of total: %f vs. %f", total, cpu.gcTotal)
   363  		}
   364  		if total := cpu.scavengeAssist + cpu.scavengeBg; !withinEpsilon(cpu.scavengeTotal, total, 0.001) {
   365  			t.Errorf("calculated total scavenge CPU not within %%0.1 of total: %f vs. %f", total, cpu.scavengeTotal)
   366  		}
   367  		if cpu.total <= 0 {
   368  			t.Errorf("found no total CPU time passed")
   369  		}
   370  		if cpu.user <= 0 {
   371  			t.Errorf("found no user time passed")
   372  		}
   373  		if total := cpu.gcTotal + cpu.scavengeTotal + cpu.user + cpu.idle; !withinEpsilon(cpu.total, total, 0.001) {
   374  			t.Errorf("calculated total CPU not within %%0.1 of total: %f vs. %f", total, cpu.total)
   375  		}
   376  	}
   377  	if totalVirtual.got != totalVirtual.want {
   378  		t.Errorf(`"/memory/classes/total:bytes" does not match sum of /memory/classes/**: got %d, want %d`, totalVirtual.got, totalVirtual.want)
   379  	}
   380  	if got, want := objects.allocs-objects.frees, objects.total; got != want {
   381  		t.Errorf("mismatch between object alloc/free tallies and total: got %d, want %d", got, want)
   382  	}
   383  	if got, want := objects.allocdBytes-objects.freedBytes, objects.totalBytes; got != want {
   384  		t.Errorf("mismatch between object alloc/free tallies and total: got %d, want %d", got, want)
   385  	}
   386  	if b, c := len(objects.alloc.Buckets), len(objects.alloc.Counts); b != c+1 {
   387  		t.Errorf("allocs-by-size has wrong bucket or counts length: %d buckets, %d counts", b, c)
   388  	}
   389  	if b, c := len(objects.free.Buckets), len(objects.free.Counts); b != c+1 {
   390  		t.Errorf("frees-by-size has wrong bucket or counts length: %d buckets, %d counts", b, c)
   391  	}
   392  	if len(objects.alloc.Buckets) != len(objects.free.Buckets) {
   393  		t.Error("allocs-by-size and frees-by-size buckets don't match in length")
   394  	} else if len(objects.alloc.Counts) != len(objects.free.Counts) {
   395  		t.Error("allocs-by-size and frees-by-size counts don't match in length")
   396  	} else {
   397  		for i := range objects.alloc.Buckets {
   398  			ba := objects.alloc.Buckets[i]
   399  			bf := objects.free.Buckets[i]
   400  			if ba != bf {
   401  				t.Errorf("bucket %d is different for alloc and free hists: %f != %f", i, ba, bf)
   402  			}
   403  		}
   404  		if !t.Failed() {
   405  			var gotAlloc, gotFree uint64
   406  			want := objects.total
   407  			for i := range objects.alloc.Counts {
   408  				if objects.alloc.Counts[i] < objects.free.Counts[i] {
   409  					t.Errorf("found more allocs than frees in object dist bucket %d", i)
   410  					continue
   411  				}
   412  				gotAlloc += objects.alloc.Counts[i]
   413  				gotFree += objects.free.Counts[i]
   414  			}
   415  			if got := gotAlloc - gotFree; got != want {
   416  				t.Errorf("object distribution counts don't match count of live objects: got %d, want %d", got, want)
   417  			}
   418  			if gotAlloc != objects.allocs {
   419  				t.Errorf("object distribution counts don't match total allocs: got %d, want %d", gotAlloc, objects.allocs)
   420  			}
   421  			if gotFree != objects.frees {
   422  				t.Errorf("object distribution counts don't match total allocs: got %d, want %d", gotFree, objects.frees)
   423  			}
   424  		}
   425  	}
   426  	// The current GC has at least 2 pauses per GC.
   427  	// Check to see if that value makes sense.
   428  	if gc.pauses < gc.numGC*2 {
   429  		t.Errorf("fewer pauses than expected: got %d, want at least %d", gc.pauses, gc.numGC*2)
   430  	}
   431  	if totalScan.got <= 0 {
   432  		t.Errorf("scannable GC space is empty: %d", totalScan.got)
   433  	}
   434  	if totalScan.got != totalScan.want {
   435  		t.Errorf("/gc/scan/total:bytes doesn't line up with sum of /gc/scan*: total %d vs. sum %d", totalScan.got, totalScan.want)
   436  	}
   437  }
   438  
   439  func BenchmarkReadMetricsLatency(b *testing.B) {
   440  	stop := applyGCLoad(b)
   441  
   442  	// Spend this much time measuring latencies.
   443  	latencies := make([]time.Duration, 0, 1024)
   444  	_, samples := prepareAllMetricsSamples()
   445  
   446  	// Hit metrics.Read continuously and measure.
   447  	b.ResetTimer()
   448  	for i := 0; i < b.N; i++ {
   449  		start := time.Now()
   450  		metrics.Read(samples)
   451  		latencies = append(latencies, time.Since(start))
   452  	}
   453  	// Make sure to stop the timer before we wait! The load created above
   454  	// is very heavy-weight and not easy to stop, so we could end up
   455  	// confusing the benchmarking framework for small b.N.
   456  	b.StopTimer()
   457  	stop()
   458  
   459  	// Disable the default */op metrics.
   460  	// ns/op doesn't mean anything because it's an average, but we
   461  	// have a sleep in our b.N loop above which skews this significantly.
   462  	b.ReportMetric(0, "ns/op")
   463  	b.ReportMetric(0, "B/op")
   464  	b.ReportMetric(0, "allocs/op")
   465  
   466  	// Sort latencies then report percentiles.
   467  	sort.Slice(latencies, func(i, j int) bool {
   468  		return latencies[i] < latencies[j]
   469  	})
   470  	b.ReportMetric(float64(latencies[len(latencies)*50/100]), "p50-ns")
   471  	b.ReportMetric(float64(latencies[len(latencies)*90/100]), "p90-ns")
   472  	b.ReportMetric(float64(latencies[len(latencies)*99/100]), "p99-ns")
   473  }
   474  
   475  var readMetricsSink [1024]interface{}
   476  
   477  func TestReadMetricsCumulative(t *testing.T) {
   478  	// Set up the set of metrics marked cumulative.
   479  	descs := metrics.All()
   480  	var samples [2][]metrics.Sample
   481  	samples[0] = make([]metrics.Sample, len(descs))
   482  	samples[1] = make([]metrics.Sample, len(descs))
   483  	total := 0
   484  	for i := range samples[0] {
   485  		if !descs[i].Cumulative {
   486  			continue
   487  		}
   488  		samples[0][total].Name = descs[i].Name
   489  		total++
   490  	}
   491  	samples[0] = samples[0][:total]
   492  	samples[1] = samples[1][:total]
   493  	copy(samples[1], samples[0])
   494  
   495  	// Start some noise in the background.
   496  	var wg sync.WaitGroup
   497  	wg.Add(1)
   498  	done := make(chan struct{})
   499  	go func() {
   500  		defer wg.Done()
   501  		for {
   502  			// Add more things here that could influence metrics.
   503  			for i := 0; i < len(readMetricsSink); i++ {
   504  				readMetricsSink[i] = make([]byte, 1024)
   505  				select {
   506  				case <-done:
   507  					return
   508  				default:
   509  				}
   510  			}
   511  			runtime.GC()
   512  		}
   513  	}()
   514  
   515  	sum := func(us []uint64) uint64 {
   516  		total := uint64(0)
   517  		for _, u := range us {
   518  			total += u
   519  		}
   520  		return total
   521  	}
   522  
   523  	// Populate the first generation.
   524  	metrics.Read(samples[0])
   525  
   526  	// Check to make sure that these metrics only grow monotonically.
   527  	for gen := 1; gen < 10; gen++ {
   528  		metrics.Read(samples[gen%2])
   529  		for i := range samples[gen%2] {
   530  			name := samples[gen%2][i].Name
   531  			vNew, vOld := samples[gen%2][i].Value, samples[1-(gen%2)][i].Value
   532  
   533  			switch vNew.Kind() {
   534  			case metrics.KindUint64:
   535  				new := vNew.Uint64()
   536  				old := vOld.Uint64()
   537  				if new < old {
   538  					t.Errorf("%s decreased: %d < %d", name, new, old)
   539  				}
   540  			case metrics.KindFloat64:
   541  				new := vNew.Float64()
   542  				old := vOld.Float64()
   543  				if new < old {
   544  					t.Errorf("%s decreased: %f < %f", name, new, old)
   545  				}
   546  			case metrics.KindFloat64Histogram:
   547  				new := sum(vNew.Float64Histogram().Counts)
   548  				old := sum(vOld.Float64Histogram().Counts)
   549  				if new < old {
   550  					t.Errorf("%s counts decreased: %d < %d", name, new, old)
   551  				}
   552  			}
   553  		}
   554  	}
   555  	close(done)
   556  
   557  	wg.Wait()
   558  }
   559  
   560  func withinEpsilon(v1, v2, e float64) bool {
   561  	return v2-v2*e <= v1 && v1 <= v2+v2*e
   562  }
   563  
   564  func TestMutexWaitTimeMetric(t *testing.T) {
   565  	var sample [1]metrics.Sample
   566  	sample[0].Name = "/sync/mutex/wait/total:seconds"
   567  
   568  	locks := []locker2{
   569  		new(mutex),
   570  		new(rwmutexWrite),
   571  		new(rwmutexReadWrite),
   572  		new(rwmutexWriteRead),
   573  	}
   574  	for _, lock := range locks {
   575  		t.Run(reflect.TypeOf(lock).Elem().Name(), func(t *testing.T) {
   576  			metrics.Read(sample[:])
   577  			before := time.Duration(sample[0].Value.Float64() * 1e9)
   578  
   579  			minMutexWaitTime := generateMutexWaitTime(lock)
   580  
   581  			metrics.Read(sample[:])
   582  			after := time.Duration(sample[0].Value.Float64() * 1e9)
   583  
   584  			if wt := after - before; wt < minMutexWaitTime {
   585  				t.Errorf("too little mutex wait time: got %s, want %s", wt, minMutexWaitTime)
   586  			}
   587  		})
   588  	}
   589  }
   590  
   591  // locker2 represents an API surface of two concurrent goroutines
   592  // locking the same resource, but through different APIs. It's intended
   593  // to abstract over the relationship of two Lock calls or an RLock
   594  // and a Lock call.
   595  type locker2 interface {
   596  	Lock1()
   597  	Unlock1()
   598  	Lock2()
   599  	Unlock2()
   600  }
   601  
   602  type mutex struct {
   603  	mu sync.Mutex
   604  }
   605  
   606  func (m *mutex) Lock1()   { m.mu.Lock() }
   607  func (m *mutex) Unlock1() { m.mu.Unlock() }
   608  func (m *mutex) Lock2()   { m.mu.Lock() }
   609  func (m *mutex) Unlock2() { m.mu.Unlock() }
   610  
   611  type rwmutexWrite struct {
   612  	mu sync.RWMutex
   613  }
   614  
   615  func (m *rwmutexWrite) Lock1()   { m.mu.Lock() }
   616  func (m *rwmutexWrite) Unlock1() { m.mu.Unlock() }
   617  func (m *rwmutexWrite) Lock2()   { m.mu.Lock() }
   618  func (m *rwmutexWrite) Unlock2() { m.mu.Unlock() }
   619  
   620  type rwmutexReadWrite struct {
   621  	mu sync.RWMutex
   622  }
   623  
   624  func (m *rwmutexReadWrite) Lock1()   { m.mu.RLock() }
   625  func (m *rwmutexReadWrite) Unlock1() { m.mu.RUnlock() }
   626  func (m *rwmutexReadWrite) Lock2()   { m.mu.Lock() }
   627  func (m *rwmutexReadWrite) Unlock2() { m.mu.Unlock() }
   628  
   629  type rwmutexWriteRead struct {
   630  	mu sync.RWMutex
   631  }
   632  
   633  func (m *rwmutexWriteRead) Lock1()   { m.mu.Lock() }
   634  func (m *rwmutexWriteRead) Unlock1() { m.mu.Unlock() }
   635  func (m *rwmutexWriteRead) Lock2()   { m.mu.RLock() }
   636  func (m *rwmutexWriteRead) Unlock2() { m.mu.RUnlock() }
   637  
   638  // generateMutexWaitTime causes a couple of goroutines
   639  // to block a whole bunch of times on a sync.Mutex, returning
   640  // the minimum amount of time that should be visible in the
   641  // /sync/mutex-wait:seconds metric.
   642  func generateMutexWaitTime(mu locker2) time.Duration {
   643  	// Set up the runtime to always track casgstatus transitions for metrics.
   644  	*runtime.CasGStatusAlwaysTrack = true
   645  
   646  	mu.Lock1()
   647  
   648  	// Start up a goroutine to wait on the lock.
   649  	gc := make(chan *runtime.G)
   650  	done := make(chan bool)
   651  	go func() {
   652  		gc <- runtime.Getg()
   653  
   654  		for {
   655  			mu.Lock2()
   656  			mu.Unlock2()
   657  			if <-done {
   658  				return
   659  			}
   660  		}
   661  	}()
   662  	gp := <-gc
   663  
   664  	// Set the block time high enough so that it will always show up, even
   665  	// on systems with coarse timer granularity.
   666  	const blockTime = 100 * time.Millisecond
   667  
   668  	// Make sure the goroutine spawned above actually blocks on the lock.
   669  	for {
   670  		if runtime.GIsWaitingOnMutex(gp) {
   671  			break
   672  		}
   673  		runtime.Gosched()
   674  	}
   675  
   676  	// Let some amount of time pass.
   677  	time.Sleep(blockTime)
   678  
   679  	// Let the other goroutine acquire the lock.
   680  	mu.Unlock1()
   681  	done <- true
   682  
   683  	// Reset flag.
   684  	*runtime.CasGStatusAlwaysTrack = false
   685  	return blockTime
   686  }
   687  
   688  // See issue #60276.
   689  func TestCPUMetricsSleep(t *testing.T) {
   690  	if runtime.GOOS == "wasip1" {
   691  		// Since wasip1 busy-waits in the scheduler, there's no meaningful idle
   692  		// time. This is accurately reflected in the metrics, but it means this
   693  		// test is basically meaningless on this platform.
   694  		t.Skip("wasip1 currently busy-waits in idle time; test not applicable")
   695  	}
   696  
   697  	names := []string{
   698  		"/cpu/classes/idle:cpu-seconds",
   699  
   700  		"/cpu/classes/gc/mark/assist:cpu-seconds",
   701  		"/cpu/classes/gc/mark/dedicated:cpu-seconds",
   702  		"/cpu/classes/gc/mark/idle:cpu-seconds",
   703  		"/cpu/classes/gc/pause:cpu-seconds",
   704  		"/cpu/classes/gc/total:cpu-seconds",
   705  		"/cpu/classes/scavenge/assist:cpu-seconds",
   706  		"/cpu/classes/scavenge/background:cpu-seconds",
   707  		"/cpu/classes/scavenge/total:cpu-seconds",
   708  		"/cpu/classes/total:cpu-seconds",
   709  		"/cpu/classes/user:cpu-seconds",
   710  	}
   711  	prep := func() []metrics.Sample {
   712  		mm := make([]metrics.Sample, len(names))
   713  		for i := range names {
   714  			mm[i].Name = names[i]
   715  		}
   716  		return mm
   717  	}
   718  	m1, m2 := prep(), prep()
   719  
   720  	const (
   721  		// Expected time spent idle.
   722  		dur = 100 * time.Millisecond
   723  
   724  		// maxFailures is the number of consecutive failures requires to cause the test to fail.
   725  		maxFailures = 10
   726  	)
   727  
   728  	failureIdleTimes := make([]float64, 0, maxFailures)
   729  
   730  	// If the bug we expect is happening, then the Sleep CPU time will be accounted for
   731  	// as user time rather than idle time. In an ideal world we'd expect the whole application
   732  	// to go instantly idle the moment this goroutine goes to sleep, and stay asleep for that
   733  	// duration. However, the Go runtime can easily eat into idle time while this goroutine is
   734  	// blocked in a sleep. For example, slow platforms might spend more time expected in the
   735  	// scheduler. Another example is that a Go runtime background goroutine could run while
   736  	// everything else is idle. Lastly, if a running goroutine is descheduled by the OS, enough
   737  	// time may pass such that the goroutine is ready to wake, even though the runtime couldn't
   738  	// observe itself as idle with nanotime.
   739  	//
   740  	// To deal with all this, we give a half-proc's worth of leniency.
   741  	//
   742  	// We also retry multiple times to deal with the fact that the OS might deschedule us before
   743  	// we yield and go idle. That has a rare enough chance that retries should resolve it.
   744  	// If the issue we expect is happening, it should be persistent.
   745  	minIdleCPUSeconds := dur.Seconds() * (float64(runtime.GOMAXPROCS(-1)) - 0.5)
   746  
   747  	// Let's make sure there's no background scavenge work to do.
   748  	//
   749  	// The runtime.GC calls below ensure the background sweeper
   750  	// will not run during the idle period.
   751  	debug.FreeOSMemory()
   752  
   753  	for retries := 0; retries < maxFailures; retries++ {
   754  		// Read 1.
   755  		runtime.GC() // Update /cpu/classes metrics.
   756  		metrics.Read(m1)
   757  
   758  		// Sleep.
   759  		time.Sleep(dur)
   760  
   761  		// Read 2.
   762  		runtime.GC() // Update /cpu/classes metrics.
   763  		metrics.Read(m2)
   764  
   765  		dt := m2[0].Value.Float64() - m1[0].Value.Float64()
   766  		if dt >= minIdleCPUSeconds {
   767  			// All is well. Test passed.
   768  			return
   769  		}
   770  		failureIdleTimes = append(failureIdleTimes, dt)
   771  		// Try again.
   772  	}
   773  
   774  	// We couldn't observe the expected idle time even once.
   775  	for i, dt := range failureIdleTimes {
   776  		t.Logf("try %2d: idle time = %.5fs\n", i+1, dt)
   777  	}
   778  	t.Logf("try %d breakdown:\n", len(failureIdleTimes))
   779  	for i := range names {
   780  		if m1[i].Value.Kind() == metrics.KindBad {
   781  			continue
   782  		}
   783  		t.Logf("\t%s %0.3f\n", names[i], m2[i].Value.Float64()-m1[i].Value.Float64())
   784  	}
   785  	t.Errorf(`time.Sleep did not contribute enough to "idle" class: minimum idle time = %.5fs`, minIdleCPUSeconds)
   786  }
   787  
   788  // Call f() and verify that the correct STW metrics increment. If isGC is true,
   789  // fn triggers a GC STW. Otherwise, fn triggers an other STW.
   790  func testSchedPauseMetrics(t *testing.T, fn func(t *testing.T), isGC bool) {
   791  	m := []metrics.Sample{
   792  		{Name: "/sched/pauses/stopping/gc:seconds"},
   793  		{Name: "/sched/pauses/stopping/other:seconds"},
   794  		{Name: "/sched/pauses/total/gc:seconds"},
   795  		{Name: "/sched/pauses/total/other:seconds"},
   796  	}
   797  
   798  	stoppingGC := &m[0]
   799  	stoppingOther := &m[1]
   800  	totalGC := &m[2]
   801  	totalOther := &m[3]
   802  
   803  	sampleCount := func(s *metrics.Sample) uint64 {
   804  		h := s.Value.Float64Histogram()
   805  
   806  		var n uint64
   807  		for _, c := range h.Counts {
   808  			n += c
   809  		}
   810  		return n
   811  	}
   812  
   813  	// Read baseline.
   814  	metrics.Read(m)
   815  
   816  	baselineStartGC := sampleCount(stoppingGC)
   817  	baselineStartOther := sampleCount(stoppingOther)
   818  	baselineTotalGC := sampleCount(totalGC)
   819  	baselineTotalOther := sampleCount(totalOther)
   820  
   821  	fn(t)
   822  
   823  	metrics.Read(m)
   824  
   825  	if isGC {
   826  		if got := sampleCount(stoppingGC); got <= baselineStartGC {
   827  			t.Errorf("/sched/pauses/stopping/gc:seconds sample count %d did not increase from baseline of %d", got, baselineStartGC)
   828  		}
   829  		if got := sampleCount(totalGC); got <= baselineTotalGC {
   830  			t.Errorf("/sched/pauses/total/gc:seconds sample count %d did not increase from baseline of %d", got, baselineTotalGC)
   831  		}
   832  
   833  		if got := sampleCount(stoppingOther); got != baselineStartOther {
   834  			t.Errorf("/sched/pauses/stopping/other:seconds sample count %d changed from baseline of %d", got, baselineStartOther)
   835  		}
   836  		if got := sampleCount(totalOther); got != baselineTotalOther {
   837  			t.Errorf("/sched/pauses/stopping/other:seconds sample count %d changed from baseline of %d", got, baselineTotalOther)
   838  		}
   839  	} else {
   840  		if got := sampleCount(stoppingGC); got != baselineStartGC {
   841  			t.Errorf("/sched/pauses/stopping/gc:seconds sample count %d changed from baseline of %d", got, baselineStartGC)
   842  		}
   843  		if got := sampleCount(totalGC); got != baselineTotalGC {
   844  			t.Errorf("/sched/pauses/total/gc:seconds sample count %d changed from baseline of %d", got, baselineTotalGC)
   845  		}
   846  
   847  		if got := sampleCount(stoppingOther); got <= baselineStartOther {
   848  			t.Errorf("/sched/pauses/stopping/other:seconds sample count %d did not increase from baseline of %d", got, baselineStartOther)
   849  		}
   850  		if got := sampleCount(totalOther); got <= baselineTotalOther {
   851  			t.Errorf("/sched/pauses/stopping/other:seconds sample count %d did not increase from baseline of %d", got, baselineTotalOther)
   852  		}
   853  	}
   854  }
   855  
   856  func TestSchedPauseMetrics(t *testing.T) {
   857  	tests := []struct {
   858  		name string
   859  		isGC bool
   860  		fn   func(t *testing.T)
   861  	}{
   862  		{
   863  			name: "runtime.GC",
   864  			isGC: true,
   865  			fn: func(t *testing.T) {
   866  				runtime.GC()
   867  			},
   868  		},
   869  		{
   870  			name: "runtime.GOMAXPROCS",
   871  			fn: func(t *testing.T) {
   872  				if runtime.GOARCH == "wasm" {
   873  					t.Skip("GOMAXPROCS >1 not supported on wasm")
   874  				}
   875  
   876  				n := runtime.GOMAXPROCS(0)
   877  				defer runtime.GOMAXPROCS(n)
   878  
   879  				runtime.GOMAXPROCS(n + 1)
   880  			},
   881  		},
   882  		{
   883  			name: "runtime.GoroutineProfile",
   884  			fn: func(t *testing.T) {
   885  				var s [1]runtime.StackRecord
   886  				runtime.GoroutineProfile(s[:])
   887  			},
   888  		},
   889  		{
   890  			name: "runtime.ReadMemStats",
   891  			fn: func(t *testing.T) {
   892  				var mstats runtime.MemStats
   893  				runtime.ReadMemStats(&mstats)
   894  			},
   895  		},
   896  		{
   897  			name: "runtime.Stack",
   898  			fn: func(t *testing.T) {
   899  				var b [64]byte
   900  				runtime.Stack(b[:], true)
   901  			},
   902  		},
   903  		{
   904  			name: "runtime/debug.WriteHeapDump",
   905  			fn: func(t *testing.T) {
   906  				if runtime.GOOS == "js" {
   907  					t.Skip("WriteHeapDump not supported on js")
   908  				}
   909  
   910  				f, err := os.CreateTemp(t.TempDir(), "heapdumptest")
   911  				if err != nil {
   912  					t.Fatalf("os.CreateTemp failed: %v", err)
   913  				}
   914  				defer os.Remove(f.Name())
   915  				defer f.Close()
   916  				debug.WriteHeapDump(f.Fd())
   917  			},
   918  		},
   919  		{
   920  			name: "runtime/trace.Start",
   921  			fn: func(t *testing.T) {
   922  				if trace.IsEnabled() {
   923  					t.Skip("tracing already enabled")
   924  				}
   925  
   926  				var buf bytes.Buffer
   927  				if err := trace.Start(&buf); err != nil {
   928  					t.Errorf("trace.Start err got %v want nil", err)
   929  				}
   930  				trace.Stop()
   931  			},
   932  		},
   933  	}
   934  
   935  	// These tests count STW pauses, classified based on whether they're related
   936  	// to the GC or not. Disable automatic GC cycles during the test so we don't
   937  	// have an incidental GC pause when we're trying to observe only
   938  	// non-GC-related pauses. This is especially important for the
   939  	// runtime/trace.Start test, since (as of this writing) that will block
   940  	// until any active GC mark phase completes.
   941  	defer debug.SetGCPercent(debug.SetGCPercent(-1))
   942  	runtime.GC()
   943  
   944  	for _, tc := range tests {
   945  		t.Run(tc.name, func(t *testing.T) {
   946  			testSchedPauseMetrics(t, tc.fn, tc.isGC)
   947  		})
   948  	}
   949  }
   950  
   951  func TestRuntimeLockMetricsAndProfile(t *testing.T) {
   952  	old := runtime.SetMutexProfileFraction(0) // enabled during sub-tests
   953  	defer runtime.SetMutexProfileFraction(old)
   954  	if old != 0 {
   955  		t.Fatalf("need MutexProfileRate 0, got %d", old)
   956  	}
   957  
   958  	{
   959  		before := os.Getenv("GODEBUG")
   960  		for _, s := range strings.Split(before, ",") {
   961  			if strings.HasPrefix(s, "runtimecontentionstacks=") {
   962  				t.Logf("GODEBUG includes explicit setting %q", s)
   963  			}
   964  		}
   965  		defer func() { os.Setenv("GODEBUG", before) }()
   966  		os.Setenv("GODEBUG", fmt.Sprintf("%s,runtimecontentionstacks=1", before))
   967  	}
   968  
   969  	t.Logf("NumCPU %d", runtime.NumCPU())
   970  	t.Logf("GOMAXPROCS %d", runtime.GOMAXPROCS(0))
   971  	if minCPU := 2; runtime.NumCPU() < minCPU {
   972  		t.Skipf("creating and observing contention on runtime-internal locks requires NumCPU >= %d", minCPU)
   973  	}
   974  
   975  	loadProfile := func(t *testing.T) *profile.Profile {
   976  		var w bytes.Buffer
   977  		pprof.Lookup("mutex").WriteTo(&w, 0)
   978  		p, err := profile.Parse(&w)
   979  		if err != nil {
   980  			t.Fatalf("failed to parse profile: %v", err)
   981  		}
   982  		if err := p.CheckValid(); err != nil {
   983  			t.Fatalf("invalid profile: %v", err)
   984  		}
   985  		return p
   986  	}
   987  
   988  	measureDelta := func(t *testing.T, fn func()) (metricGrowth, profileGrowth float64, p *profile.Profile) {
   989  		beforeProfile := loadProfile(t)
   990  		beforeMetrics := []metrics.Sample{{Name: "/sync/mutex/wait/total:seconds"}}
   991  		metrics.Read(beforeMetrics)
   992  
   993  		fn()
   994  
   995  		afterProfile := loadProfile(t)
   996  		afterMetrics := []metrics.Sample{{Name: "/sync/mutex/wait/total:seconds"}}
   997  		metrics.Read(afterMetrics)
   998  
   999  		sumSamples := func(p *profile.Profile, i int) int64 {
  1000  			var sum int64
  1001  			for _, s := range p.Sample {
  1002  				sum += s.Value[i]
  1003  			}
  1004  			return sum
  1005  		}
  1006  
  1007  		metricGrowth = afterMetrics[0].Value.Float64() - beforeMetrics[0].Value.Float64()
  1008  		profileGrowth = float64(sumSamples(afterProfile, 1)-sumSamples(beforeProfile, 1)) * time.Nanosecond.Seconds()
  1009  
  1010  		// The internal/profile package does not support compaction; this delta
  1011  		// profile will include separate positive and negative entries.
  1012  		p = afterProfile.Copy()
  1013  		if len(beforeProfile.Sample) > 0 {
  1014  			err := p.Merge(beforeProfile, -1)
  1015  			if err != nil {
  1016  				t.Fatalf("Merge profiles: %v", err)
  1017  			}
  1018  		}
  1019  
  1020  		return metricGrowth, profileGrowth, p
  1021  	}
  1022  
  1023  	testcase := func(strictTiming bool, acceptStacks [][]string, workers int, fn func() bool) func(t *testing.T) (metricGrowth, profileGrowth float64, n, value int64) {
  1024  		return func(t *testing.T) (metricGrowth, profileGrowth float64, n, value int64) {
  1025  			metricGrowth, profileGrowth, p := measureDelta(t, func() {
  1026  				var started, stopped sync.WaitGroup
  1027  				started.Add(workers)
  1028  				stopped.Add(workers)
  1029  				for i := 0; i < workers; i++ {
  1030  					w := &contentionWorker{
  1031  						before: func() {
  1032  							started.Done()
  1033  							started.Wait()
  1034  						},
  1035  						after: func() {
  1036  							stopped.Done()
  1037  						},
  1038  						fn: fn,
  1039  					}
  1040  					go w.run()
  1041  				}
  1042  				stopped.Wait()
  1043  			})
  1044  
  1045  			if profileGrowth == 0 {
  1046  				t.Errorf("no increase in mutex profile")
  1047  			}
  1048  			if metricGrowth == 0 && strictTiming {
  1049  				// If the critical section is very short, systems with low timer
  1050  				// resolution may be unable to measure it via nanotime.
  1051  				//
  1052  				// This is sampled at 1 per gTrackingPeriod, but the explicit
  1053  				// runtime.mutex tests create 200 contention events. Observing
  1054  				// zero of those has a probability of (7/8)^200 = 2.5e-12 which
  1055  				// is acceptably low (though the calculation has a tenuous
  1056  				// dependency on cheaprandn being a good-enough source of
  1057  				// entropy).
  1058  				t.Errorf("no increase in /sync/mutex/wait/total:seconds metric")
  1059  			}
  1060  			// This comparison is possible because the time measurements in support of
  1061  			// runtime/pprof and runtime/metrics for runtime-internal locks are so close
  1062  			// together. It doesn't work as well for user-space contention, where the
  1063  			// involved goroutines are not _Grunnable the whole time and so need to pass
  1064  			// through the scheduler.
  1065  			t.Logf("lock contention growth in runtime/pprof's view  (%fs)", profileGrowth)
  1066  			t.Logf("lock contention growth in runtime/metrics' view (%fs)", metricGrowth)
  1067  
  1068  			acceptStacks = append([][]string(nil), acceptStacks...)
  1069  			for i, stk := range acceptStacks {
  1070  				if goexperiment.StaticLockRanking {
  1071  					if !slices.ContainsFunc(stk, func(s string) bool {
  1072  						return s == "runtime.systemstack" || s == "runtime.mcall" || s == "runtime.mstart"
  1073  					}) {
  1074  						// stk is a call stack that is still on the user stack when
  1075  						// it calls runtime.unlock. Add the extra function that
  1076  						// we'll see, when the static lock ranking implementation of
  1077  						// runtime.unlockWithRank switches to the system stack.
  1078  						stk = append([]string{"runtime.unlockWithRank"}, stk...)
  1079  					}
  1080  				}
  1081  				acceptStacks[i] = stk
  1082  			}
  1083  
  1084  			var stks [][]string
  1085  			values := make([][2]int64, len(acceptStacks))
  1086  			for _, s := range p.Sample {
  1087  				var have []string
  1088  				for _, loc := range s.Location {
  1089  					for _, line := range loc.Line {
  1090  						have = append(have, line.Function.Name)
  1091  					}
  1092  				}
  1093  				stks = append(stks, have)
  1094  				for i, stk := range acceptStacks {
  1095  					if slices.Equal(have, stk) {
  1096  						values[i][0] += s.Value[0]
  1097  						values[i][1] += s.Value[1]
  1098  					}
  1099  				}
  1100  			}
  1101  			for i, stk := range acceptStacks {
  1102  				n += values[i][0]
  1103  				value += values[i][1]
  1104  				t.Logf("stack %v has samples totaling n=%d value=%d", stk, values[i][0], values[i][1])
  1105  			}
  1106  			if n == 0 && value == 0 {
  1107  				t.Logf("profile:\n%s", p)
  1108  				for _, have := range stks {
  1109  					t.Logf("have stack %v", have)
  1110  				}
  1111  				for _, stk := range acceptStacks {
  1112  					t.Errorf("want stack %v", stk)
  1113  				}
  1114  			}
  1115  
  1116  			return metricGrowth, profileGrowth, n, value
  1117  		}
  1118  	}
  1119  
  1120  	name := t.Name()
  1121  
  1122  	t.Run("runtime.lock", func(t *testing.T) {
  1123  		mus := make([]runtime.Mutex, 200)
  1124  		var needContention atomic.Int64
  1125  		delay := 100 * time.Microsecond // large relative to system noise, for comparison between clocks
  1126  		delayMicros := delay.Microseconds()
  1127  
  1128  		// The goroutine that acquires the lock will only proceed when it
  1129  		// detects that its partner is contended for the lock. That will lead to
  1130  		// live-lock if anything (such as a STW) prevents the partner goroutine
  1131  		// from running. Allowing the contention workers to pause and restart
  1132  		// (to allow a STW to proceed) makes it harder to confirm that we're
  1133  		// counting the correct number of contention events, since some locks
  1134  		// will end up contended twice. Instead, disable the GC.
  1135  		defer debug.SetGCPercent(debug.SetGCPercent(-1))
  1136  
  1137  		const workers = 2
  1138  		if runtime.GOMAXPROCS(0) < workers {
  1139  			t.Skipf("contention on runtime-internal locks requires GOMAXPROCS >= %d", workers)
  1140  		}
  1141  
  1142  		fn := func() bool {
  1143  			n := int(needContention.Load())
  1144  			if n < 0 {
  1145  				return false
  1146  			}
  1147  			mu := &mus[n]
  1148  
  1149  			runtime.Lock(mu)
  1150  			for int(needContention.Load()) == n {
  1151  				if runtime.MutexContended(mu) {
  1152  					// make them wait a little while
  1153  					for start := runtime.Nanotime(); (runtime.Nanotime()-start)/1000 < delayMicros; {
  1154  						runtime.Usleep(uint32(delayMicros))
  1155  					}
  1156  					break
  1157  				}
  1158  			}
  1159  			runtime.Unlock(mu)
  1160  			needContention.Store(int64(n - 1))
  1161  
  1162  			return true
  1163  		}
  1164  
  1165  		stks := [][]string{{
  1166  			"runtime.unlock",
  1167  			"runtime_test." + name + ".func5.1",
  1168  			"runtime_test.(*contentionWorker).run",
  1169  		}}
  1170  
  1171  		t.Run("sample-1", func(t *testing.T) {
  1172  			old := runtime.SetMutexProfileFraction(1)
  1173  			defer runtime.SetMutexProfileFraction(old)
  1174  
  1175  			needContention.Store(int64(len(mus) - 1))
  1176  			metricGrowth, profileGrowth, n, _ := testcase(true, stks, workers, fn)(t)
  1177  
  1178  			t.Run("metric", func(t *testing.T) {
  1179  				// The runtime/metrics view may be sampled at 1 per
  1180  				// gTrackingPeriod, so we don't have a hard lower bound here.
  1181  				testenv.SkipFlaky(t, 64253)
  1182  
  1183  				if have, want := metricGrowth, delay.Seconds()*float64(len(mus)); have < want {
  1184  					// The test imposes a delay with usleep, verified with calls to
  1185  					// nanotime. Compare against the runtime/metrics package's view
  1186  					// (based on nanotime) rather than runtime/pprof's view (based
  1187  					// on cputicks).
  1188  					t.Errorf("runtime/metrics reported less than the known minimum contention duration (%fs < %fs)", have, want)
  1189  				}
  1190  			})
  1191  			if have, want := n, int64(len(mus)); have != want {
  1192  				t.Errorf("mutex profile reported contention count different from the known true count (%d != %d)", have, want)
  1193  			}
  1194  
  1195  			const slop = 1.5 // account for nanotime vs cputicks
  1196  			t.Run("compare timers", func(t *testing.T) {
  1197  				testenv.SkipFlaky(t, 64253)
  1198  				if profileGrowth > slop*metricGrowth || metricGrowth > slop*profileGrowth {
  1199  					t.Errorf("views differ by more than %fx", slop)
  1200  				}
  1201  			})
  1202  		})
  1203  
  1204  		t.Run("sample-2", func(t *testing.T) {
  1205  			testenv.SkipFlaky(t, 64253)
  1206  
  1207  			old := runtime.SetMutexProfileFraction(2)
  1208  			defer runtime.SetMutexProfileFraction(old)
  1209  
  1210  			needContention.Store(int64(len(mus) - 1))
  1211  			metricGrowth, profileGrowth, n, _ := testcase(true, stks, workers, fn)(t)
  1212  
  1213  			// With 100 trials and profile fraction of 2, we expect to capture
  1214  			// 50 samples. Allow the test to pass if we get at least 20 samples;
  1215  			// the CDF of the binomial distribution says there's less than a
  1216  			// 1e-9 chance of that, which is an acceptably low flakiness rate.
  1217  			const samplingSlop = 2.5
  1218  
  1219  			if have, want := metricGrowth, delay.Seconds()*float64(len(mus)); samplingSlop*have < want {
  1220  				// The test imposes a delay with usleep, verified with calls to
  1221  				// nanotime. Compare against the runtime/metrics package's view
  1222  				// (based on nanotime) rather than runtime/pprof's view (based
  1223  				// on cputicks).
  1224  				t.Errorf("runtime/metrics reported less than the known minimum contention duration (%f * %fs < %fs)", samplingSlop, have, want)
  1225  			}
  1226  			if have, want := n, int64(len(mus)); float64(have) > float64(want)*samplingSlop || float64(want) > float64(have)*samplingSlop {
  1227  				t.Errorf("mutex profile reported contention count too different from the expected count (%d far from %d)", have, want)
  1228  			}
  1229  
  1230  			const timerSlop = 1.5 * samplingSlop // account for nanotime vs cputicks, plus the two views' independent sampling
  1231  			if profileGrowth > timerSlop*metricGrowth || metricGrowth > timerSlop*profileGrowth {
  1232  				t.Errorf("views differ by more than %fx", timerSlop)
  1233  			}
  1234  		})
  1235  	})
  1236  
  1237  	t.Run("runtime.semrelease", func(t *testing.T) {
  1238  		testenv.SkipFlaky(t, 64253)
  1239  
  1240  		old := runtime.SetMutexProfileFraction(1)
  1241  		defer runtime.SetMutexProfileFraction(old)
  1242  
  1243  		const workers = 3
  1244  		if runtime.GOMAXPROCS(0) < workers {
  1245  			t.Skipf("creating and observing contention on runtime-internal semaphores requires GOMAXPROCS >= %d", workers)
  1246  		}
  1247  
  1248  		var sem uint32 = 1
  1249  		var tries atomic.Int32
  1250  		tries.Store(10_000_000) // prefer controlled failure to timeout
  1251  		var sawContention atomic.Int32
  1252  		var need int32 = 1
  1253  		fn := func() bool {
  1254  			if sawContention.Load() >= need {
  1255  				return false
  1256  			}
  1257  			if tries.Add(-1) < 0 {
  1258  				return false
  1259  			}
  1260  
  1261  			runtime.Semacquire(&sem)
  1262  			runtime.Semrelease1(&sem, false, 0)
  1263  			if runtime.MutexContended(runtime.SemRootLock(&sem)) {
  1264  				sawContention.Add(1)
  1265  			}
  1266  			return true
  1267  		}
  1268  
  1269  		stks := [][]string{
  1270  			{
  1271  				"runtime.unlock",
  1272  				"runtime.semrelease1",
  1273  				"runtime_test.TestRuntimeLockMetricsAndProfile.func6.1",
  1274  				"runtime_test.(*contentionWorker).run",
  1275  			},
  1276  			{
  1277  				"runtime.unlock",
  1278  				"runtime.semacquire1",
  1279  				"runtime.semacquire",
  1280  				"runtime_test.TestRuntimeLockMetricsAndProfile.func6.1",
  1281  				"runtime_test.(*contentionWorker).run",
  1282  			},
  1283  		}
  1284  
  1285  		// Verify that we get call stack we expect, with anything more than zero
  1286  		// cycles / zero samples. The duration of each contention event is too
  1287  		// small relative to the expected overhead for us to verify its value
  1288  		// more directly. Leave that to the explicit lock/unlock test.
  1289  
  1290  		testcase(false, stks, workers, fn)(t)
  1291  
  1292  		if remaining := tries.Load(); remaining >= 0 {
  1293  			t.Logf("finished test early (%d tries remaining)", remaining)
  1294  		}
  1295  	})
  1296  }
  1297  
  1298  // contentionWorker provides cleaner call stacks for lock contention profile tests
  1299  type contentionWorker struct {
  1300  	before func()
  1301  	fn     func() bool
  1302  	after  func()
  1303  }
  1304  
  1305  func (w *contentionWorker) run() {
  1306  	defer w.after()
  1307  	w.before()
  1308  
  1309  	for w.fn() {
  1310  	}
  1311  }
  1312  
  1313  func TestCPUStats(t *testing.T) {
  1314  	// Run a few GC cycles to get some of the stats to be non-zero.
  1315  	runtime.GC()
  1316  	runtime.GC()
  1317  	runtime.GC()
  1318  
  1319  	// Set GOMAXPROCS high then sleep briefly to ensure we generate
  1320  	// some idle time.
  1321  	oldmaxprocs := runtime.GOMAXPROCS(10)
  1322  	time.Sleep(time.Millisecond)
  1323  	runtime.GOMAXPROCS(oldmaxprocs)
  1324  
  1325  	stats := runtime.ReadCPUStats()
  1326  	gcTotal := stats.GCAssistTime + stats.GCDedicatedTime + stats.GCIdleTime + stats.GCPauseTime
  1327  	if gcTotal != stats.GCTotalTime {
  1328  		t.Errorf("manually computed total does not match GCTotalTime: %d cpu-ns vs. %d cpu-ns", gcTotal, stats.GCTotalTime)
  1329  	}
  1330  	scavTotal := stats.ScavengeAssistTime + stats.ScavengeBgTime
  1331  	if scavTotal != stats.ScavengeTotalTime {
  1332  		t.Errorf("manually computed total does not match ScavengeTotalTime: %d cpu-ns vs. %d cpu-ns", scavTotal, stats.ScavengeTotalTime)
  1333  	}
  1334  	total := gcTotal + scavTotal + stats.IdleTime + stats.UserTime
  1335  	if total != stats.TotalTime {
  1336  		t.Errorf("manually computed overall total does not match TotalTime: %d cpu-ns vs. %d cpu-ns", total, stats.TotalTime)
  1337  	}
  1338  	if total == 0 {
  1339  		t.Error("total time is zero")
  1340  	}
  1341  	if gcTotal == 0 {
  1342  		t.Error("GC total time is zero")
  1343  	}
  1344  	if stats.IdleTime == 0 {
  1345  		t.Error("idle time is zero")
  1346  	}
  1347  }
  1348  
  1349  func TestMetricHeapUnusedLargeObjectOverflow(t *testing.T) {
  1350  	// This test makes sure /memory/classes/heap/unused:bytes
  1351  	// doesn't overflow when allocating and deallocating large
  1352  	// objects. It is a regression test for #67019.
  1353  	done := make(chan struct{})
  1354  	var wg sync.WaitGroup
  1355  	wg.Add(1)
  1356  	go func() {
  1357  		defer wg.Done()
  1358  		for {
  1359  			for range 10 {
  1360  				abi.Escape(make([]byte, 1<<20))
  1361  			}
  1362  			runtime.GC()
  1363  			select {
  1364  			case <-done:
  1365  				return
  1366  			default:
  1367  			}
  1368  		}
  1369  	}()
  1370  	s := []metrics.Sample{
  1371  		{Name: "/memory/classes/heap/unused:bytes"},
  1372  	}
  1373  	for range 1000 {
  1374  		metrics.Read(s)
  1375  		if s[0].Value.Uint64() > 1<<40 {
  1376  			t.Errorf("overflow")
  1377  			break
  1378  		}
  1379  	}
  1380  	done <- struct{}{}
  1381  	wg.Wait()
  1382  }
  1383
View as plain text