Writing inaccurate benchmarks
Not resetting or pausing the timer
// BenchmarkFoo times foo after a single up-front call to setup.
func BenchmarkFoo(b *testing.B) {
	setup()

	// Discard whatever time has been recorded so far, so the call to setup
	// above is not counted in the reported result.
	b.ResetTimer()

	for n := 0; n < b.N; n++ {
		foo()
	}
}
// BenchmarkFoo times foo when setup has to run before every single call.
func BenchmarkFoo(b *testing.B) {
	for n := 0; n < b.N; n++ {
		// Pause the timer around setup so only foo is measured.
		b.StopTimer()
		setup()
		b.StartTimer()

		foo()
	}
}
Micro-benchmarks
// BenchmarkAtomicStoreInt32 times atomic.StoreInt32 by storing the value 1
// into a local int32 b.N times.
func BenchmarkAtomicStoreInt32(b *testing.B) {
	var target int32
	for n := 0; n < b.N; n++ {
		atomic.StoreInt32(&target, 1)
	}
}
// BenchmarkAtomicStoreInt64 times atomic.StoreInt64 by storing the value 1
// into a local int64 b.N times.
func BenchmarkAtomicStoreInt64(b *testing.B) {
	var target int64
	for n := 0; n < b.N; n++ {
		atomic.StoreInt64(&target, 1)
	}
}
Run the benchmark multiple times and evaluate statistics using benchstat
go test -bench=. -count=10 | tee stats.txt
benchstat stats.txt
Compiler optimizations
This function is likely to be inlined; if its result is then never used, the compiler can optimize the call away entirely, making the benchmark useless
// Bit masks used by popcnt.
const (
	m1  = 0x5555555555555555 // 0101...: the low bit of every 2-bit pair
	m2  = 0x3333333333333333 // 0011...: the low two bits of every 4-bit group
	m4  = 0x0f0f0f0f0f0f0f0f // 00001111...: the low nibble of every byte
	h01 = 0x0101010101010101 // 0x01 repeated in every byte
)

// popcnt returns the number of bits set to 1 in x.
func popcnt(x uint64) uint64 {
	// Each 2-bit pair now holds the count of set bits in that pair.
	pairs := x - ((x >> 1) & m1)
	// Each 4-bit group now holds the count of set bits in that group.
	quads := (pairs & m2) + ((pairs >> 2) & m2)
	// Each byte now holds the count of set bits in that byte.
	perByte := (quads + (quads >> 4)) & m4
	// Multiplying by h01 accumulates all byte counts into the top byte.
	return (perByte * h01) >> 56
}
We can keep the compiler from optimizing the call away by assigning the result to a local variable inside the loop and then assigning the final value to a package-level variable after the loop.
// global receives the final popcnt result computed by BenchmarkPopcnt.
var global uint64

// BenchmarkPopcnt times popcnt, calling it with each loop index as input.
func BenchmarkPopcnt(b *testing.B) {
	var last uint64
	for n := 0; n < b.N; n++ {
		// Keep the result in a local variable on every iteration.
		last = popcnt(uint64(n))
	}
	// Copy the most recent result to the package-level variable once the
	// loop has finished.
	global = last
}
Observer effect
Mistake
Reusing the same resources across iterations lets the CPU keep the data in its caches, so we end up benchmarking cache hits rather than the function's real-world behavior
// rows is the row count passed to createMatrix512.
const rows = 1000

// res receives the final sum computed by the benchmark.
var res int64

// BenchmarkCalculateSum512 times calculateSum512 on a matrix that is built
// once, before the timer is reset, and then passed to every iteration.
//
// This is the "mistake" variant: the same matrix is deliberately reused, so
// its data may stay in the CPU caches from one iteration to the next.
func BenchmarkCalculateSum512(b *testing.B) {
	var sum int64
	s := createMatrix512(rows)
	// Exclude the matrix construction above from the measurement.
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// we keep reusing the same matrix
		sum = calculateSum512(s)
	}
	// Copy the last result to the package-level variable after the loop.
	res = sum
}
Fix
// BenchmarkCalculateSum512 times calculateSum512, giving every iteration a
// matrix of its own.
func BenchmarkCalculateSum512(b *testing.B) {
	var total int64
	for n := 0; n < b.N; n++ {
		// Build a new matrix with the timer paused, so construction is not
		// measured and no iteration reuses the previous iteration's data.
		b.StopTimer()
		matrix := createMatrix512(rows)
		b.StartTimer()

		total = calculateSum512(matrix)
	}
	// Copy the last result to the package-level variable after the loop.
	res = total
}