-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathhash_bench_test.go
More file actions
373 lines (344 loc) · 11.9 KB
/
hash_bench_test.go
File metadata and controls
373 lines (344 loc) · 11.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
package ribbonGo
import (
"fmt"
"testing"
)
// =============================================================================
// Benchmarks — hash pipeline hot path
//
// The derive() function is the innermost loop of both banding (construction)
// and seed-retry. Every key passes through it once per seed attempt:
//
// for _, h := range storedHashes { hr := hasher.derive(h) }
//
// Therefore its throughput directly determines construction speed.
//
// We benchmark:
// 1. derive() — full pipeline (rehash → start + coeff + result)
// 2. rehash() — seed remixing in isolation
// 3. getStart() — fastRange64 mapping
// 4. getCoeffRow() — coefficient derivation (width-dependent)
// 5. getResultRow() — fingerprint extraction
// 6. keyHash() — Phase 1 (XXH3), for reference
//
// Each is measured across ribbon widths (32/64/128), with and without
// firstCoeffAlwaysOne, to surface any width-dependent cost.
// =============================================================================
// precomputedHashes generates N deterministic 64-bit hashes to feed into
// Phase 2 benchmarks, removing XXH3 from the measured path.
// precomputedHashes returns n deterministic 64-bit key hashes, letting the
// Phase 2 benchmarks keep XXH3 (Phase 1) out of the measured path.
func precomputedHashes(h *standardHasher, n int) []uint64 {
	out := make([]uint64, 0, n)
	for i := 0; i < n; i++ {
		out = append(out, h.keyHash([]byte(fmt.Sprintf("bench_key_%d", i))))
	}
	return out
}
// ---------------------------------------------------------------------------
// 1. derive — full pipeline
//
// Full Phase 2 pipeline: rehash → getStart + getCoeffRow + getResultRow.
// This is the per-key cost on every seed attempt during banding.
//
// derive() is hand-inlined and branchless (no switch, no if). The compiler
// inlines it into the caller (cost 67, budget 80), so the benchmark loop
// runs the entire pipeline without a function-call boundary.
//
// Optimisations applied:
// (a) Single h*kCoeffAndResultFactor multiply shared by coeff and result.
// (b) Per-width switch replaced by pre-computed masks (coeffLoMask,
// coeffHiMask, coeffXor) — branchless coefficient derivation.
// (c) forceFirstCoeff branch replaced by pre-computed coeffOrMask.
// (d) resultBits shift replaced by pre-computed resultMask.
//
// Reference results (Apple M3 Pro, Go 1.25, -benchtime=2s):
//
// Width fcao ns/op allocs/op
// ───── ───── ───── ─────────
// w=32 true 0.39 0
// w=32 false 0.39 0
// w=64 true 0.39 0
// w=64 false 0.39 0
// w=128 true 0.39 0
// w=128 false 0.39 0
//
// Key observations:
// • ~0.39 ns/key with zero allocations — fully inlined, branchless.
// • All widths identical (masks absorb the width difference).
// • 6.9× faster than the pre-optimisation baseline of ~2.7 ns/key.
// ---------------------------------------------------------------------------
// BenchmarkDerive measures the full Phase 2 pipeline (rehash → start +
// coeff + result) per key, across ribbon widths and both settings of
// firstCoeffAlwaysOne.
func BenchmarkDerive(b *testing.B) {
	widths := []uint32{32, 64, 128}
	for _, width := range widths {
		for _, firstCoeff := range []bool{true, false} {
			label := fmt.Sprintf("w=%d/fcao=%v", width, firstCoeff)
			b.Run(label, func(b *testing.B) {
				hasher := newStandardHasher(width, 10000, 7, firstCoeff)
				hasher.setOrdinalSeed(0)
				pre := precomputedHashes(hasher, 4096)
				idxMask := len(pre) - 1 // len is a power of two, so & replaces %
				b.ResetTimer()
				b.ReportAllocs()
				var sink hashResult
				for i := 0; i < b.N; i++ {
					sink = hasher.derive(pre[i&idxMask])
				}
				_ = sink
			})
		}
	}
}
// ---------------------------------------------------------------------------
// 2. rehash — seed remixing
//
// Isolated cost of (hash ^ rawSeed) * kRehashFactor.
//
// Reference results (Apple M3 Pro, Go 1.25, -benchtime=2s):
//
// ns/op allocs/op
// ───── ─────────
// 0.39 0
//
// Key observation: single multiply — sub-nanosecond, fully inlined.
// ---------------------------------------------------------------------------
// BenchmarkRehash isolates the seed-remixing step of the pipeline.
func BenchmarkRehash(b *testing.B) {
	hasher := newStandardHasher(128, 10000, 7, true)
	hasher.setOrdinalSeed(0)
	pre := precomputedHashes(hasher, 4096)
	idxMask := len(pre) - 1 // power-of-two length: & is a cheap modulo
	b.ResetTimer()
	b.ReportAllocs()
	var sink uint64
	for i := 0; i < b.N; i++ {
		sink = hasher.rehash(pre[i&idxMask])
	}
	_ = sink
}
// ---------------------------------------------------------------------------
// 3. getStart — fastRange64
//
// Maps a 64-bit hash uniformly into [0, numStarts) via 128-bit multiply.
//
// Reference results (Apple M3 Pro, Go 1.25, -benchtime=2s):
//
// ns/op allocs/op
// ───── ─────────
// 0.40 0
//
// Key observation: single math/bits.Mul64 — sub-nanosecond, fully inlined.
// ---------------------------------------------------------------------------
// BenchmarkGetStart isolates the fastRange64 mapping into [0, numStarts).
func BenchmarkGetStart(b *testing.B) {
	hasher := newStandardHasher(128, 10000, 7, true)
	hasher.setOrdinalSeed(0)
	pre := precomputedHashes(hasher, 4096)
	idxMask := len(pre) - 1 // power-of-two length: & is a cheap modulo
	// Apply rehash up front so only getStart sits inside the timed loop.
	mixed := make([]uint64, len(pre))
	for i := range pre {
		mixed[i] = hasher.rehash(pre[i])
	}
	b.ResetTimer()
	b.ReportAllocs()
	var sink uint32
	for i := 0; i < b.N; i++ {
		sink = hasher.getStart(mixed[i&idxMask])
	}
	_ = sink
}
// ---------------------------------------------------------------------------
// 4. getCoeffRow — width-dependent coefficient derivation
//
// Derives the w-bit coefficient row from a rehashed hash. Contains a
// 3-way switch on w and an optional |=1 for forceFirstCoeff.
//
// Reference results (Apple M3 Pro, Go 1.25, -benchtime=2s):
//
// Width fcao ns/op allocs/op
// ───── ───── ───── ─────────
// w=32 true 0.39 0
// w=32 false 0.39 0
// w=64 true 0.39 0
// w=64 false 0.39 0
// w=128 true 0.39 0
// w=128 false 0.39 0
//
// Key observations:
// • All widths identical — the switch is branch-predicted perfectly
// because the width is fixed for the hasher's lifetime.
// • fcao adds no measurable cost.
// ---------------------------------------------------------------------------
// BenchmarkGetCoeffRow isolates coefficient-row derivation across ribbon
// widths and both settings of firstCoeffAlwaysOne.
func BenchmarkGetCoeffRow(b *testing.B) {
	widths := []uint32{32, 64, 128}
	for _, width := range widths {
		for _, firstCoeff := range []bool{true, false} {
			label := fmt.Sprintf("w=%d/fcao=%v", width, firstCoeff)
			b.Run(label, func(b *testing.B) {
				hasher := newStandardHasher(width, 10000, 7, firstCoeff)
				hasher.setOrdinalSeed(0)
				pre := precomputedHashes(hasher, 4096)
				idxMask := len(pre) - 1 // power-of-two length: & is a cheap modulo
				// Pre-rehash so only getCoeffRow is inside the timed loop.
				mixed := make([]uint64, len(pre))
				for i := range pre {
					mixed[i] = hasher.rehash(pre[i])
				}
				b.ResetTimer()
				b.ReportAllocs()
				var sink uint128
				for i := 0; i < b.N; i++ {
					sink = hasher.getCoeffRow(mixed[i&idxMask])
				}
				_ = sink
			})
		}
	}
}
// ---------------------------------------------------------------------------
// 5. getResultRow — fingerprint extraction
//
// Derives the r-bit fingerprint: multiply → byte-swap → mask.
//
// Reference results (Apple M3 Pro, Go 1.25, -benchtime=2s):
//
// r-bits ns/op allocs/op
// ────── ───── ─────────
// r=1 0.39 0
// r=7 0.40 0
// r=8 0.39 0
//
// Key observation: identical cost regardless of r — the mask is a
// constant after construction, so there is no branch on r.
// ---------------------------------------------------------------------------
// BenchmarkGetResultRow isolates r-bit fingerprint extraction for several
// fingerprint widths.
func BenchmarkGetResultRow(b *testing.B) {
	for _, rBits := range []uint{1, 7, 8} {
		b.Run(fmt.Sprintf("r=%d", rBits), func(b *testing.B) {
			hasher := newStandardHasher(128, 10000, rBits, true)
			hasher.setOrdinalSeed(0)
			pre := precomputedHashes(hasher, 4096)
			idxMask := len(pre) - 1 // power-of-two length: & is a cheap modulo
			// Pre-rehash so only getResultRow is inside the timed loop.
			mixed := make([]uint64, len(pre))
			for i := range pre {
				mixed[i] = hasher.rehash(pre[i])
			}
			b.ResetTimer()
			b.ReportAllocs()
			var sink uint8
			for i := 0; i < b.N; i++ {
				sink = hasher.getResultRow(mixed[i&idxMask])
			}
			_ = sink
		})
	}
}
// ---------------------------------------------------------------------------
// 6. keyHash (Phase 1) — XXH3, for reference baseline
//
// Phase 1 cost: XXH3_64bits over the raw key bytes. Called once per key
// (amortised across all seed attempts).
//
// Reference results (Apple M3 Pro, Go 1.25, -benchtime=2s):
//
// Key size ns/op MB/s allocs/op
// ──────── ───── ──────── ─────────
// 8 B 2.33 3,438 0
// 32 B 3.24 9,888 0
// 128 B 8.36 15,313 0
// 1024 B 49.66 20,622 0
//
// Key observations:
// • For typical filter keys (8–32 B), keyHash is now the dominant cost
// (~6× more expensive than derive at 0.39 ns).
// • XXH3 saturates memory bandwidth around 20 GB/s for large keys.
// • Total per-key cost (Phase 1 + Phase 2) ≈ 2.7 ns for 8 B keys.
// ---------------------------------------------------------------------------
// BenchmarkKeyHash measures Phase 1 (XXH3 over raw key bytes) at several
// key sizes, reporting throughput via SetBytes.
func BenchmarkKeyHash(b *testing.B) {
	for _, keyLen := range []int{8, 32, 128, 1024} {
		b.Run(fmt.Sprintf("keySize=%d", keyLen), func(b *testing.B) {
			hasher := newStandardHasher(128, 10000, 7, true)
			// Deterministic key content: bytes 0, 1, 2, ...
			key := make([]byte, keyLen)
			for i := range key {
				key[i] = byte(i)
			}
			b.ResetTimer()
			b.ReportAllocs()
			b.SetBytes(int64(keyLen))
			var sink uint64
			for i := 0; i < b.N; i++ {
				sink = hasher.keyHash(key)
			}
			_ = sink
		})
	}
}
// ---------------------------------------------------------------------------
// 7. Throughput — derive() in a realistic batch loop
//
// Simulates a full seed-attempt pass over 100K pre-hashed keys.
// Reports wall-clock time per pass and throughput in keys/op.
//
// Reference results (Apple M3 Pro, Go 1.25, -benchtime=2s):
//
// Width ns/op keys/op ~keys/sec allocs/op
// ───── ─────── ─────── ────────── ─────────
// w=64 39,142 100,000 ~2.56B 0
// w=128 38,730 100,000 ~2.58B 0
//
// Key observations:
// • ~0.39 ns/key in a tight loop — matches per-call BenchmarkDerive.
// • ~2.5 billion keys/sec means a 1M-key filter's Phase 2 takes ~0.39 ms
// per seed attempt.
// • 7× faster than pre-optimisation baseline (~271 µs → ~39 µs).
// • w=64 and w=128 are indistinguishable at batch scale.
// ---------------------------------------------------------------------------
// BenchmarkDeriveThroughput runs derive() over a realistic batch: one
// benchmark iteration is a full seed-attempt pass over numKeys pre-hashed
// keys. Throughput is surfaced via the keys/op custom metric.
func BenchmarkDeriveThroughput(b *testing.B) {
	for _, width := range []uint32{64, 128} {
		b.Run(fmt.Sprintf("w=%d", width), func(b *testing.B) {
			const numKeys = 100_000
			hasher := newStandardHasher(width, numKeys, 7, true)
			hasher.setOrdinalSeed(0)
			pre := precomputedHashes(hasher, numKeys)
			b.ResetTimer()
			b.ReportAllocs()
			var sink hashResult
			for i := 0; i < b.N; i++ {
				for _, kh := range pre {
					sink = hasher.derive(kh)
				}
			}
			_ = sink
			b.ReportMetric(float64(numKeys), "keys/op")
		})
	}
}
// ---------------------------------------------------------------------------
// 8. Seed conversion — ordinalSeedToRaw / rawSeedToOrdinal
//
// Bijective mixing between small ordinal seeds and 64-bit raw seeds.
// Called once per seed attempt (not per key), so not on the hot path.
//
// Reference results (Apple M3 Pro, Go 1.25, -benchtime=2s):
//
// Direction ns/op allocs/op
// ───────── ───── ─────────
// ordinalToRaw 0.39 0
// rawToOrdinal 0.39 0
//
// Key observation: multiply + XOR — sub-nanosecond, negligible.
// ---------------------------------------------------------------------------
// BenchmarkSeedConversion measures both directions of the bijective
// ordinal↔raw seed mixing. Called once per seed attempt, so off the
// per-key hot path.
func BenchmarkSeedConversion(b *testing.B) {
	b.Run("ordinalToRaw", func(b *testing.B) {
		var sink uint64
		for n := 0; n < b.N; n++ {
			sink = ordinalSeedToRaw(uint32(n))
		}
		_ = sink
	})
	b.Run("rawToOrdinal", func(b *testing.B) {
		var sink uint32
		for n := 0; n < b.N; n++ {
			sink = rawSeedToOrdinal(uint64(n))
		}
		_ = sink
	})
}