From fafa2f1627df67747bf41a418738a341e2dca30e Mon Sep 17 00:00:00 2001
From: Magiceses <magiceses0118@gmail.com>
Date: Fri, 17 Jan 2025 15:59:54 +0800
Subject: [PATCH] Make minPenalty and initialPenalty to be configurable.

Change-Id: I4fca014e45652821316101f13dff88c65d9f2213
Signed-off-by: Magiceses <magiceses0118@gmail.com>
---
 pkg/dedup/iter.go | 52 +++++++++++++++++++++++++++++++++++------------
 1 file changed, 39 insertions(+), 13 deletions(-)

diff --git a/pkg/dedup/iter.go b/pkg/dedup/iter.go
index e3b3fe8f0f7..566acd2c640 100644
--- a/pkg/dedup/iter.go
+++ b/pkg/dedup/iter.go
@@ -5,6 +5,8 @@ package dedup
 
 import (
 	"math"
+	"os"
+	"strconv"
 
 	"github.com/prometheus/prometheus/model/histogram"
 	"github.com/prometheus/prometheus/model/labels"
@@ -15,6 +17,39 @@ import (
 	"github.com/thanos-io/thanos/pkg/store/storepb"
 )
 
+var (
+	// Add a minimum penalty value to make it possible to skip the particularly similar point.
+	// The default behavior will traverse all points.
+	// After the setting value is greater than 0, it will skip minPenalty+1 ms time to traverse.
+	// More details: https://github.com/thanos-io/thanos/pull/8061#issue-2794728444
+	minPenalty int64
+
+	// For the series we didn't pick, add a penalty twice as high as the delta of the last two
+	// samples to the next seek against it.
+	// This ensures that we don't pick a sample too close, which would increase the overall
+	// sample frequency. It also guards against clock drift and inaccuracies during
+	// timestamp assignment.
+	// If we don't know a delta yet, we pick 5000 as a constant, which is based on the knowledge
+	// that timestamps are in milliseconds and sampling frequencies typically multiple seconds long.
+	initialPenalty int64 = 5000
+)
+
+func init() {
+	// Modify minPenalty for deduplication if the environment variable is set.
+	minPtEnv := os.Getenv("THANOS_DEDUP_MIN_PENALTY")
+	minPt, err := strconv.ParseInt(minPtEnv, 10, 0)
+	if err == nil && minPt >= 0 {
+		minPenalty = minPt
+	}
+
+	// Modify initialPenalty for deduplication if the environment variable is set.
+	initialPtEnv := os.Getenv("THANOS_DEDUP_INIT_PENALTY")
+	initialPt, err := strconv.ParseInt(initialPtEnv, 10, 0)
+	if err == nil && initialPt >= 0 {
+		initialPenalty = initialPt
+	}
+}
+
 type dedupSeriesSet struct {
 	set       storage.SeriesSet
 	isCounter bool
@@ -318,7 +353,7 @@ func (it *dedupSeriesIterator) Next() chunkenc.ValueType {
 		if it.bval != chunkenc.ValNone {
 			it.lastT = it.b.AtT()
 			it.lastIter = it.b
-			it.penB = 0
+			it.penB = minPenalty
 		}
 		return it.bval
 	}
@@ -326,7 +361,7 @@ func (it *dedupSeriesIterator) Next() chunkenc.ValueType {
 		it.useA = true
 		it.lastT = it.a.AtT()
 		it.lastIter = it.a
-		it.penA = 0
+		it.penA = minPenalty
 		return it.aval
 	}
 	// General case where both iterators still have data. We pick the one
@@ -338,22 +373,13 @@ func (it *dedupSeriesIterator) Next() chunkenc.ValueType {
 
 	it.useA = ta <= tb
 
-	// For the series we didn't pick, add a penalty twice as high as the delta of the last two
-	// samples to the next seek against it.
-	// This ensures that we don't pick a sample too close, which would increase the overall
-	// sample frequency. It also guards against clock drift and inaccuracies during
-	// timestamp assignment.
-	// If we don't know a delta yet, we pick 5000 as a constant, which is based on the knowledge
-	// that timestamps are in milliseconds and sampling frequencies typically multiple seconds long.
-	const initialPenalty = 5000
-
 	if it.useA {
 		if it.lastT != math.MinInt64 {
 			it.penB = 2 * (ta - it.lastT)
 		} else {
 			it.penB = initialPenalty
 		}
-		it.penA = 0
+		it.penA = minPenalty
 		it.lastT = ta
 		it.lastIter = it.a
 
@@ -364,7 +390,7 @@ func (it *dedupSeriesIterator) Next() chunkenc.ValueType {
 	} else {
 		it.penA = initialPenalty
 	}
-	it.penB = 0
+	it.penB = minPenalty
 	it.lastT = tb
 	it.lastIter = it.b
 	return it.bval