Skip to content

Commit a94d19f

Browse files
committed
shipper: Be strict about upload order unless it's specified so.
Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>
1 parent 8cc9f79 commit a94d19f

File tree

11 files changed

+122
-98
lines changed

11 files changed

+122
-98
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel
2626
- [#2416](https://github.com/thanos-io/thanos/pull/2416) Bucket: Fixed issue #2416 bug in `inspect --sort-by` doesn't work correctly in all cases.
2727
- [#2719](https://github.com/thanos-io/thanos/pull/2719) Query: `irate` and `resets` use now counter downsampling aggregations.
2828
- [#2705](https://github.com/thanos-io/thanos/pull/2705) minio-go: Added support for `af-south-1` and `eu-south-1` regions.
29+
- [#2753](https://github.com/thanos-io/thanos/issues/2753) Sidecar,Receive,Rule: Fixed cause for compactor overlapping blocks in upload error cases.
2930

3031
### Added
3132

cmd/thanos/config.go

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -112,17 +112,23 @@ func (rc *reloaderConfig) registerFlag(cmd *kingpin.CmdClause) *reloaderConfig {
112112
}
113113

114114
type shipperConfig struct {
115-
uploadCompacted bool
116-
ignoreBlockSize bool
115+
uploadCompacted bool
116+
ignoreBlockSize bool
117+
allowOutOfOrderUpload bool
117118
}
118119

119120
func (sc *shipperConfig) registerFlag(cmd *kingpin.CmdClause) *shipperConfig {
120121
cmd.Flag("shipper.upload-compacted",
121-
"If true sidecar will try to upload compacted blocks as well. Useful for migration purposes. Works only if compaction is disabled on Prometheus. Do it once and then disable the flag when done.").
122+
"If true shipper will try to upload compacted blocks as well. Useful for migration purposes. Works only if compaction is disabled on Prometheus. Do it once and then disable the flag when done.").
122123
Default("false").BoolVar(&sc.uploadCompacted)
123124
cmd.Flag("shipper.ignore-unequal-block-size",
124-
"If true sidecar will not require prometheus min and max block size flags to be set to the same value. Only use this if you want to keep long retention and compaction enabled on your Prometheus instance, as in the worst case it can result in ~2h data loss for your Thanos bucket storage.").
125+
"If true shipper will not require prometheus min and max block size flags to be set to the same value. Only use this if you want to keep long retention and compaction enabled on your Prometheus instance, as in the worst case it can result in ~2h data loss for your Thanos bucket storage.").
125126
Default("false").Hidden().BoolVar(&sc.ignoreBlockSize)
127+
cmd.Flag("shipper.allow-out-of-order-uploads",
128+
"If true shipper will skip failed block uploads in given iteration and retry later. This means that some newer blocks might uploaded sooner than older."+
129+
"This will trigger compaction without those blocks, as a resulted create 'valid overlap situation'. Set it to true if you have vertical compaction enabled and wish to upload blocks as soon as possible without caring"+
130+
"about order.").
131+
Default("false").BoolVar(&sc.allowOutOfOrderUpload)
126132
return sc
127133
}
128134

cmd/thanos/receive.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,12 @@ func registerReceive(m map[string]setupFunc, app *kingpin.Application) {
8989

9090
walCompression := cmd.Flag("tsdb.wal-compression", "Compress the tsdb WAL.").Default("true").Bool()
9191

92+
allowOutOfOrderUpload := cmd.Flag("shipper.allow-out-of-order-uploads",
93+
"If true shipper will skip failed block uploads in given iteration and retry later. This means that some newer blocks might uploaded sooner than older."+
94+
"This will trigger compaction without those blocks, as a resulted create 'valid overlap situation'. Set it to true if you have vertical compaction enabled and wish to upload blocks as soon as possible without caring"+
95+
"about order.").
96+
Default("false").Bool()
97+
9298
m[comp.String()] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ <-chan struct{}, _ bool) error {
9399
lset, err := parseFlagLabels(*labelStrs)
94100
if err != nil {
@@ -157,6 +163,7 @@ func registerReceive(m map[string]setupFunc, app *kingpin.Application) {
157163
*replicationFactor,
158164
time.Duration(*forwardTimeout),
159165
comp,
166+
*allowOutOfOrderUpload,
160167
)
161168
}
162169
}
@@ -195,6 +202,7 @@ func runReceive(
195202
replicationFactor uint64,
196203
forwardTimeout time.Duration,
197204
comp component.SourceStoreAPI,
205+
allowOutOfOrderUpload bool,
198206
) error {
199207
logger = log.With(logger, "component", "receive")
200208
level.Warn(logger).Log("msg", "setting up receive; the Thanos receive component is EXPERIMENTAL, it may break significantly without notice")
@@ -246,6 +254,7 @@ func runReceive(
246254
lset,
247255
tenantLabelName,
248256
bkt,
257+
allowOutOfOrderUpload,
249258
)
250259
writer := receive.NewWriter(log.With(logger, "component", "receive-writer"), dbs)
251260
webHandler := receive.NewHandler(log.With(logger, "component", "receive-handler"), &receive.Options{

cmd/thanos/rule.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,12 @@ func registerRule(m map[string]setupFunc, app *kingpin.Application) {
114114
dnsSDResolver := cmd.Flag("query.sd-dns-resolver", "Resolver to use. Possible options: [golang, miekgdns]").
115115
Default("golang").Hidden().String()
116116

117+
allowOutOfOrderUpload := cmd.Flag("shipper.allow-out-of-order-uploads",
118+
"If true shipper will skip failed block uploads in given iteration and retry later. This means that some newer blocks might uploaded sooner than older."+
119+
"This will trigger compaction without those blocks, as a resulted create 'valid overlap situation'. Set it to true if you have vertical compaction enabled and wish to upload blocks as soon as possible without caring"+
120+
"about order.").
121+
Default("false").Bool()
122+
117123
m[comp.String()] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, reload <-chan struct{}, _ bool) error {
118124
lset, err := parseFlagLabels(*labelStrs)
119125
if err != nil {
@@ -197,6 +203,7 @@ func registerRule(m map[string]setupFunc, app *kingpin.Application) {
197203
time.Duration(*dnsSDInterval),
198204
*dnsSDResolver,
199205
comp,
206+
*allowOutOfOrderUpload,
200207
)
201208
}
202209
}
@@ -283,6 +290,7 @@ func runRule(
283290
dnsSDInterval time.Duration,
284291
dnsSDResolver string,
285292
comp component.Component,
293+
allowOutOfOrderUpload bool,
286294
) error {
287295
metrics := newRuleMetrics(reg)
288296

@@ -615,7 +623,7 @@ func runRule(
615623
}
616624
}()
617625

618-
s := shipper.New(logger, reg, dataDir, bkt, func() labels.Labels { return lset }, metadata.RulerSource)
626+
s := shipper.New(logger, reg, dataDir, bkt, func() labels.Labels { return lset }, metadata.RulerSource, allowOutOfOrderUpload)
619627

620628
ctx, cancel := context.WithCancel(context.Background())
621629

cmd/thanos/sidecar.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -273,9 +273,9 @@ func runSidecar(
273273

274274
var s *shipper.Shipper
275275
if conf.shipper.uploadCompacted {
276-
s = shipper.NewWithCompacted(logger, reg, conf.tsdb.path, bkt, m.Labels, metadata.SidecarSource)
276+
s = shipper.NewWithCompacted(logger, reg, conf.tsdb.path, bkt, m.Labels, metadata.SidecarSource, conf.shipper.allowOutOfOrderUpload)
277277
} else {
278-
s = shipper.New(logger, reg, conf.tsdb.path, bkt, m.Labels, metadata.SidecarSource)
278+
s = shipper.New(logger, reg, conf.tsdb.path, bkt, m.Labels, metadata.SidecarSource, conf.shipper.allowOutOfOrderUpload)
279279
}
280280

281281
return runutil.Repeat(30*time.Second, ctx.Done(), func() error {

docs/operating/troubleshooting.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ slug: /troubleshooting.md
77

88
# Troubleshooting; Common cases
99

10-
1110
## Overlaps
1211

1312
**Block overlap**: Set of blocks with exactly the same external labels in meta.json and for the same time or overlapping time period.
@@ -29,13 +28,15 @@ Checking producers log for such ULID, and checking meta.json (e.g if sample stat
2928

3029
### Reasons
3130

31+
- You are running Thanos (sidecar, ruler or receive) older than 0.13.0. During transient upload errors there was possibility to have overlaps caused by compactor not being aware of all blocks See: [this](https://github.com/thanos-io/thanos/issues/2753)
3232
- Misconfiguraiton of sidecar/ruler: Same external labels or no external labels across many block producers.
3333
- Running multiple compactors for single block "stream", even for short duration.
3434
- Manually uploading blocks to the bucket.
3535
- Eventually consistent block storage until we fully implement [RW for bucket](https://thanos.io/proposals/201901-read-write-operations-bucket.md)
3636

3737
### Solutions
3838

39+
- Upgrade sidecar, ruler and receive to 0.13.0+
3940
- Compactor can be blocked for some time, but if it is urgent. Mitigate by removing overlap or better: Backing up somewhere else (you can rename block ULID to non-ulid).
4041
- Who uploaded the block? Search for logs with this ULID across all sidecars/rulers. Check access logs to object storage. Check debug/metas or meta.json of problematic block to see how blocks looks like and what is the `source`.
4142
- Determine what you misconfigured.

pkg/receive/multitsdb.go

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,9 @@ type MultiTSDB struct {
3838
labels labels.Labels
3939
bucket objstore.Bucket
4040

41-
mtx *sync.RWMutex
42-
tenants map[string]*tenant
41+
mtx *sync.RWMutex
42+
tenants map[string]*tenant
43+
allowOutOfOrderUpload bool
4344
}
4445

4546
func NewMultiTSDB(
@@ -50,21 +51,23 @@ func NewMultiTSDB(
5051
labels labels.Labels,
5152
tenantLabelName string,
5253
bucket objstore.Bucket,
54+
allowOutOfOrderUpload bool,
5355
) *MultiTSDB {
5456
if l == nil {
5557
l = log.NewNopLogger()
5658
}
5759

5860
return &MultiTSDB{
59-
dataDir: dataDir,
60-
logger: l,
61-
reg: reg,
62-
tsdbOpts: tsdbOpts,
63-
mtx: &sync.RWMutex{},
64-
tenants: map[string]*tenant{},
65-
labels: labels,
66-
tenantLabelName: tenantLabelName,
67-
bucket: bucket,
61+
dataDir: dataDir,
62+
logger: l,
63+
reg: reg,
64+
tsdbOpts: tsdbOpts,
65+
mtx: &sync.RWMutex{},
66+
tenants: map[string]*tenant{},
67+
labels: labels,
68+
tenantLabelName: tenantLabelName,
69+
bucket: bucket,
70+
allowOutOfOrderUpload: allowOutOfOrderUpload,
6871
}
6972
}
7073

@@ -256,6 +259,7 @@ func (t *MultiTSDB) getOrLoadTenant(tenantID string, blockingStart bool) (*tenan
256259
t.bucket,
257260
func() labels.Labels { return lbls },
258261
metadata.ReceiveSource,
262+
t.allowOutOfOrderUpload,
259263
)
260264
}
261265

pkg/receive/multitsdb_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ func TestMultiTSDB(t *testing.T) {
4343
labels.FromStrings("replica", "01"),
4444
"tenant_id",
4545
nil,
46+
false,
4647
)
4748
defer testutil.Ok(t, m.Flush())
4849

@@ -109,6 +110,7 @@ func TestMultiTSDB(t *testing.T) {
109110
labels.FromStrings("replica", "01"),
110111
"tenant_id",
111112
nil,
113+
false,
112114
)
113115
defer testutil.Ok(t, m.Flush())
114116

0 commit comments

Comments
 (0)