Skip to content

Commit 3bf1397

Browse files
authored
shipper: Be strict about upload order unless it's specified so & cut v0.13.0-rc.2 (#2765)
* shipper: Be strict about upload order unless it's specified so. Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com> * Cut 0.13.0-rc.2 Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>
1 parent f06c13a commit 3bf1397

File tree

13 files changed

+125
-101
lines changed

13 files changed

+125
-101
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel
1111

1212
## Unreleased
1313

14-
## [v0.13.0](https://github.com/thanos-io/thanos/releases/tag/v0.13.0) - 2020.06.15
14+
## [v0.13.0-rc.2](https://github.com/thanos-io/thanos/releases/tag/v0.13.0-rc.2) - 2020.06.15
1515

1616
### Fixed
1717

@@ -26,6 +26,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel
2626
- [#2416](https://github.com/thanos-io/thanos/pull/2416) Bucket: Fixed issue #2416 bug in `inspect --sort-by` doesn't work correctly in all cases.
2727
- [#2719](https://github.com/thanos-io/thanos/pull/2719) Query: `irate` and `resets` use now counter downsampling aggregations.
2828
- [#2705](https://github.com/thanos-io/thanos/pull/2705) minio-go: Added support for `af-south-1` and `eu-south-1` regions.
29+
- [#2753](https://github.com/thanos-io/thanos/issues/2753) Sidecar, Receive, Rule: Fixed possibility of out of order uploads in error cases. This could potentially cause Compactor to create overlapping blocks.
2930

3031
### Added
3132

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.13.0
1+
0.13.0-rc.2

cmd/thanos/config.go

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -112,17 +112,23 @@ func (rc *reloaderConfig) registerFlag(cmd *kingpin.CmdClause) *reloaderConfig {
112112
}
113113

114114
type shipperConfig struct {
115-
uploadCompacted bool
116-
ignoreBlockSize bool
115+
uploadCompacted bool
116+
ignoreBlockSize bool
117+
allowOutOfOrderUpload bool
117118
}
118119

119120
func (sc *shipperConfig) registerFlag(cmd *kingpin.CmdClause) *shipperConfig {
120121
cmd.Flag("shipper.upload-compacted",
121-
"If true sidecar will try to upload compacted blocks as well. Useful for migration purposes. Works only if compaction is disabled on Prometheus. Do it once and then disable the flag when done.").
122+
"If true shipper will try to upload compacted blocks as well. Useful for migration purposes. Works only if compaction is disabled on Prometheus. Do it once and then disable the flag when done.").
122123
Default("false").BoolVar(&sc.uploadCompacted)
123124
cmd.Flag("shipper.ignore-unequal-block-size",
124-
"If true sidecar will not require prometheus min and max block size flags to be set to the same value. Only use this if you want to keep long retention and compaction enabled on your Prometheus instance, as in the worst case it can result in ~2h data loss for your Thanos bucket storage.").
125+
"If true shipper will not require prometheus min and max block size flags to be set to the same value. Only use this if you want to keep long retention and compaction enabled on your Prometheus instance, as in the worst case it can result in ~2h data loss for your Thanos bucket storage.").
125126
Default("false").Hidden().BoolVar(&sc.ignoreBlockSize)
127+
cmd.Flag("shipper.allow-out-of-order-uploads",
128+
"If true, shipper will skip failed block uploads in the given iteration and retry later. This means that some newer blocks might be uploaded sooner than older blocks. "+
129+
"This can trigger compaction without those blocks and as a result will create an overlap situation. Set it to true if you have vertical compaction enabled and wish to upload blocks as soon as possible without caring "+
130+
"about order.").
131+
Default("false").Hidden().BoolVar(&sc.allowOutOfOrderUpload)
126132
return sc
127133
}
128134

cmd/thanos/receive.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,12 @@ func registerReceive(m map[string]setupFunc, app *kingpin.Application) {
8989

9090
walCompression := cmd.Flag("tsdb.wal-compression", "Compress the tsdb WAL.").Default("true").Bool()
9191

92+
allowOutOfOrderUpload := cmd.Flag("shipper.allow-out-of-order-uploads",
93+
"If true, shipper will skip failed block uploads in the given iteration and retry later. This means that some newer blocks might be uploaded sooner than older blocks. "+
94+
"This can trigger compaction without those blocks and as a result will create an overlap situation. Set it to true if you have vertical compaction enabled and wish to upload blocks as soon as possible without caring "+
95+
"about order.").
96+
Default("false").Hidden().Bool()
97+
9298
m[comp.String()] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ <-chan struct{}, _ bool) error {
9399
lset, err := parseFlagLabels(*labelStrs)
94100
if err != nil {
@@ -157,6 +163,7 @@ func registerReceive(m map[string]setupFunc, app *kingpin.Application) {
157163
*replicationFactor,
158164
time.Duration(*forwardTimeout),
159165
comp,
166+
*allowOutOfOrderUpload,
160167
)
161168
}
162169
}
@@ -195,6 +202,7 @@ func runReceive(
195202
replicationFactor uint64,
196203
forwardTimeout time.Duration,
197204
comp component.SourceStoreAPI,
205+
allowOutOfOrderUpload bool,
198206
) error {
199207
logger = log.With(logger, "component", "receive")
200208
level.Warn(logger).Log("msg", "setting up receive; the Thanos receive component is EXPERIMENTAL, it may break significantly without notice")
@@ -246,6 +254,7 @@ func runReceive(
246254
lset,
247255
tenantLabelName,
248256
bkt,
257+
allowOutOfOrderUpload,
249258
)
250259
writer := receive.NewWriter(log.With(logger, "component", "receive-writer"), dbs)
251260
webHandler := receive.NewHandler(log.With(logger, "component", "receive-handler"), &receive.Options{

cmd/thanos/rule.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,12 @@ func registerRule(m map[string]setupFunc, app *kingpin.Application) {
114114
dnsSDResolver := cmd.Flag("query.sd-dns-resolver", "Resolver to use. Possible options: [golang, miekgdns]").
115115
Default("golang").Hidden().String()
116116

117+
allowOutOfOrderUpload := cmd.Flag("shipper.allow-out-of-order-uploads",
118+
"If true, shipper will skip failed block uploads in the given iteration and retry later. This means that some newer blocks might be uploaded sooner than older blocks. "+
119+
"This can trigger compaction without those blocks and as a result will create an overlap situation. Set it to true if you have vertical compaction enabled and wish to upload blocks as soon as possible without caring "+
120+
"about order.").
121+
Default("false").Hidden().Bool()
122+
117123
m[comp.String()] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, reload <-chan struct{}, _ bool) error {
118124
lset, err := parseFlagLabels(*labelStrs)
119125
if err != nil {
@@ -197,6 +203,7 @@ func registerRule(m map[string]setupFunc, app *kingpin.Application) {
197203
time.Duration(*dnsSDInterval),
198204
*dnsSDResolver,
199205
comp,
206+
*allowOutOfOrderUpload,
200207
)
201208
}
202209
}
@@ -283,6 +290,7 @@ func runRule(
283290
dnsSDInterval time.Duration,
284291
dnsSDResolver string,
285292
comp component.Component,
293+
allowOutOfOrderUpload bool,
286294
) error {
287295
metrics := newRuleMetrics(reg)
288296

@@ -615,7 +623,7 @@ func runRule(
615623
}
616624
}()
617625

618-
s := shipper.New(logger, reg, dataDir, bkt, func() labels.Labels { return lset }, metadata.RulerSource)
626+
s := shipper.New(logger, reg, dataDir, bkt, func() labels.Labels { return lset }, metadata.RulerSource, allowOutOfOrderUpload)
619627

620628
ctx, cancel := context.WithCancel(context.Background())
621629

cmd/thanos/sidecar.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -273,9 +273,9 @@ func runSidecar(
273273

274274
var s *shipper.Shipper
275275
if conf.shipper.uploadCompacted {
276-
s = shipper.NewWithCompacted(logger, reg, conf.tsdb.path, bkt, m.Labels, metadata.SidecarSource)
276+
s = shipper.NewWithCompacted(logger, reg, conf.tsdb.path, bkt, m.Labels, metadata.SidecarSource, conf.shipper.allowOutOfOrderUpload)
277277
} else {
278-
s = shipper.New(logger, reg, conf.tsdb.path, bkt, m.Labels, metadata.SidecarSource)
278+
s = shipper.New(logger, reg, conf.tsdb.path, bkt, m.Labels, metadata.SidecarSource, conf.shipper.allowOutOfOrderUpload)
279279
}
280280

281281
return runutil.Repeat(30*time.Second, ctx.Done(), func() error {

docs/components/sidecar.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ Flags:
151151
details:
152152
https://thanos.io/storage.md/#configuration
153153
--shipper.upload-compacted
154-
If true sidecar will try to upload compacted
154+
If true shipper will try to upload compacted
155155
blocks as well. Useful for migration purposes.
156156
Works only if compaction is disabled on
157157
Prometheus. Do it once and then disable the

docs/operating/troubleshooting.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ slug: /troubleshooting.md
77

88
# Troubleshooting; Common cases
99

10-
1110
## Overlaps
1211

1312
**Block overlap**: Set of blocks with exactly the same external labels in meta.json and for the same time or overlapping time period.
@@ -29,13 +28,15 @@ Checking producers log for such ULID, and checking meta.json (e.g if sample stat
2928

3029
### Reasons
3130

31+
- You are running Thanos (sidecar, ruler or receive) older than 0.13.0. During transient upload errors there is a possibility to have overlaps caused by the compactor not being aware of all blocks. See: [this](https://github.com/thanos-io/thanos/issues/2753)
3232
- Misconfiguration of sidecar/ruler: Same external labels or no external labels across many block producers.
3333
- Running multiple compactors for single block "stream", even for short duration.
3434
- Manually uploading blocks to the bucket.
3535
- Eventually consistent block storage until we fully implement [RW for bucket](https://thanos.io/proposals/201901-read-write-operations-bucket.md)
3636

3737
### Solutions
3838

39+
- Upgrade sidecar, ruler and receive to 0.13.0+
3940
- Compactor can be blocked for some time, but if it is urgent. Mitigate by removing overlap or better: Backing up somewhere else (you can rename block ULID to non-ulid).
4041
- Who uploaded the block? Search for logs with this ULID across all sidecars/rulers. Check access logs to object storage. Check debug/metas or meta.json of problematic block to see how blocks looks like and what is the `source`.
4142
- Determine what you misconfigured.

pkg/receive/multitsdb.go

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,9 @@ type MultiTSDB struct {
3838
labels labels.Labels
3939
bucket objstore.Bucket
4040

41-
mtx *sync.RWMutex
42-
tenants map[string]*tenant
41+
mtx *sync.RWMutex
42+
tenants map[string]*tenant
43+
allowOutOfOrderUpload bool
4344
}
4445

4546
func NewMultiTSDB(
@@ -50,21 +51,23 @@ func NewMultiTSDB(
5051
labels labels.Labels,
5152
tenantLabelName string,
5253
bucket objstore.Bucket,
54+
allowOutOfOrderUpload bool,
5355
) *MultiTSDB {
5456
if l == nil {
5557
l = log.NewNopLogger()
5658
}
5759

5860
return &MultiTSDB{
59-
dataDir: dataDir,
60-
logger: l,
61-
reg: reg,
62-
tsdbOpts: tsdbOpts,
63-
mtx: &sync.RWMutex{},
64-
tenants: map[string]*tenant{},
65-
labels: labels,
66-
tenantLabelName: tenantLabelName,
67-
bucket: bucket,
61+
dataDir: dataDir,
62+
logger: l,
63+
reg: reg,
64+
tsdbOpts: tsdbOpts,
65+
mtx: &sync.RWMutex{},
66+
tenants: map[string]*tenant{},
67+
labels: labels,
68+
tenantLabelName: tenantLabelName,
69+
bucket: bucket,
70+
allowOutOfOrderUpload: allowOutOfOrderUpload,
6871
}
6972
}
7073

@@ -256,6 +259,7 @@ func (t *MultiTSDB) getOrLoadTenant(tenantID string, blockingStart bool) (*tenan
256259
t.bucket,
257260
func() labels.Labels { return lbls },
258261
metadata.ReceiveSource,
262+
t.allowOutOfOrderUpload,
259263
)
260264
}
261265

pkg/receive/multitsdb_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ func TestMultiTSDB(t *testing.T) {
4343
labels.FromStrings("replica", "01"),
4444
"tenant_id",
4545
nil,
46+
false,
4647
)
4748
defer testutil.Ok(t, m.Flush())
4849

@@ -109,6 +110,7 @@ func TestMultiTSDB(t *testing.T) {
109110
labels.FromStrings("replica", "01"),
110111
"tenant_id",
111112
nil,
113+
false,
112114
)
113115
defer testutil.Ok(t, m.Flush())
114116

0 commit comments

Comments
 (0)