diff --git a/CHANGELOG.md b/CHANGELOG.md index d64e6410890..90a7713d05f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,19 @@ We use *breaking* word for marking changes that are not backward compatible (rel ### Fixed +- [#2501](https://github.com/thanos-io/thanos/pull/2501) Query: gracefully handle additional fields in `SeriesResponse` protobuf message that may be added in the future. + +## [v0.12.1](https://github.com/thanos-io/thanos/releases/tag/v0.12.1) - 2020.04.20 + +### Fixed + +- [#2411](https://github.com/thanos-io/thanos/pull/2411) Query: fix a bug where queries might not time out sometimes due to issues with one or more StoreAPIs. +- [#2474](https://github.com/thanos-io/thanos/pull/2474) Store: fix a panic caused by concurrent memory access during block filtering. +- [#2472](https://github.com/thanos-io/thanos/pull/2472) Compact: fix a bug where partial blocks were never deleted, causing spam of warnings. +- [#2484](https://github.com/thanos-io/thanos/pull/2484) Query/Ruler: fix issue #2483, when web.route-prefix is set, it is added twice in HTTP router prefix. + +### Fixed + - [#2416](https://github.com/thanos-io/thanos/pull/2416) Bucket: fixes issue #2416 bug in `inspect --sort-by` doesn't work correctly in all cases - [#2411](https://github.com/thanos-io/thanos/pull/2411) Query: fix a bug where queries might not time out sometimes due to issues with one or more StoreAPIs diff --git a/VERSION b/VERSION index ac454c6a1fc..34a83616bb5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.12.0 +0.12.1 diff --git a/cmd/thanos/compact.go b/cmd/thanos/compact.go index 5e297632df7..fb1b2b207d7 100644 --- a/cmd/thanos/compact.go +++ b/cmd/thanos/compact.go @@ -420,7 +420,7 @@ func runCompact( } // No need to resync before partial uploads and delete marked blocks. Last sync should be valid. - compact.BestEffortCleanAbortedPartialUploads(ctx, logger, sy.Partial(), bkt, partialUploadDeleteAttempts, blocksMarkedForDeletion) + compact.BestEffortCleanAbortedPartialUploads(ctx, logger, sy.Partial(), bkt, partialUploadDeleteAttempts, blocksCleaned, blockCleanupFailures) if err := blocksCleaner.DeleteMarkedBlocks(ctx); err != nil { return errors.Wrap(err, "error cleaning blocks") } diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index a21565c59a4..2735cd76c20 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -8,7 +8,6 @@ import ( "fmt" "math" "net/http" - "path" "strings" "time" @@ -357,7 +356,7 @@ func runQuery( api := v1.NewAPI(logger, reg, engine, queryableCreator, enableAutodownsampling, enablePartialResponse, replicaLabels, instantDefaultMaxSourceResolution) - api.Register(router.WithPrefix(path.Join(webRoutePrefix, "/api/v1")), tracer, logger, ins) + api.Register(router.WithPrefix("/api/v1"), tracer, logger, ins) srv := httpserver.New(logger, reg, comp, httpProbe, httpserver.WithListen(httpBindAddr), diff --git a/cmd/thanos/rule.go b/cmd/thanos/rule.go index 3c021dec2b6..33889f7c996 100644 --- a/cmd/thanos/rule.go +++ b/cmd/thanos/rule.go @@ -8,7 +8,6 @@ import ( "math/rand" "net/http" "net/url" - "path" "path/filepath" "strconv" "strings" @@ -576,7 +575,7 @@ func runRule( ui.NewRuleUI(logger, reg, ruleMgr, alertQueryURL.String(), webExternalPrefix, webPrefixHeaderName).Register(router, ins) api := v1.NewAPI(logger, reg, ruleMgr) - api.Register(router.WithPrefix(path.Join(webRoutePrefix, "/api/v1")), tracer, logger, ins) + api.Register(router.WithPrefix("/api/v1"), tracer, logger, ins) srv := httpserver.New(logger, reg, comp, httpProbe, httpserver.WithListen(httpBindAddr), diff --git a/docs/release-process.md b/docs/release-process.md index c22074d1af1..5beb423c661 100644 --- a/docs/release-process.md +++ b/docs/release-process.md @@ -33,7 +33,7 @@ Release shepherd responsibilities: | Release | Time of first RC | Shepherd (GitHub handle) | |-----------|--------------------------|--------------------------| | v0.13.0 | (planned) 2020.05.13 | `TBD` | -| v0.12.0 | (planned) 2020.04.01 | `@squat` | +| v0.12.0 | 2020.04.15 | `@squat` | | v0.11.0 | 2020.02.19 | `@metalmatze` | | v0.10.0 | 2020.01.08 | `@GiedriusS` | | v0.9.0 | 2019.11.26 | `@bwplotka` | diff --git a/pkg/block/fetcher.go b/pkg/block/fetcher.go index 7f21d4d8016..29d67735e2a 100644 --- a/pkg/block/fetcher.go +++ b/pkg/block/fetcher.go @@ -550,6 +550,7 @@ var _ MetadataFilter = &DeduplicateFilter{} // Not go-routine safe. type DeduplicateFilter struct { duplicateIDs []ulid.ULID + mu sync.Mutex } // NewDeduplicateFilter creates DeduplicateFilter. @@ -603,11 +604,13 @@ func (f *DeduplicateFilter) filterForResolution(root *Node, metaSlice []*metadat duplicateULIDs := getNonRootIDs(root) for _, id := range duplicateULIDs { + f.mu.Lock() if metas[id] != nil { f.duplicateIDs = append(f.duplicateIDs, id) } synced.WithLabelValues(duplicateMeta).Inc() delete(metas, id) + f.mu.Unlock() } } diff --git a/pkg/compact/clean.go b/pkg/compact/clean.go index 53cfb169083..8d99927d515 100644 --- a/pkg/compact/clean.go +++ b/pkg/compact/clean.go @@ -27,7 +27,8 @@ func BestEffortCleanAbortedPartialUploads( partial map[ulid.ULID]error, bkt objstore.Bucket, deleteAttempts prometheus.Counter, - blocksMarkedForDeletion prometheus.Counter, + blockCleanups prometheus.Counter, + blockCleanupFailures prometheus.Counter, ) { level.Info(logger).Log("msg", "started cleaning of aborted partial uploads") @@ -45,10 +46,15 @@ func BestEffortCleanAbortedPartialUploads( deleteAttempts.Inc() level.Info(logger).Log("msg", "found partially uploaded block; marking for deletion", "block", id) - if err := block.MarkForDeletion(ctx, logger, bkt, id, blocksMarkedForDeletion); err != nil { - level.Warn(logger).Log("msg", "failed to delete aborted partial upload; skipping", "block", id, "thresholdAge", PartialUploadThresholdAge, "err", err) - return + // We don't gather any information about deletion marks for partial blocks, so let's simply remove it. We waited + // long PartialUploadThresholdAge already. + // TODO(bwplotka): Fix some edge cases: https://github.com/thanos-io/thanos/issues/2470 . + if err := block.Delete(ctx, logger, bkt, id); err != nil { + blockCleanupFailures.Inc() + level.Warn(logger).Log("msg", "failed to delete aborted partial upload; will retry in next iteration", "block", id, "thresholdAge", PartialUploadThresholdAge, "err", err) + continue } + blockCleanups.Inc() level.Info(logger).Log("msg", "deleted aborted partial upload", "block", id, "thresholdAge", PartialUploadThresholdAge) } level.Info(logger).Log("msg", "cleaning of aborted partial uploads done") diff --git a/pkg/compact/clean_test.go b/pkg/compact/clean_test.go index 1321c888ba6..ea0f840c696 100644 --- a/pkg/compact/clean_test.go +++ b/pkg/compact/clean_test.go @@ -59,22 +59,19 @@ func TestBestEffortCleanAbortedPartialUploads(t *testing.T) { testutil.Ok(t, bkt.Upload(ctx, path.Join(shouldIgnoreID2.String(), "chunks", "000001"), &fakeChunk)) deleteAttempts := promauto.With(nil).NewCounter(prometheus.CounterOpts{}) - blocksMarkedForDeletion := promauto.With(nil).NewCounter(prometheus.CounterOpts{}) - + blockCleanups := promauto.With(nil).NewCounter(prometheus.CounterOpts{}) + blockCleanupFailures := promauto.With(nil).NewCounter(prometheus.CounterOpts{}) _, partial, err := metaFetcher.Fetch(ctx) testutil.Ok(t, err) - BestEffortCleanAbortedPartialUploads(ctx, logger, partial, bkt, deleteAttempts, blocksMarkedForDeletion) + BestEffortCleanAbortedPartialUploads(ctx, logger, partial, bkt, deleteAttempts, blockCleanups, blockCleanupFailures) testutil.Equals(t, 1.0, promtest.ToFloat64(deleteAttempts)) + testutil.Equals(t, 1.0, promtest.ToFloat64(blockCleanups)) + testutil.Equals(t, 0.0, promtest.ToFloat64(blockCleanupFailures)) exists, err := bkt.Exists(ctx, path.Join(shouldDeleteID.String(), "chunks", "000001")) testutil.Ok(t, err) - testutil.Equals(t, true, exists) - - exists, err = bkt.Exists(ctx, path.Join(shouldDeleteID.String(), metadata.DeletionMarkFilename)) - testutil.Ok(t, err) - testutil.Equals(t, true, exists) - testutil.Equals(t, 1.0, promtest.ToFloat64(blocksMarkedForDeletion)) + testutil.Equals(t, false, exists) exists, err = bkt.Exists(ctx, path.Join(shouldIgnoreID1.String(), "chunks", "000001")) testutil.Ok(t, err) diff --git a/pkg/query/querier.go b/pkg/query/querier.go index 898db30fc1b..0299374d3fe 100644 --- a/pkg/query/querier.go +++ b/pkg/query/querier.go @@ -124,10 +124,12 @@ func (s *seriesServer) Send(r *storepb.SeriesResponse) error { return nil } - if r.GetSeries() == nil { - return errors.New("no seriesSet") + if r.GetSeries() != nil { + s.seriesSet = append(s.seriesSet, *r.GetSeries()) + return nil } - s.seriesSet = append(s.seriesSet, *r.GetSeries()) + + // Unsupported field, skip. return nil } diff --git a/pkg/store/proxy.go b/pkg/store/proxy.go index c071848c051..36da2e62fdd 100644 --- a/pkg/store/proxy.go +++ b/pkg/store/proxy.go @@ -432,13 +432,15 @@ func startStreamSeriesSet( if w := rr.r.GetWarning(); w != "" { s.warnCh.send(storepb.NewWarnSeriesResponse(errors.New(w))) - continue } - select { - case s.recvCh <- rr.r.GetSeries(): - case <-ctx.Done(): - s.handleErr(errors.Wrapf(ctx.Err(), "failed to receive any data from %s", s.name), done) - return + + if series := rr.r.GetSeries(); series != nil { + select { + case s.recvCh <- series: + case <-ctx.Done(): + s.handleErr(errors.Wrapf(ctx.Err(), "failed to receive any data from %s", s.name), done) + return + } } } }() diff --git a/pkg/store/proxy_test.go b/pkg/store/proxy_test.go index 8e8b6022d4e..32416d8b292 100644 --- a/pkg/store/proxy_test.go +++ b/pkg/store/proxy_test.go @@ -1379,10 +1379,12 @@ func (s *storeSeriesServer) Send(r *storepb.SeriesResponse) error { return nil } - if r.GetSeries() == nil { - return errors.New("no seriesSet") + if r.GetSeries() != nil { + s.SeriesSet = append(s.SeriesSet, *r.GetSeries()) + return nil } - s.SeriesSet = append(s.SeriesSet, *r.GetSeries()) + + // Unsupported field, skip. return nil } diff --git a/test/e2e/compact_test.go b/test/e2e/compact_test.go index 5e4b601dcff..9f4dd0d65fb 100644 --- a/test/e2e/compact_test.go +++ b/test/e2e/compact_test.go @@ -4,7 +4,9 @@ package e2e_test import ( + "bytes" "context" + "encoding/json" "fmt" "net/http" "os" @@ -17,9 +19,13 @@ import ( e2edb "github.com/cortexproject/cortex/integration/e2e/db" "github.com/go-kit/kit/log" "github.com/oklog/ulid" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" "github.com/prometheus/common/model" "github.com/prometheus/prometheus/pkg/labels" "github.com/prometheus/prometheus/pkg/timestamp" + "github.com/thanos-io/thanos/pkg/block" + "github.com/thanos-io/thanos/pkg/block/metadata" "github.com/thanos-io/thanos/pkg/objstore" "github.com/thanos-io/thanos/pkg/objstore/client" "github.com/thanos-io/thanos/pkg/objstore/s3" @@ -29,18 +35,26 @@ import ( "github.com/thanos-io/thanos/test/e2e/e2ethanos" ) +type blockDesc struct { + series []labels.Labels + extLset labels.Labels + mint int64 + maxt int64 +} + +func (b *blockDesc) Create(ctx context.Context, dir string, delay time.Duration) (ulid.ULID, error) { + if delay == 0*time.Second { + return e2eutil.CreateBlock(ctx, dir, b.series, 120, b.mint, b.maxt, b.extLset, 0) + } + return e2eutil.CreateBlockWithBlockDelay(ctx, dir, b.series, 120, b.mint, b.maxt, delay, b.extLset, 0) +} + func TestCompactWithStoreGateway(t *testing.T) { t.Parallel() - l := log.NewLogfmtLogger(os.Stdout) - type blockDesc struct { - series []labels.Labels - extLset labels.Labels - mint int64 - maxt int64 - } + logger := log.NewLogfmtLogger(os.Stdout) - delay := 30 * time.Minute + justAfterConsistencyDelay := 30 * time.Minute // Make sure to take realistic timestamp for start. This is to align blocks as if they would be aligned on Prometheus. // To have deterministic compaction, let's have fixed date: now, err := time.Parse(time.RFC3339, "2020-03-24T08:00:00Z") @@ -266,7 +280,7 @@ func TestCompactWithStoreGateway(t *testing.T) { m := e2edb.NewMinio(8080, bucket) testutil.Ok(t, s.StartAndWaitReady(m)) - bkt, err := s3.NewBucketWithConfig(l, s3.Config{ + bkt, err := s3.NewBucketWithConfig(logger, s3.Config{ Bucket: bucket, AccessKey: e2edb.MinioAccessKey, SecretKey: e2edb.MinioSecretKey, @@ -280,11 +294,73 @@ func TestCompactWithStoreGateway(t *testing.T) { rawBlockIDs := map[ulid.ULID]struct{}{} for _, b := range blocks { - id, err := e2eutil.CreateBlockWithBlockDelay(ctx, dir, b.series, 120, b.mint, b.maxt, delay, b.extLset, 0) + id, err := b.Create(ctx, dir, justAfterConsistencyDelay) testutil.Ok(t, err) - testutil.Ok(t, objstore.UploadDir(ctx, l, bkt, path.Join(dir, id.String()), id.String())) + testutil.Ok(t, objstore.UploadDir(ctx, logger, bkt, path.Join(dir, id.String()), id.String())) rawBlockIDs[id] = struct{}{} } + { + // On top of that, add couple of other tricky cases with different meta. + malformedBase := blockDesc{ + series: []labels.Labels{labels.FromStrings("a", "1", "b", "2")}, + extLset: labels.FromStrings("case", "malformed-things", "replica", "101"), + mint: timestamp.FromTime(now), + maxt: timestamp.FromTime(now.Add(2 * time.Hour)), + } + + // New Partial block. + id, err := malformedBase.Create(ctx, dir, 0*time.Second) + testutil.Ok(t, err) + testutil.Ok(t, os.Remove(path.Join(dir, id.String(), metadata.MetaFilename))) + testutil.Ok(t, objstore.UploadDir(ctx, logger, bkt, path.Join(dir, id.String()), id.String())) + + // New Partial block + deletion mark. + id, err = malformedBase.Create(ctx, dir, 0*time.Second) + testutil.Ok(t, err) + testutil.Ok(t, os.Remove(path.Join(dir, id.String(), metadata.MetaFilename))) + testutil.Ok(t, block.MarkForDeletion(ctx, logger, bkt, id, promauto.With(nil).NewCounter(prometheus.CounterOpts{}))) + testutil.Ok(t, objstore.UploadDir(ctx, logger, bkt, path.Join(dir, id.String()), id.String())) + + // Partial block after consistency delay. + id, err = malformedBase.Create(ctx, dir, justAfterConsistencyDelay) + testutil.Ok(t, err) + testutil.Ok(t, os.Remove(path.Join(dir, id.String(), metadata.MetaFilename))) + testutil.Ok(t, objstore.UploadDir(ctx, logger, bkt, path.Join(dir, id.String()), id.String())) + + // Partial block after consistency delay + deletion mark. + id, err = malformedBase.Create(ctx, dir, justAfterConsistencyDelay) + testutil.Ok(t, err) + testutil.Ok(t, os.Remove(path.Join(dir, id.String(), metadata.MetaFilename))) + testutil.Ok(t, block.MarkForDeletion(ctx, logger, bkt, id, promauto.With(nil).NewCounter(prometheus.CounterOpts{}))) + testutil.Ok(t, objstore.UploadDir(ctx, logger, bkt, path.Join(dir, id.String()), id.String())) + + // Partial block after consistency delay + old deletion mark ready to be deleted. + id, err = malformedBase.Create(ctx, dir, justAfterConsistencyDelay) + testutil.Ok(t, err) + testutil.Ok(t, os.Remove(path.Join(dir, id.String(), metadata.MetaFilename))) + deletionMark, err := json.Marshal(metadata.DeletionMark{ + ID: id, + // Deletion threshold is usually 2 days. + DeletionTime: time.Now().Add(-50 * time.Hour).Unix(), + Version: metadata.DeletionMarkVersion1, + }) + testutil.Ok(t, err) + testutil.Ok(t, bkt.Upload(ctx, path.Join(id.String(), metadata.DeletionMarkFilename), bytes.NewBuffer(deletionMark))) + testutil.Ok(t, objstore.UploadDir(ctx, logger, bkt, path.Join(dir, id.String()), id.String())) + + // Partial block after delete threshold. + id, err = malformedBase.Create(ctx, dir, 50*time.Hour) + testutil.Ok(t, err) + testutil.Ok(t, os.Remove(path.Join(dir, id.String(), metadata.MetaFilename))) + testutil.Ok(t, objstore.UploadDir(ctx, logger, bkt, path.Join(dir, id.String()), id.String())) + + // Partial block after delete threshold + deletion mark. + id, err = malformedBase.Create(ctx, dir, 50*time.Hour) + testutil.Ok(t, err) + testutil.Ok(t, os.Remove(path.Join(dir, id.String(), metadata.MetaFilename))) + testutil.Ok(t, block.MarkForDeletion(ctx, logger, bkt, id, promauto.With(nil).NewCounter(prometheus.CounterOpts{}))) + testutil.Ok(t, objstore.UploadDir(ctx, logger, bkt, path.Join(dir, id.String()), id.String())) + } svcConfig := client.BucketConfig{ Type: client.S3, @@ -299,7 +375,7 @@ func TestCompactWithStoreGateway(t *testing.T) { str, err := e2ethanos.NewStoreGW(s.SharedDir(), "1", svcConfig) testutil.Ok(t, err) testutil.Ok(t, s.StartAndWaitReady(str)) - testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(float64(len(rawBlockIDs))), "thanos_blocks_meta_synced")) + testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(float64(len(rawBlockIDs)+7)), "thanos_blocks_meta_synced")) testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(0), "thanos_blocks_meta_sync_failures_total")) testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(0), "thanos_blocks_meta_modified")) @@ -349,12 +425,50 @@ func TestCompactWithStoreGateway(t *testing.T) { {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "5", "case": "full-replica-overlap-dedup-ready", "replica": "1"}}, }, ) + // Store view: + testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(float64(len(rawBlockIDs)+7)), "thanos_blocks_meta_synced")) + testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(0), "thanos_blocks_meta_sync_failures_total")) + testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(0), "thanos_blocks_meta_modified")) + + expectedEndVector := model.Vector{ + // NOTE(bwplotka): Even after deduplication some series has still replica labels. This is because those blocks did not overlap yet with anything. + // This is fine as querier deduplication will remove it if needed. + {Value: 360, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "2", "case": "no-compaction", "replica": "1"}}, + {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "2", "case": "compaction-ready"}}, + {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "3", "case": "compaction-ready"}}, + {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "4", "case": "compaction-ready"}}, + {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "5", "case": "compaction-ready"}}, + {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "6", "case": "compaction-ready", "replica": "1"}}, + {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "2", "case": "compaction-ready-after-dedup"}}, + {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "3", "case": "compaction-ready-after-dedup"}}, + {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "4", "case": "compaction-ready-after-dedup"}}, + {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "5", "case": "compaction-ready-after-dedup"}}, + {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "6", "case": "compaction-ready-after-dedup", "replica": "1"}}, + {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "1", "case": "a-partial-overlap-dedup-ready"}}, + {Value: 360, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "2", "case": "a-partial-overlap-dedup-ready"}}, + {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "2", "case": "a-partial-overlap-dedup-ready", "replica": "1"}}, + {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "3", "case": "a-partial-overlap-dedup-ready"}}, + {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "4", "case": "a-partial-overlap-dedup-ready"}}, + {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "5", "case": "a-partial-overlap-dedup-ready", "replica": "1"}}, + {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "1", "case": "partial-multi-replica-overlap-dedup-ready"}}, + {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "2", "case": "partial-multi-replica-overlap-dedup-ready", "replica": "1", "rule_replica": "1"}}, + {Value: 240, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "2", "case": "partial-multi-replica-overlap-dedup-ready"}}, + {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "3", "case": "partial-multi-replica-overlap-dedup-ready"}}, + {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "4", "case": "partial-multi-replica-overlap-dedup-ready"}}, + {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "5", "case": "partial-multi-replica-overlap-dedup-ready", "replica": "1", "rule_replica": "1"}}, + {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "1", "case": "full-replica-overlap-dedup-ready"}}, + {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "2", "case": "full-replica-overlap-dedup-ready"}}, + {Value: 240, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "2", "case": "full-replica-overlap-dedup-ready", "replica": "1"}}, + {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "3", "case": "full-replica-overlap-dedup-ready"}}, + {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "4", "case": "full-replica-overlap-dedup-ready", "replica": "1"}}, + {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "5", "case": "full-replica-overlap-dedup-ready", "replica": "1"}}, + } t.Run("no replica label with overlaps should halt compactor", func(t *testing.T) { c, err := e2ethanos.NewCompactor(s.SharedDir(), "expect-to-halt", svcConfig, nil) testutil.Ok(t, err) testutil.Ok(t, s.StartAndWaitReady(c)) - testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(float64(len(rawBlockIDs))), "thanos_blocks_meta_synced")) + testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(float64(len(rawBlockIDs)+7)), "thanos_blocks_meta_synced")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_blocks_meta_sync_failures_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_blocks_meta_modified")) @@ -364,7 +478,9 @@ func TestCompactWithStoreGateway(t *testing.T) { // We expect no ops. testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compactor_iterations_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compactor_blocks_cleaned_total")) + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compactor_block_cleanup_failures_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compactor_blocks_marked_for_deletion_total")) + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compactor_aborted_partial_uploads_deletion_attempts_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compact_group_compactions_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compact_group_vertical_compactions_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(1), "thanos_compact_group_compactions_failures_total")) @@ -380,7 +496,8 @@ func TestCompactWithStoreGateway(t *testing.T) { testutil.Ok(t, s.Stop(c)) }) - t.Run("native vertical deduplication should kick in", func(t *testing.T) { + t.Run("dedup enabled; compactor should work as expected", func(t *testing.T) { + // We expect 2x 4-block compaction, 2-block vertical compaction, 2x 3-block compaction. c, err := e2ethanos.NewCompactor(s.SharedDir(), "working", svcConfig, nil, "--deduplication.replica-label=replica", "--deduplication.replica-label=rule_replica") testutil.Ok(t, err) testutil.Ok(t, s.StartAndWaitReady(c)) @@ -388,10 +505,10 @@ func TestCompactWithStoreGateway(t *testing.T) { // NOTE: We cannot assert on intermediate `thanos_blocks_meta_` metrics as those are gauge and change dynamically due to many // compaction groups. Wait for at least first compaction iteration (next is in 5m). testutil.Ok(t, c.WaitSumMetrics(e2e.Greater(0), "thanos_compactor_iterations_total")) - - testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(16), "thanos_compactor_blocks_cleaned_total")) - testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(16), "thanos_compactor_blocks_marked_for_deletion_total")) - + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(2), "thanos_compactor_blocks_cleaned_total")) + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compactor_block_cleanup_failures_total")) + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(2*4+2+2*3), "thanos_compactor_blocks_marked_for_deletion_total")) + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(2), "thanos_compactor_aborted_partial_uploads_deletion_attempts_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(5), "thanos_compact_group_compactions_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(3), "thanos_compact_group_vertical_compactions_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compact_group_compactions_failures_total")) @@ -401,8 +518,11 @@ func TestCompactWithStoreGateway(t *testing.T) { testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compact_downsample_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compact_downsample_failures_total")) - // We had 8 deletions based on 3 compactios, so 3 new blocks. - testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(float64(len(rawBlockIDs)-16+5)), "thanos_blocks_meta_synced")) + testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(float64( + len(rawBlockIDs)+7+ + 5+ // 5 compactions, 5 newly added blocks. + -2, // Partial block removed. + )), "thanos_blocks_meta_synced")) testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(0), "thanos_blocks_meta_sync_failures_total")) testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compactor_halted")) @@ -418,43 +538,56 @@ func TestCompactWithStoreGateway(t *testing.T) { promclient.QueryOptions{ Deduplicate: false, // This should be false, so that we can be sure deduplication was offline. }, - model.Vector{ - // NOTE(bwplotka): Even after deduplication some series has still replica labels. This is because those blocks did not overlap yet with anything. - // This is fine as querier deduplication will remove it if needed. - {Value: 360, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "2", "case": "no-compaction", "replica": "1"}}, - {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "2", "case": "compaction-ready"}}, - {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "3", "case": "compaction-ready"}}, - {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "4", "case": "compaction-ready"}}, - {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "5", "case": "compaction-ready"}}, - {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "6", "case": "compaction-ready", "replica": "1"}}, - {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "2", "case": "compaction-ready-after-dedup"}}, - {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "3", "case": "compaction-ready-after-dedup"}}, - {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "4", "case": "compaction-ready-after-dedup"}}, - {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "5", "case": "compaction-ready-after-dedup"}}, - {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "6", "case": "compaction-ready-after-dedup", "replica": "1"}}, - {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "1", "case": "a-partial-overlap-dedup-ready"}}, - {Value: 360, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "2", "case": "a-partial-overlap-dedup-ready"}}, - {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "2", "case": "a-partial-overlap-dedup-ready", "replica": "1"}}, - {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "3", "case": "a-partial-overlap-dedup-ready"}}, - {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "4", "case": "a-partial-overlap-dedup-ready"}}, - {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "5", "case": "a-partial-overlap-dedup-ready", "replica": "1"}}, - {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "1", "case": "partial-multi-replica-overlap-dedup-ready"}}, - {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "2", "case": "partial-multi-replica-overlap-dedup-ready", "replica": "1", "rule_replica": "1"}}, - {Value: 240, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "2", "case": "partial-multi-replica-overlap-dedup-ready"}}, - {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "3", "case": "partial-multi-replica-overlap-dedup-ready"}}, - {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "4", "case": "partial-multi-replica-overlap-dedup-ready"}}, - {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "5", "case": "partial-multi-replica-overlap-dedup-ready", "replica": "1", "rule_replica": "1"}}, - {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "1", "case": "full-replica-overlap-dedup-ready"}}, - {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "2", "case": "full-replica-overlap-dedup-ready"}}, - {Value: 240, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "2", "case": "full-replica-overlap-dedup-ready", "replica": "1"}}, - {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "3", "case": "full-replica-overlap-dedup-ready"}}, - {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "4", "case": "full-replica-overlap-dedup-ready", "replica": "1"}}, - {Value: 120, Metric: map[model.LabelName]model.LabelValue{"a": "1", "b": "5", "case": "full-replica-overlap-dedup-ready", "replica": "1"}}, + expectedEndVector, + ) + // Store view: + testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(float64(len(rawBlockIDs)+7+5-2)), "thanos_blocks_meta_synced")) + testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(0), "thanos_blocks_meta_sync_failures_total")) + testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(0), "thanos_blocks_meta_modified")) + }) + + t.Run("dedup enabled; no delete delay; compactor should work and remove things as expected", func(t *testing.T) { + c, err := e2ethanos.NewCompactor(s.SharedDir(), "working", svcConfig, nil, "--deduplication.replica-label=replica", "--deduplication.replica-label=rule_replica", "--delete-delay=0s") + testutil.Ok(t, err) + testutil.Ok(t, s.StartAndWaitReady(c)) + + // NOTE: We cannot assert on intermediate `thanos_blocks_meta_` metrics as those are gauge and change dynamically due to many + // compaction groups. Wait for at least first compaction iteration (next is in 5m). + testutil.Ok(t, c.WaitSumMetrics(e2e.Greater(0), "thanos_compactor_iterations_total")) + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(16), "thanos_compactor_blocks_cleaned_total")) + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compactor_block_cleanup_failures_total")) + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compactor_blocks_marked_for_deletion_total")) + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compactor_aborted_partial_uploads_deletion_attempts_total")) + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compact_group_compactions_total")) + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compact_group_vertical_compactions_total")) + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compact_group_compactions_failures_total")) + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(6), "thanos_compact_group_compaction_runs_started_total")) + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(6), "thanos_compact_group_compaction_runs_completed_total")) + + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compact_downsample_total")) + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compact_downsample_failures_total")) + + testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(float64(len(rawBlockIDs)+7+5-16-2)), "thanos_blocks_meta_synced")) + testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(0), "thanos_blocks_meta_sync_failures_total")) + + testutil.Ok(t, c.WaitSumMetrics(e2e.Equals(0), "thanos_compactor_halted")) + // Make sure compactor does not modify anything else over time. + testutil.Ok(t, s.Stop(c)) + + ctx, cancel = context.WithTimeout(context.Background(), 3*time.Minute) + defer cancel() + + // Check if query detects new blocks. + queryAndAssert(t, ctx, q.HTTPEndpoint(), + fmt.Sprintf(`count_over_time({a="1"}[13h] offset %ds)`, int64(time.Since(now.Add(12*time.Hour)).Seconds())), + promclient.QueryOptions{ + Deduplicate: false, // This should be false, so that we can be sure deduplication was offline. }, + expectedEndVector, ) // Store view: - testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(float64(len(rawBlockIDs)-16+5)), "thanos_blocks_meta_synced")) + testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(float64(len(rawBlockIDs)+7-16+5-2)), "thanos_blocks_meta_synced")) testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(0), "thanos_blocks_meta_sync_failures_total")) testutil.Ok(t, str.WaitSumMetrics(e2e.Equals(0), "thanos_blocks_meta_modified")) }) diff --git a/test/e2e/e2ethanos/services.go b/test/e2e/e2ethanos/services.go index c014a36d7b8..c0d4bc7f354 100644 --- a/test/e2e/e2ethanos/services.go +++ b/test/e2e/e2ethanos/services.go @@ -373,7 +373,6 @@ func NewCompactor(sharedDir string, name string, bucketConfig client.BucketConfi "--data-dir": container, "--objstore.config": string(bktConfigBytes), "--http-address": ":80", - "--delete-delay": "0s", "--block-sync-concurrency": "20", "--selector.relabel-config": string(relabelConfigBytes), "--wait": "", diff --git a/tutorials/katacoda/thanos/1-globalview/courseBase.sh b/tutorials/katacoda/thanos/1-globalview/courseBase.sh index 53b64d03ff0..c06dec0cf94 100644 --- a/tutorials/katacoda/thanos/1-globalview/courseBase.sh +++ b/tutorials/katacoda/thanos/1-globalview/courseBase.sh @@ -1,4 +1,4 @@ #!/usr/bin/env bash docker pull quay.io/prometheus/prometheus:v2.16.0 -docker pull quay.io/thanos/thanos:v0.12.0 +docker pull quay.io/thanos/thanos:v0.12.1 diff --git a/tutorials/katacoda/thanos/1-globalview/step2.md b/tutorials/katacoda/thanos/1-globalview/step2.md index ed8bcc1576f..af4b9d15b67 100644 --- a/tutorials/katacoda/thanos/1-globalview/step2.md +++ b/tutorials/katacoda/thanos/1-globalview/step2.md @@ -10,7 +10,7 @@ component and can be invoked in a single command. Let's take a look at all the Thanos commands: ``` -docker run --rm quay.io/thanos/thanos:v0.12.0 --help +docker run --rm quay.io/thanos/thanos:v0.12.1 --help ```{{execute}} You should see multiple commands that solves different purposes. @@ -53,7 +53,7 @@ docker run -d --net=host --rm \ -v $(pwd)/prometheus0_eu1.yml:/etc/prometheus/prometheus.yml \ --name prometheus-0-sidecar-eu1 \ -u root \ - quay.io/thanos/thanos:v0.12.0 \ + quay.io/thanos/thanos:v0.12.1 \ sidecar \ --http-address 0.0.0.0:19090 \ --grpc-address 0.0.0.0:19190 \ @@ -68,7 +68,7 @@ docker run -d --net=host --rm \ -v $(pwd)/prometheus0_us1.yml:/etc/prometheus/prometheus.yml \ --name prometheus-0-sidecar-us1 \ -u root \ - quay.io/thanos/thanos:v0.12.0 \ + quay.io/thanos/thanos:v0.12.1 \ sidecar \ --http-address 0.0.0.0:19091 \ --grpc-address 0.0.0.0:19191 \ @@ -81,7 +81,7 @@ docker run -d --net=host --rm \ -v $(pwd)/prometheus1_us1.yml:/etc/prometheus/prometheus.yml \ --name prometheus-1-sidecar-us1 \ -u root \ - quay.io/thanos/thanos:v0.12.0 \ + quay.io/thanos/thanos:v0.12.1 \ sidecar \ --http-address 0.0.0.0:19092 \ --grpc-address 0.0.0.0:19192 \ diff --git a/tutorials/katacoda/thanos/1-globalview/step3.md b/tutorials/katacoda/thanos/1-globalview/step3.md index c7635b46071..8d4b17e84df 100644 --- a/tutorials/katacoda/thanos/1-globalview/step3.md +++ b/tutorials/katacoda/thanos/1-globalview/step3.md @@ -28,7 +28,7 @@ Click below snippet to start the Querier. ``` docker run -d --net=host --rm \ --name querier \ - quay.io/thanos/thanos:v0.12.0 \ + quay.io/thanos/thanos:v0.12.1 \ query \ --http-address 0.0.0.0:29090 \ --query.replica-label replica \