Skip to content

Commit f4757f5

Browse files
krasi-georgievbwplotka
authored andcommitted
Query: allow multiple deduplication label. (#1362)
Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>
1 parent cc17395 commit f4757f5

File tree

9 files changed

+706
-193
lines changed

9 files changed

+706
-193
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,15 @@ Accepted into CNCF:
3030
request for both Prometheus and sidecar. Single requests now should take constant amount of memory on sidecar, so resource consumption prediction is now straightforward. This will be used if you have Prometheus `2.13` or `2.12-master`.
3131
- [#1358](https://github.com/thanos-io/thanos/pull/1358) Added `part_size` configuration option for HTTP multipart requests minimum part size for S3 storage type
3232
- [#1363](https://github.com/thanos-io/thanos/pull/1363) Thanos Receive now exposes `thanos_receive_hashring_nodes` and `thanos_receive_hashring_tenants` metrics to monitor status of hash-rings
33+
- [#1362](https://github.com/thanos-io/thanos/pull/1362) Optional `replicaLabels` param for `/query` and `/query_range` querier endpoints. When provided overwrite the `query.replica-label` cli flags.
3334
- [#1395](https://github.com/thanos-io/thanos/pull/1395) Thanos Sidecar added `/-/ready` and `/-/healthy` endpoints to Thanos sidecar.
3435
- [#1297](https://github.com/thanos-io/thanos/pull/1297) Thanos Compact added `/-/ready` and `/-/healthy` endpoints to Thanos compact.
3536
- [#1431](https://github.com/thanos-io/thanos/pull/1431) Thanos Query added hidden flag to allow the use of downsampled resolution data for instant queries.
3637
- [#1408](https://github.com/thanos-io/thanos/pull/1408) Thanos Store Gateway can now allow the specifying of supported time ranges it will serve (time sharding). Flags: `min-time` & `max-time`
3738

3839
### Changed
3940

41+
- [#1362](https://github.com/thanos-io/thanos/pull/1362) `query.replica-label` configuration can be provided more than once for multiple deduplication labels like: `--query.replica-label=prometheus_replica --query.replica-label=service`.
4042
- [#1414](https://github.com/thanos-io/thanos/pull/1413) Upgraded important dependencies: Prometheus to 2.12-rc.0. TSDB is now part of Prometheus.
4143
- [#1380](https://github.com/thanos-io/thanos/pull/1380) Upgraded important dependencies: Prometheus to 2.11.1 and TSDB to 0.9.1. Some changes affecting Querier:
4244
- [ENHANCEMENT] Query performance improvement: Efficient iteration and search in HashForLabels and HashWithoutLabels. #5707

cmd/thanos/query.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,8 @@ func registerQuery(m map[string]setupFunc, app *kingpin.Application, name string
6363
maxConcurrentQueries := cmd.Flag("query.max-concurrent", "Maximum number of queries processed concurrently by query node.").
6464
Default("20").Int()
6565

66-
replicaLabel := cmd.Flag("query.replica-label", "Label to treat as a replica indicator along which data is deduplicated. Still you will be able to query without deduplication using 'dedup=false' parameter.").
67-
String()
66+
replicaLabels := cmd.Flag("query.replica-label", "Labels to treat as a replica indicator along which data is deduplicated. Still you will be able to query without deduplication using 'dedup=false' parameter.").
67+
Strings()
6868

6969
instantDefaultMaxSourceResolution := modelDuration(cmd.Flag("query.instant.default.max_source_resolution", "default value for max_source_resolution for instant queries. If not set, defaults to 0s only taking raw resolution into account. 1h can be a good value if you use instant queries over time ranges that incorporate times outside of your raw-retention.").Default("0s").Hidden())
7070

@@ -146,7 +146,7 @@ func registerQuery(m map[string]setupFunc, app *kingpin.Application, name string
146146
*maxConcurrentQueries,
147147
time.Duration(*queryTimeout),
148148
time.Duration(*storeResponseTimeout),
149-
*replicaLabel,
149+
*replicaLabels,
150150
selectorLset,
151151
*stores,
152152
*enableAutodownsampling,
@@ -264,7 +264,7 @@ func runQuery(
264264
maxConcurrentQueries int,
265265
queryTimeout time.Duration,
266266
storeResponseTimeout time.Duration,
267-
replicaLabel string,
267+
replicaLabels []string,
268268
selectorLset labels.Labels,
269269
storeAddrs []string,
270270
enableAutodownsampling bool,
@@ -312,7 +312,7 @@ func runQuery(
312312
unhealthyStoreTimeout,
313313
)
314314
proxy = store.NewProxyStore(logger, stores.Get, component.Query, selectorLset, storeResponseTimeout)
315-
queryableCreator = query.NewQueryableCreator(logger, proxy, replicaLabel)
315+
queryableCreator = query.NewQueryableCreator(logger, proxy)
316316
engine = promql.NewEngine(
317317
promql.EngineOpts{
318318
Logger: logger,
@@ -404,7 +404,7 @@ func runQuery(
404404
ins := extpromhttp.NewInstrumentationMiddleware(reg)
405405
ui.NewQueryUI(logger, reg, stores, flagsMap).Register(router.WithPrefix(webRoutePrefix), ins)
406406

407-
api := v1.NewAPI(logger, reg, engine, queryableCreator, enableAutodownsampling, enablePartialResponse, instantDefaultMaxSourceResolution)
407+
api := v1.NewAPI(logger, reg, engine, queryableCreator, enableAutodownsampling, enablePartialResponse, replicaLabels, instantDefaultMaxSourceResolution)
408408

409409
api.Register(router.WithPrefix(path.Join(webRoutePrefix, "/api/v1")), tracer, logger, ins)
410410

docs/components/query.md

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,12 @@ $ thanos query \
2323
## Deduplication
2424

2525
The query layer can deduplicate series that were collected from high-availability pairs of data sources such as Prometheus.
26-
A fixed replica label must be chosen for the entire cluster and can then be passed to query nodes on startup.
26+
A fixed single or multiple replica labels must be chosen for the entire cluster and can then be passed to query nodes on startup.
2727

2828
Two or more series that are only distinguished by the given replica label, will be merged into a single time series.
29-
This also hides gaps in collection of a single data source. For example:
29+
This also hides gaps in collection of a single data source.
30+
31+
### An example with a single replica labels:
3032

3133
* Prometheus + sidecar "A": `cluster=1,env=2,replica=A`
3234
* Prometheus + sidecar "B": `cluster=1,env=2,replica=B`
@@ -47,12 +49,28 @@ And we query for metric `up{job="prometheus",env="2"}` with this option we will
4749
* `up{job="prometheus",env="2",cluster="1"} 1`
4850
* `up{job="prometheus",env="2",cluster="2"} 1`
4951

50-
WITHOUT this replica flag (so deduplication turned off), we will get 3 results:
52+
WITHOUT this replica flag (deduplication turned off), we will get 3 results:
5153

5254
* `up{job="prometheus",env="2",cluster="1",replica="A"} 1`
5355
* `up{job="prometheus",env="2",cluster="1",replica="B"} 1`
5456
* `up{job="prometheus",env="2",cluster="2",replica="A"} 1`
5557

58+
### The same output will be present for this example with multiple replica labels:
59+
60+
* Prometheus + sidecar "A": `cluster=1,env=2,replica=A,replicaX=A`
61+
* Prometheus + sidecar "B": `cluster=1,env=2,replica=B,replicaX=B`
62+
* Prometheus + sidecar "A" in different cluster: `cluster=2,env=2,replica=A,replicaX=A`
63+
64+
```
65+
$ thanos query \
66+
--http-address "0.0.0.0:9090" \
67+
--query.replica-label "replica" \
68+
--query.replica-label "replicaX" \
69+
--store "<store-api>:<grpc-port>" \
70+
--store "<store-api2>:<grpc-port>" \
71+
```
72+
73+
5674
This logic can also be controlled via parameter on QueryAPI. More details below.
5775

5876
## Query API
@@ -89,14 +107,23 @@ Querier also allows to configure different timeouts:
89107
If you prefer availability over accuracy you can set tighter timeout to underlying StoreAPI than overall query timeout. If partial response
90108
strategy is NOT `abort`, this will "ignore" slower StoreAPIs producing just warning with 200 status code response.
91109

110+
### Deduplication replica labels.
111+
112+
| HTTP URL/FORM parameter | Type | Default | Example |
113+
|----|----|----|----|
114+
| `replicaLabels` | `[]string` | `query.replica-label` flag (default: empty). | `replicaLabels=replicaA&replicaLabels=replicaB` |
115+
| | | | |
116+
117+
This overwrites the `query.replica-label` cli flag to allow dynamic replica labels at query time.
118+
92119
### Deduplication Enabled
93120

94121
| HTTP URL/FORM parameter | Type | Default | Example |
95122
|----|----|----|----|
96123
| `dedup` | `Boolean` | True, but effect depends on `query.replica` configuration flag. | `1, t, T, TRUE, true, True` for "True" |
97124
| | | | |
98125

99-
This controls if query should use `replica` label for deduplication or not.
126+
This controls if query results should be deduplicated using the replica labels.
100127

101128
### Auto downsampling
102129

@@ -236,8 +263,8 @@ Flags:
236263
--query.timeout=2m Maximum time to process query by query node.
237264
--query.max-concurrent=20 Maximum number of queries processed
238265
concurrently by query node.
239-
--query.replica-label=QUERY.REPLICA-LABEL
240-
Label to treat as a replica indicator along
266+
--query.replica-label=QUERY.REPLICA-LABEL ...
267+
Labels to treat as a replica indicator along
241268
which data is deduplicated. Still you will be
242269
able to query without deduplication using
243270
'dedup=false' parameter.

docs/getting-started.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,7 @@ thanos query \
197197
--store 1.2.3.4:19090 \
198198
--store 1.2.3.5:19090 \
199199
--query.replica-label replica # Replica label for de-duplication
200+
--query.replica-label replicaX # Supports multiple replica labels for de-duplication
200201
```
201202

202203
Go to the configured HTTP address, and you should now be able to query across all Prometheus instances and receive de-duplicated data.

pkg/query/api/v1.go

Lines changed: 39 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ type API struct {
102102

103103
enableAutodownsampling bool
104104
enablePartialResponse bool
105+
replicaLabels []string
105106
reg prometheus.Registerer
106107
defaultInstantQueryMaxSourceResolution time.Duration
107108

@@ -116,6 +117,7 @@ func NewAPI(
116117
c query.QueryableCreator,
117118
enableAutodownsampling bool,
118119
enablePartialResponse bool,
120+
replicaLabels []string,
119121
defaultInstantQueryMaxSourceResolution time.Duration,
120122
) *API {
121123
return &API{
@@ -124,6 +126,7 @@ func NewAPI(
124126
queryableCreate: c,
125127
enableAutodownsampling: enableAutodownsampling,
126128
enablePartialResponse: enablePartialResponse,
129+
replicaLabels: replicaLabels,
127130
reg: reg,
128131
defaultInstantQueryMaxSourceResolution: defaultInstantQueryMaxSourceResolution,
129132

@@ -185,6 +188,21 @@ func (api *API) parseEnableDedupParam(r *http.Request) (enableDeduplication bool
185188
return enableDeduplication, nil
186189
}
187190

191+
func (api *API) parseReplicaLabelsParam(r *http.Request) (replicaLabels []string, _ *ApiError) {
192+
const replicaLabelsParam = "replicaLabels[]"
193+
if err := r.ParseForm(); err != nil {
194+
return nil, &ApiError{ErrorInternal, errors.Wrap(err, "parse form")}
195+
}
196+
197+
replicaLabels = api.replicaLabels
198+
// Overwrite the cli flag when provided as a query parameter.
199+
if len(r.Form[replicaLabelsParam]) > 0 {
200+
replicaLabels = r.Form[replicaLabelsParam]
201+
}
202+
203+
return replicaLabels, nil
204+
}
205+
188206
func (api *API) parseDownsamplingParamMillis(r *http.Request, defaultVal time.Duration) (maxResolutionMillis int64, _ *ApiError) {
189207
const maxSourceResolutionParam = "max_source_resolution"
190208
maxSourceResolution := 0 * time.Second
@@ -212,6 +230,7 @@ func (api *API) parsePartialResponseParam(r *http.Request) (enablePartialRespons
212230
const partialResponseParam = "partial_response"
213231
enablePartialResponse = api.enablePartialResponse
214232

233+
// Overwrite the cli flag when provided as a query parameter.
215234
if val := r.FormValue(partialResponseParam); val != "" {
216235
var err error
217236
enablePartialResponse, err = strconv.ParseBool(val)
@@ -255,6 +274,11 @@ func (api *API) query(r *http.Request) (interface{}, []error, *ApiError) {
255274
return nil, nil, apiErr
256275
}
257276

277+
replicaLabels, apiErr := api.parseReplicaLabelsParam(r)
278+
if apiErr != nil {
279+
return nil, nil, apiErr
280+
}
281+
258282
enablePartialResponse, apiErr := api.parsePartialResponseParam(r)
259283
if apiErr != nil {
260284
return nil, nil, apiErr
@@ -269,7 +293,7 @@ func (api *API) query(r *http.Request) (interface{}, []error, *ApiError) {
269293
span, ctx := tracing.StartSpan(ctx, "promql_instant_query")
270294
defer span.Finish()
271295

272-
qry, err := api.queryEngine.NewInstantQuery(api.queryableCreate(enableDedup, maxSourceResolution, enablePartialResponse), r.FormValue("query"), ts)
296+
qry, err := api.queryEngine.NewInstantQuery(api.queryableCreate(enableDedup, replicaLabels, maxSourceResolution, enablePartialResponse), r.FormValue("query"), ts)
273297
if err != nil {
274298
return nil, nil, &ApiError{errorBadData, err}
275299
}
@@ -341,6 +365,11 @@ func (api *API) queryRange(r *http.Request) (interface{}, []error, *ApiError) {
341365
return nil, nil, apiErr
342366
}
343367

368+
replicaLabels, apiErr := api.parseReplicaLabelsParam(r)
369+
if apiErr != nil {
370+
return nil, nil, apiErr
371+
}
372+
344373
// If no max_source_resolution is specified fit at least 5 samples between steps.
345374
maxSourceResolution, apiErr := api.parseDownsamplingParamMillis(r, step/5)
346375
if apiErr != nil {
@@ -357,7 +386,7 @@ func (api *API) queryRange(r *http.Request) (interface{}, []error, *ApiError) {
357386
defer span.Finish()
358387

359388
qry, err := api.queryEngine.NewRangeQuery(
360-
api.queryableCreate(enableDedup, maxSourceResolution, enablePartialResponse),
389+
api.queryableCreate(enableDedup, replicaLabels, maxSourceResolution, enablePartialResponse),
361390
r.FormValue("query"),
362391
start,
363392
end,
@@ -397,7 +426,7 @@ func (api *API) labelValues(r *http.Request) (interface{}, []error, *ApiError) {
397426
return nil, nil, apiErr
398427
}
399428

400-
q, err := api.queryableCreate(true, 0, enablePartialResponse).Querier(ctx, math.MinInt64, math.MaxInt64)
429+
q, err := api.queryableCreate(true, nil, 0, enablePartialResponse).Querier(ctx, math.MinInt64, math.MaxInt64)
401430
if err != nil {
402431
return nil, nil, &ApiError{errorExec, err}
403432
}
@@ -463,13 +492,18 @@ func (api *API) series(r *http.Request) (interface{}, []error, *ApiError) {
463492
return nil, nil, apiErr
464493
}
465494

495+
replicaLabels, apiErr := api.parseReplicaLabelsParam(r)
496+
if apiErr != nil {
497+
return nil, nil, apiErr
498+
}
499+
466500
enablePartialResponse, apiErr := api.parsePartialResponseParam(r)
467501
if apiErr != nil {
468502
return nil, nil, apiErr
469503
}
470504

471505
// TODO(bwplotka): Support downsampling?
472-
q, err := api.queryableCreate(enableDedup, 0, enablePartialResponse).Querier(r.Context(), timestamp.FromTime(start), timestamp.FromTime(end))
506+
q, err := api.queryableCreate(enableDedup, replicaLabels, 0, enablePartialResponse).Querier(r.Context(), timestamp.FromTime(start), timestamp.FromTime(end))
473507
if err != nil {
474508
return nil, nil, &ApiError{errorExec, err}
475509
}
@@ -572,7 +606,7 @@ func (api *API) labelNames(r *http.Request) (interface{}, []error, *ApiError) {
572606
return nil, nil, apiErr
573607
}
574608

575-
q, err := api.queryableCreate(true, 0, enablePartialResponse).Querier(ctx, math.MinInt64, math.MaxInt64)
609+
q, err := api.queryableCreate(true, nil, 0, enablePartialResponse).Querier(ctx, math.MinInt64, math.MaxInt64)
576610
if err != nil {
577611
return nil, nil, &ApiError{errorExec, err}
578612
}

0 commit comments

Comments
 (0)