Skip to content

Commit 56abeab

Browse files
simonpasquier authored and bwplotka committed
*: Added support for authentication and TLS for Alertmanager (#1838)
* *: support authentication and TLS for Alertmanager

  This change adds support for authentication with basic auth, client certificates and bearer tokens. It also makes it possible to configure TLS settings for the Alertmanager endpoints. Most of the work leverages the existing Prometheus configuration format and code. In particular, TLS certificate files are automatically reloaded whenever they change.

  Signed-off-by: Simon Pasquier <spasquie@redhat.com>

* Fail hard when --alertmanagers.url and --alertmanagers.config flags are both defined

  Signed-off-by: Simon Pasquier <spasquie@redhat.com>

* Update CHANGELOG.md

  Signed-off-by: Simon Pasquier <spasquie@redhat.com>

* Move tests from cmd/thanos to pkg/alert

  Signed-off-by: Simon Pasquier <spasquie@redhat.com>

* Add end-to-end test for Alertmanager file SD

  Signed-off-by: Simon Pasquier <spasquie@redhat.com>

* test/e2e: add test with different alerting HTTP clients

  Signed-off-by: Simon Pasquier <spasquie@redhat.com>

* Fix panic in pkg/alert/client_test.go

  Signed-off-by: Simon Pasquier <spasquie@redhat.com>

* Address Bartek's comments

  Signed-off-by: Simon Pasquier <spasquie@redhat.com>

* Re-use dns.Provider for resolving Alertmanager addresses

  Signed-off-by: Simon Pasquier <spasquie@redhat.com>

* Update documentation

  Signed-off-by: Simon Pasquier <spasquie@redhat.com>
1 parent 612f533 commit 56abeab

File tree

13 files changed

+1020
-372
lines changed

13 files changed

+1020
-372
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ We use *breaking* word for marking changes that are not backward compatible (rel
1818
### Added
1919
- [#1852](https://github.com/thanos-io/thanos/pull/1852) Add support for `AWS_CONTAINER_CREDENTIALS_FULL_URI` by upgrading to minio-go v6.0.44
2020
- [#1854](https://github.com/thanos-io/thanos/pull/1854) Update Rule UI to support alerts count displaying and filtering.
21+
- [#1838](https://github.com/thanos-io/thanos/pull/1838) Ruler: Add TLS and authentication support for Alertmanager with the `--alertmanagers.config` and `--alertmanagers.config-file` CLI flags. See [documentation](docs/components/rule.md#configuration) for further information.
22+
- [#1838](https://github.com/thanos-io/thanos/pull/1838) Ruler: Add a new `--alertmanagers.sd-dns-interval` CLI option to specify the interval between DNS resolutions of Alertmanager hosts.
2123

2224
## [v0.9.0](https://github.com/thanos-io/thanos/releases/tag/v0.9.0) - 2019.12.03
2325

cmd/thanos/rule.go

Lines changed: 84 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ import (
44
"context"
55
"fmt"
66
"math/rand"
7-
"net"
87
"net/http"
98
"net/url"
109
"os"
@@ -13,7 +12,6 @@ import (
1312
"path/filepath"
1413
"strconv"
1514
"strings"
16-
"sync"
1715
"syscall"
1816
"time"
1917

@@ -83,8 +81,10 @@ func registerRule(m map[string]setupFunc, app *kingpin.Application) {
8381

8482
alertmgrs := cmd.Flag("alertmanagers.url", "Alertmanager replica URLs to push firing alerts. Ruler claims success if push to at least one alertmanager from discovered succeeds. The scheme should not be empty e.g `http` might be used. The scheme may be prefixed with 'dns+' or 'dnssrv+' to detect Alertmanager IPs through respective DNS lookups. The port defaults to 9093 or the SRV record's value. The URL path is used as a prefix for the regular Alertmanager API path.").
8583
Strings()
86-
87-
alertmgrsTimeout := cmd.Flag("alertmanagers.send-timeout", "Timeout for sending alerts to alertmanager").Default("10s").Duration()
84+
alertmgrsTimeout := cmd.Flag("alertmanagers.send-timeout", "Timeout for sending alerts to Alertmanager").Default("10s").Duration()
85+
alertmgrsConfig := extflag.RegisterPathOrContent(cmd, "alertmanagers.config", "YAML file that contains alerting configuration. See format details: https://thanos.io/components/rule.md/#configuration. If defined, it takes precedence over the '--alertmanagers.url' and '--alertmanagers.send-timeout' flags.", false)
86+
alertmgrsDNSSDInterval := modelDuration(cmd.Flag("alertmanagers.sd-dns-interval", "Interval between DNS resolutions of Alertmanager hosts.").
87+
Default("30s"))
8888

8989
alertQueryURL := cmd.Flag("alert.query-url", "The external Thanos Query URL that would be set in all alerts 'Source' field").String()
9090

@@ -157,6 +157,8 @@ func registerRule(m map[string]setupFunc, app *kingpin.Application) {
157157
lset,
158158
*alertmgrs,
159159
*alertmgrsTimeout,
160+
alertmgrsConfig,
161+
time.Duration(*alertmgrsDNSSDInterval),
160162
*grpcBindAddr,
161163
time.Duration(*grpcGracePeriod),
162164
*grpcCert,
@@ -194,6 +196,8 @@ func runRule(
194196
lset labels.Labels,
195197
alertmgrURLs []string,
196198
alertmgrsTimeout time.Duration,
199+
alertmgrsConfig *extflag.PathOrContent,
200+
alertmgrsDNSSDInterval time.Duration,
197201
grpcBindAddr string,
198202
grpcGracePeriod time.Duration,
199203
grpcCert string,
@@ -286,11 +290,56 @@ func runRule(
286290
dns.ResolverType(dnsSDResolver),
287291
)
288292

293+
// Build the Alertmanager clients.
294+
alertmgrsConfigYAML, err := alertmgrsConfig.Content()
295+
if err != nil {
296+
return err
297+
}
298+
var (
299+
alertingCfg alert.AlertingConfig
300+
alertmgrs []*alert.Alertmanager
301+
)
302+
if len(alertmgrsConfigYAML) > 0 {
303+
if len(alertmgrURLs) != 0 {
304+
return errors.New("--alertmanagers.url and --alertmanagers.config* flags cannot be defined at the same time")
305+
}
306+
alertingCfg, err = alert.LoadAlertingConfig(alertmgrsConfigYAML)
307+
if err != nil {
308+
return err
309+
}
310+
} else {
311+
// Build the Alertmanager configuration from the legacy flags.
312+
for _, addr := range alertmgrURLs {
313+
cfg, err := alert.BuildAlertmanagerConfig(logger, addr, alertmgrsTimeout)
314+
if err != nil {
315+
return err
316+
}
317+
alertingCfg.Alertmanagers = append(alertingCfg.Alertmanagers, cfg)
318+
}
319+
}
320+
321+
if len(alertingCfg.Alertmanagers) == 0 {
322+
level.Warn(logger).Log("msg", "no alertmanager configured")
323+
}
324+
325+
amProvider := dns.NewProvider(
326+
logger,
327+
extprom.WrapRegistererWithPrefix("thanos_ruler_alertmanagers_", reg),
328+
dns.ResolverType(dnsSDResolver),
329+
)
330+
for _, cfg := range alertingCfg.Alertmanagers {
331+
// Each Alertmanager client has a different list of targets thus each needs its own DNS provider.
332+
am, err := alert.NewAlertmanager(logger, cfg, amProvider.Clone())
333+
if err != nil {
334+
return err
335+
}
336+
alertmgrs = append(alertmgrs, am)
337+
}
338+
289339
// Run rule evaluation and alert notifications.
290340
var (
291-
alertmgrs = newAlertmanagerSet(logger, alertmgrURLs, dns.ResolverType(dnsSDResolver))
292-
alertQ = alert.NewQueue(logger, reg, 10000, 100, labelsTSDBToProm(lset), alertExcludeLabels)
293-
ruleMgr = thanosrule.NewManager(dataDir)
341+
alertQ = alert.NewQueue(logger, reg, 10000, 100, labelsTSDBToProm(lset), alertExcludeLabels)
342+
ruleMgr = thanosrule.NewManager(dataDir)
294343
)
295344
{
296345
notify := func(ctx context.Context, expr string, alerts ...*rules.Alert) {
@@ -351,9 +400,35 @@ func runRule(
351400
})
352401
}
353402
}
403+
// Discover and resolve Alertmanager addresses.
404+
{
405+
for i := range alertmgrs {
406+
am := alertmgrs[i]
407+
ctx, cancel := context.WithCancel(context.Background())
408+
g.Add(func() error {
409+
am.Discover(ctx)
410+
return nil
411+
}, func(error) {
412+
cancel()
413+
})
414+
415+
g.Add(func() error {
416+
return runutil.Repeat(alertmgrsDNSSDInterval, ctx.Done(), func() error {
417+
am.Resolve(ctx)
418+
return nil
419+
})
420+
}, func(error) {
421+
cancel()
422+
})
423+
}
424+
}
425+
// Run the alert sender.
354426
{
355-
// TODO(bwplotka): https://github.com/thanos-io/thanos/issues/660.
356-
sdr := alert.NewSender(logger, reg, alertmgrs.get, nil, alertmgrsTimeout)
427+
clients := make([]alert.AlertmanagerClient, len(alertmgrs))
428+
for i := range alertmgrs {
429+
clients[i] = alertmgrs[i]
430+
}
431+
sdr := alert.NewSender(logger, reg, clients)
357432
ctx, cancel := context.WithCancel(context.Background())
358433

359434
g.Add(func() error {
@@ -370,21 +445,6 @@ func runRule(
370445
cancel()
371446
})
372447
}
373-
{
374-
ctx, cancel := context.WithCancel(context.Background())
375-
376-
g.Add(func() error {
377-
return runutil.Repeat(30*time.Second, ctx.Done(), func() error {
378-
if err := alertmgrs.update(ctx); err != nil {
379-
level.Error(logger).Log("msg", "refreshing alertmanagers failed", "err", err)
380-
alertMngrAddrResolutionErrors.Inc()
381-
}
382-
return nil
383-
})
384-
}, func(error) {
385-
cancel()
386-
})
387-
}
388448
// Run File Service Discovery and update the query addresses when the files are modified.
389449
if fileSD != nil {
390450
var fileSDUpdates chan []*targetgroup.Group
@@ -615,90 +675,6 @@ func runRule(
615675
return nil
616676
}
617677

618-
type alertmanagerSet struct {
619-
resolver dns.Resolver
620-
addrs []string
621-
mtx sync.Mutex
622-
current []*url.URL
623-
}
624-
625-
func newAlertmanagerSet(logger log.Logger, addrs []string, dnsSDResolver dns.ResolverType) *alertmanagerSet {
626-
return &alertmanagerSet{
627-
resolver: dns.NewResolver(dnsSDResolver.ToResolver(logger)),
628-
addrs: addrs,
629-
}
630-
}
631-
632-
func (s *alertmanagerSet) get() []*url.URL {
633-
s.mtx.Lock()
634-
defer s.mtx.Unlock()
635-
return s.current
636-
}
637-
638-
const defaultAlertmanagerPort = 9093
639-
640-
func parseAlertmanagerAddress(addr string) (qType dns.QType, parsedUrl *url.URL, err error) {
641-
qType = ""
642-
parsedUrl, err = url.Parse(addr)
643-
if err != nil {
644-
return qType, nil, err
645-
}
646-
// The Scheme might contain DNS resolver type separated by + so we split it a part.
647-
if schemeParts := strings.Split(parsedUrl.Scheme, "+"); len(schemeParts) > 1 {
648-
parsedUrl.Scheme = schemeParts[len(schemeParts)-1]
649-
qType = dns.QType(strings.Join(schemeParts[:len(schemeParts)-1], "+"))
650-
}
651-
return qType, parsedUrl, err
652-
}
653-
654-
func (s *alertmanagerSet) update(ctx context.Context) error {
655-
var result []*url.URL
656-
for _, addr := range s.addrs {
657-
var (
658-
qtype dns.QType
659-
resolvedDomain []string
660-
)
661-
662-
qtype, u, err := parseAlertmanagerAddress(addr)
663-
if err != nil {
664-
return errors.Wrapf(err, "parse URL %q", addr)
665-
}
666-
667-
// Get only the host and resolve it if needed.
668-
host := u.Host
669-
if qtype != "" {
670-
if qtype == dns.A {
671-
_, _, err = net.SplitHostPort(host)
672-
if err != nil {
673-
// The host could be missing a port. Append the defaultAlertmanagerPort.
674-
host = host + ":" + strconv.Itoa(defaultAlertmanagerPort)
675-
}
676-
}
677-
resolvedDomain, err = s.resolver.Resolve(ctx, host, qtype)
678-
if err != nil {
679-
return errors.Wrap(err, "alertmanager resolve")
680-
}
681-
} else {
682-
resolvedDomain = []string{host}
683-
}
684-
685-
for _, resolved := range resolvedDomain {
686-
result = append(result, &url.URL{
687-
Scheme: u.Scheme,
688-
Host: resolved,
689-
Path: u.Path,
690-
User: u.User,
691-
})
692-
}
693-
}
694-
695-
s.mtx.Lock()
696-
s.current = result
697-
s.mtx.Unlock()
698-
699-
return nil
700-
}
701-
702678
func parseFlagLabels(s []string) (labels.Labels, error) {
703679
var lset labels.Labels
704680
for _, l := range s {

cmd/thanos/rule_test.go

Lines changed: 0 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,8 @@
11
package main
22

33
import (
4-
"context"
5-
"net/url"
64
"testing"
75

8-
"github.com/pkg/errors"
9-
"github.com/thanos-io/thanos/pkg/discovery/dns"
106
"github.com/thanos-io/thanos/pkg/testutil"
117
)
128

@@ -49,97 +45,3 @@ func Test_parseFlagLabels(t *testing.T) {
4945
testutil.Equals(t, err != nil, td.expectErr)
5046
}
5147
}
52-
53-
func TestRule_AlertmanagerResolveWithoutPort(t *testing.T) {
54-
mockResolver := mockResolver{
55-
resultIPs: map[string][]string{
56-
"alertmanager.com:9093": {"1.1.1.1:9300"},
57-
},
58-
}
59-
am := alertmanagerSet{resolver: mockResolver, addrs: []string{"dns+http://alertmanager.com"}}
60-
61-
ctx := context.TODO()
62-
err := am.update(ctx)
63-
testutil.Ok(t, err)
64-
65-
expected := []*url.URL{
66-
{
67-
Scheme: "http",
68-
Host: "1.1.1.1:9300",
69-
},
70-
}
71-
gotURLs := am.get()
72-
testutil.Equals(t, expected, gotURLs)
73-
}
74-
75-
func TestRule_AlertmanagerResolveWithPort(t *testing.T) {
76-
mockResolver := mockResolver{
77-
resultIPs: map[string][]string{
78-
"alertmanager.com:19093": {"1.1.1.1:9300"},
79-
},
80-
}
81-
am := alertmanagerSet{resolver: mockResolver, addrs: []string{"dns+http://alertmanager.com:19093"}}
82-
83-
ctx := context.TODO()
84-
err := am.update(ctx)
85-
testutil.Ok(t, err)
86-
87-
expected := []*url.URL{
88-
{
89-
Scheme: "http",
90-
Host: "1.1.1.1:9300",
91-
},
92-
}
93-
gotURLs := am.get()
94-
testutil.Equals(t, expected, gotURLs)
95-
}
96-
97-
type mockResolver struct {
98-
resultIPs map[string][]string
99-
err error
100-
}
101-
102-
func (m mockResolver) Resolve(ctx context.Context, name string, qtype dns.QType) ([]string, error) {
103-
if m.err != nil {
104-
return nil, m.err
105-
}
106-
if res, ok := m.resultIPs[name]; ok {
107-
return res, nil
108-
}
109-
return nil, errors.Errorf("mockResolver not found response for name: %s", name)
110-
}
111-
112-
func Test_ParseAlertmanagerAddress(t *testing.T) {
113-
var tData = []struct {
114-
address string
115-
expectQueryType dns.QType
116-
expectUrl *url.URL
117-
expectError error
118-
}{
119-
{
120-
address: "http://user:pass+word@foo.bar:3289",
121-
expectQueryType: dns.QType(""),
122-
expectUrl: &url.URL{Host: "foo.bar:3289", Scheme: "http", User: url.UserPassword("user", "pass+word")},
123-
expectError: nil,
124-
},
125-
{
126-
address: "dnssrvnoa+http://user:pass+word@foo.bar:3289",
127-
expectQueryType: dns.QType("dnssrvnoa"),
128-
expectUrl: &url.URL{Host: "foo.bar:3289", Scheme: "http", User: url.UserPassword("user", "pass+word")},
129-
expectError: nil,
130-
},
131-
{
132-
address: "foo+bar+http://foo.bar:3289",
133-
expectQueryType: dns.QType("foo+bar"),
134-
expectUrl: &url.URL{Host: "foo.bar:3289", Scheme: "http"},
135-
expectError: nil,
136-
},
137-
}
138-
139-
for _, d := range tData {
140-
q, u, e := parseAlertmanagerAddress(d.address)
141-
testutil.Equals(t, d.expectError, e)
142-
testutil.Equals(t, d.expectUrl, u)
143-
testutil.Equals(t, d.expectQueryType, q)
144-
}
145-
}

0 commit comments

Comments
 (0)