Skip to content

Commit 9d4d0bf

Browse files
kakkoyun authored and bwplotka committed
.*: Introduce graceful shutdown for gRPC Servers (#1687)
* Introduce graceful shutdown for gRPC
  Signed-off-by: Kemal Akkoyun <kakkoyun@gmail.com>
* Add missed cancel branch
  Signed-off-by: Kemal Akkoyun <kakkoyun@gmail.com>
* Remove stutter from server structs
  Signed-off-by: Kemal Akkoyun <kakkoyun@gmail.com>
* Close servers immediately if grace period is not specified
  Signed-off-by: Kemal Akkoyun <kakkoyun@gmail.com>
* Update CHANGELOG
  Signed-off-by: Kemal Akkoyun <kakkoyun@gmail.com>
* Rename TLS methods, clarify log messages
  Signed-off-by: Kemal Akkoyun <kakkoyun@gmail.com>
* Document public functions
  Signed-off-by: Kemal Akkoyun <kakkoyun@gmail.com>
* Fix review issues
  Signed-off-by: Kemal Akkoyun <kakkoyun@gmail.com>
* Update bucket docs
  Signed-off-by: Kemal Akkoyun <kakkoyun@gmail.com>
* trigger checks
  Signed-off-by: Kemal Akkoyun <kakkoyun@gmail.com>
* trigger checks
  Signed-off-by: Kemal Akkoyun <kakkoyun@gmail.com>
1 parent c7e787d commit 9d4d0bf

File tree

27 files changed

+566
-408
lines changed

27 files changed

+566
-408
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel
1313

1414
### Added
1515

16+
- [#1687](https://github.com/thanos-io/thanos/pull/1687) Add a new `--grpc-grace-period` CLI option to components which serve gRPC to set how long to wait until gRPC Server shuts down.
1617
- [#1660](https://github.com/thanos-io/thanos/pull/1660) Add a new `--prometheus.ready_timeout` CLI option to the sidecar to set how long to wait until Prometheus starts up.
1718
- [#1573](https://github.com/thanos-io/thanos/pull/1573) `AliYun OSS` object storage, see [documents](docs/storage.md#aliyun-oss) for further information.
1819
- [#1680](https://github.com/thanos-io/thanos/pull/1680) Add a new `--http-grace-period` CLI option to components which serve HTTP to set how long to wait until HTTP Server shuts down.

cmd/thanos/bucket.go

Lines changed: 14 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,15 @@ import (
1010
"text/template"
1111
"time"
1212

13+
"github.com/go-kit/kit/log"
14+
"github.com/go-kit/kit/log/level"
15+
"github.com/oklog/run"
16+
"github.com/oklog/ulid"
17+
"github.com/olekukonko/tablewriter"
18+
opentracing "github.com/opentracing/opentracing-go"
19+
"github.com/pkg/errors"
20+
"github.com/prometheus/client_golang/prometheus"
21+
"github.com/prometheus/prometheus/tsdb/labels"
1322
"github.com/thanos-io/thanos/pkg/block"
1423
"github.com/thanos-io/thanos/pkg/block/metadata"
1524
"github.com/thanos-io/thanos/pkg/compact"
@@ -20,19 +29,9 @@ import (
2029
"github.com/thanos-io/thanos/pkg/objstore/client"
2130
"github.com/thanos-io/thanos/pkg/prober"
2231
"github.com/thanos-io/thanos/pkg/runutil"
23-
"github.com/thanos-io/thanos/pkg/server"
32+
httpserver "github.com/thanos-io/thanos/pkg/server/http"
2433
"github.com/thanos-io/thanos/pkg/ui"
2534
"github.com/thanos-io/thanos/pkg/verifier"
26-
27-
"github.com/go-kit/kit/log"
28-
"github.com/go-kit/kit/log/level"
29-
"github.com/oklog/run"
30-
"github.com/oklog/ulid"
31-
"github.com/olekukonko/tablewriter"
32-
opentracing "github.com/opentracing/opentracing-go"
33-
"github.com/pkg/errors"
34-
"github.com/prometheus/client_golang/prometheus"
35-
"github.com/prometheus/prometheus/tsdb/labels"
3635
"golang.org/x/text/language"
3736
"golang.org/x/text/message"
3837
kingpin "gopkg.in/alecthomas/kingpin.v2"
@@ -311,7 +310,7 @@ func registerBucketInspect(m map[string]setupFunc, root *kingpin.CmdClause, name
311310
func registerBucketWeb(m map[string]setupFunc, root *kingpin.CmdClause, name string, objStoreConfig *extflag.PathOrContent) {
312311
cmd := root.Command("web", "Web interface for remote storage bucket")
313312
bind := cmd.Flag("listen", "HTTP host:port to listen on").Default("0.0.0.0:8080").String()
314-
httpGracePeriod := regHTTPGracePeriodFlag(cmd)
313+
_, httpGracePeriod := regHTTPFlags(cmd)
315314
interval := cmd.Flag("refresh", "Refresh interval to download metadata from remote storage").Default("30m").Duration()
316315
timeout := cmd.Flag("timeout", "Timeout to download metadata from remote storage").Default("5m").Duration()
317316
label := cmd.Flag("label", "Prometheus label to use as timeline title").String()
@@ -321,9 +320,9 @@ func registerBucketWeb(m map[string]setupFunc, root *kingpin.CmdClause, name str
321320

322321
statusProber := prober.NewProber(component.Bucket, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg))
323322
// Initiate HTTP listener providing metrics endpoint and readiness/liveness probes.
324-
srv := server.NewHTTP(logger, reg, component.Bucket, statusProber,
325-
server.WithListen(*bind),
326-
server.WithGracePeriod(time.Duration(*httpGracePeriod)),
323+
srv := httpserver.New(logger, reg, component.Bucket, statusProber,
324+
httpserver.WithListen(*bind),
325+
httpserver.WithGracePeriod(time.Duration(*httpGracePeriod)),
327326
)
328327

329328
bucketUI := ui.NewBucketUI(logger, *label)

cmd/thanos/compact.go

Lines changed: 6 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -10,9 +10,6 @@ import (
1010
"strings"
1111
"time"
1212

13-
"github.com/thanos-io/thanos/pkg/extflag"
14-
"github.com/thanos-io/thanos/pkg/server"
15-
1613
"github.com/go-kit/kit/log"
1714
"github.com/go-kit/kit/log/level"
1815
"github.com/oklog/run"
@@ -25,10 +22,12 @@ import (
2522
"github.com/thanos-io/thanos/pkg/compact"
2623
"github.com/thanos-io/thanos/pkg/compact/downsample"
2724
"github.com/thanos-io/thanos/pkg/component"
25+
"github.com/thanos-io/thanos/pkg/extflag"
2826
"github.com/thanos-io/thanos/pkg/objstore"
2927
"github.com/thanos-io/thanos/pkg/objstore/client"
3028
"github.com/thanos-io/thanos/pkg/prober"
3129
"github.com/thanos-io/thanos/pkg/runutil"
30+
httpserver "github.com/thanos-io/thanos/pkg/server/http"
3231
"gopkg.in/alecthomas/kingpin.v2"
3332
)
3433

@@ -79,8 +78,7 @@ func registerCompact(m map[string]setupFunc, app *kingpin.Application) {
7978
"Compaction index verification will ignore out of order label names.").
8079
Hidden().Default("false").Bool()
8180

82-
httpAddr := regHTTPAddrFlag(cmd)
83-
httpGracePeriod := regHTTPGracePeriodFlag(cmd)
81+
httpAddr, httpGracePeriod := regHTTPFlags(cmd)
8482

8583
dataDir := cmd.Flag("data-dir", "Data directory in which to cache blocks and process compactions.").
8684
Default("./data").String()
@@ -179,9 +177,9 @@ func runCompact(
179177

180178
statusProber := prober.NewProber(component, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg))
181179
// Initiate HTTP listener providing metrics endpoint and readiness/liveness probes.
182-
srv := server.NewHTTP(logger, reg, component, statusProber,
183-
server.WithListen(httpBindAddr),
184-
server.WithGracePeriod(httpGracePeriod),
180+
srv := httpserver.New(logger, reg, component, statusProber,
181+
httpserver.WithListen(httpBindAddr),
182+
httpserver.WithGracePeriod(httpGracePeriod),
185183
)
186184

187185
g.Add(srv.ListenAndServe, srv.Shutdown)

cmd/thanos/downsample.go

Lines changed: 6 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -6,9 +6,6 @@ import (
66
"path/filepath"
77
"time"
88

9-
"github.com/thanos-io/thanos/pkg/extflag"
10-
"github.com/thanos-io/thanos/pkg/server"
11-
129
"github.com/go-kit/kit/log"
1310
"github.com/go-kit/kit/log/level"
1411
"github.com/oklog/run"
@@ -23,19 +20,20 @@ import (
2320
"github.com/thanos-io/thanos/pkg/compact"
2421
"github.com/thanos-io/thanos/pkg/compact/downsample"
2522
"github.com/thanos-io/thanos/pkg/component"
23+
"github.com/thanos-io/thanos/pkg/extflag"
2624
"github.com/thanos-io/thanos/pkg/objstore"
2725
"github.com/thanos-io/thanos/pkg/objstore/client"
2826
"github.com/thanos-io/thanos/pkg/prober"
2927
"github.com/thanos-io/thanos/pkg/runutil"
28+
httpserver "github.com/thanos-io/thanos/pkg/server/http"
3029
kingpin "gopkg.in/alecthomas/kingpin.v2"
3130
)
3231

3332
func registerDownsample(m map[string]setupFunc, app *kingpin.Application) {
3433
comp := component.Downsample
3534
cmd := app.Command(comp.String(), "continuously downsamples blocks in an object store bucket")
3635

37-
httpAddr := regHTTPAddrFlag(cmd)
38-
httpGracePeriod := regHTTPGracePeriodFlag(cmd)
36+
httpAddr, httpGracePeriod := regHTTPFlags(cmd)
3937

4038
dataDir := cmd.Flag("data-dir", "Data directory in which to cache blocks and process downsamplings.").
4139
Default("./data").String()
@@ -126,9 +124,9 @@ func runDownsample(
126124
}
127125

128126
// Initiate HTTP listener providing metrics endpoint and readiness/liveness probes.
129-
srv := server.NewHTTP(logger, reg, comp, statusProber,
130-
server.WithListen(httpBindAddr),
131-
server.WithGracePeriod(httpGracePeriod),
127+
srv := httpserver.New(logger, reg, comp, statusProber,
128+
httpserver.WithListen(httpBindAddr),
129+
httpserver.WithGracePeriod(httpGracePeriod),
132130
)
133131
g.Add(srv.ListenAndServe, srv.Shutdown)
134132

cmd/thanos/flags.go

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -19,29 +19,31 @@ func modelDuration(flags *kingpin.FlagClause) *model.Duration {
1919

2020
func regGRPCFlags(cmd *kingpin.CmdClause) (
2121
grpcBindAddr *string,
22+
grpcGracePeriod *model.Duration,
2223
grpcTLSSrvCert *string,
2324
grpcTLSSrvKey *string,
2425
grpcTLSSrvClientCA *string,
2526
) {
2627
grpcBindAddr = cmd.Flag("grpc-address", "Listen ip:port address for gRPC endpoints (StoreAPI). Make sure this address is routable from other components.").
2728
Default("0.0.0.0:10901").String()
29+
grpcGracePeriod = modelDuration(cmd.Flag("grpc-grace-period", "Time to wait after an interrupt received for GRPC Server.").Default("2m")) // by default it's the same as query.timeout.
2830

2931
grpcTLSSrvCert = cmd.Flag("grpc-server-tls-cert", "TLS Certificate for gRPC server, leave blank to disable TLS").Default("").String()
3032
grpcTLSSrvKey = cmd.Flag("grpc-server-tls-key", "TLS Key for the gRPC server, leave blank to disable TLS").Default("").String()
3133
grpcTLSSrvClientCA = cmd.Flag("grpc-server-tls-client-ca", "TLS CA to verify clients against. If no client CA is specified, there is no client verification on server side. (tls.NoClientCert)").Default("").String()
3234

3335
return grpcBindAddr,
36+
grpcGracePeriod,
3437
grpcTLSSrvCert,
3538
grpcTLSSrvKey,
3639
grpcTLSSrvClientCA
3740
}
3841

39-
func regHTTPAddrFlag(cmd *kingpin.CmdClause) *string {
40-
return cmd.Flag("http-address", "Listen host:port for HTTP endpoints.").Default("0.0.0.0:10902").String()
41-
}
42+
func regHTTPFlags(cmd *kingpin.CmdClause) (httpBindAddr *string, httpGracePeriod *model.Duration) {
43+
httpBindAddr = cmd.Flag("http-address", "Listen host:port for HTTP endpoints.").Default("0.0.0.0:10902").String()
44+
httpGracePeriod = modelDuration(cmd.Flag("http-grace-period", "Time to wait after an interrupt received for HTTP Server.").Default("2m")) // by default it's the same as query.timeout.
4245

43-
func regHTTPGracePeriodFlag(cmd *kingpin.CmdClause) *model.Duration {
44-
return modelDuration(cmd.Flag("http-grace-period", "Time to wait after an interrupt received for HTTP Server.").Default("5s"))
46+
return httpBindAddr, httpGracePeriod
4547
}
4648

4749
func regCommonObjStoreFlags(cmd *kingpin.CmdClause, suffix string, required bool, extraDesc ...string) *extflag.PathOrContent {

cmd/thanos/main.go

Lines changed: 0 additions & 156 deletions
Original file line number | Diff line number | Diff line change
@@ -2,39 +2,25 @@ package main
22

33
import (
44
"context"
5-
"crypto/tls"
6-
"crypto/x509"
75
"fmt"
86
"io"
9-
"io/ioutil"
10-
"math"
117
"os"
128
"os/signal"
139
"path/filepath"
1410
"runtime"
15-
"runtime/debug"
1611
"syscall"
1712

1813
gmetrics "github.com/armon/go-metrics"
1914
gprom "github.com/armon/go-metrics/prometheus"
2015
"github.com/go-kit/kit/log"
2116
"github.com/go-kit/kit/log/level"
22-
grpc_middleware "github.com/grpc-ecosystem/go-grpc-middleware"
23-
grpc_recovery "github.com/grpc-ecosystem/go-grpc-middleware/recovery"
24-
grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus"
2517
"github.com/oklog/run"
2618
"github.com/opentracing/opentracing-go"
2719
"github.com/pkg/errors"
2820
"github.com/prometheus/client_golang/prometheus"
2921
"github.com/prometheus/common/version"
30-
"github.com/thanos-io/thanos/pkg/store/storepb"
31-
"github.com/thanos-io/thanos/pkg/tracing"
3222
"github.com/thanos-io/thanos/pkg/tracing/client"
3323
"go.uber.org/automaxprocs/maxprocs"
34-
"google.golang.org/grpc"
35-
"google.golang.org/grpc/codes"
36-
"google.golang.org/grpc/credentials"
37-
"google.golang.org/grpc/status"
3824
"gopkg.in/alecthomas/kingpin.v2"
3925
)
4026

@@ -222,145 +208,3 @@ func interrupt(logger log.Logger, cancel <-chan struct{}) error {
222208
return errors.New("canceled")
223209
}
224210
}
225-
226-
func defaultGRPCTLSServerOpts(logger log.Logger, cert, key, clientCA string) ([]grpc.ServerOption, error) {
227-
opts := []grpc.ServerOption{}
228-
tlsCfg, err := defaultTLSServerOpts(log.With(logger, "protocol", "gRPC"), cert, key, clientCA)
229-
if err != nil {
230-
return opts, err
231-
}
232-
if tlsCfg != nil {
233-
opts = append(opts, grpc.Creds(credentials.NewTLS(tlsCfg)))
234-
}
235-
return opts, nil
236-
}
237-
238-
func defaultTLSServerOpts(logger log.Logger, cert, key, clientCA string) (*tls.Config, error) {
239-
if key == "" && cert == "" {
240-
if clientCA != "" {
241-
return nil, errors.New("when a client CA is used a server key and certificate must also be provided")
242-
}
243-
244-
level.Info(logger).Log("msg", "disabled TLS, key and cert must be set to enable")
245-
return nil, nil
246-
}
247-
248-
level.Info(logger).Log("msg", "enabling server side TLS")
249-
250-
if key == "" || cert == "" {
251-
return nil, errors.New("both server key and certificate must be provided")
252-
}
253-
254-
tlsCfg := &tls.Config{
255-
MinVersion: tls.VersionTLS12,
256-
}
257-
258-
tlsCert, err := tls.LoadX509KeyPair(cert, key)
259-
if err != nil {
260-
return nil, errors.Wrap(err, "server credentials")
261-
}
262-
263-
tlsCfg.Certificates = []tls.Certificate{tlsCert}
264-
265-
if clientCA != "" {
266-
caPEM, err := ioutil.ReadFile(clientCA)
267-
if err != nil {
268-
return nil, errors.Wrap(err, "reading client CA")
269-
}
270-
271-
certPool := x509.NewCertPool()
272-
if !certPool.AppendCertsFromPEM(caPEM) {
273-
return nil, errors.Wrap(err, "building client CA")
274-
}
275-
tlsCfg.ClientCAs = certPool
276-
tlsCfg.ClientAuth = tls.RequireAndVerifyClientCert
277-
278-
level.Info(logger).Log("msg", "server TLS client verification enabled")
279-
}
280-
281-
return tlsCfg, nil
282-
}
283-
284-
func defaultTLSClientOpts(logger log.Logger, cert, key, caCert, serverName string) (*tls.Config, error) {
285-
var certPool *x509.CertPool
286-
if caCert != "" {
287-
caPEM, err := ioutil.ReadFile(caCert)
288-
if err != nil {
289-
return nil, errors.Wrap(err, "reading client CA")
290-
}
291-
292-
certPool = x509.NewCertPool()
293-
if !certPool.AppendCertsFromPEM(caPEM) {
294-
return nil, errors.Wrap(err, "building client CA")
295-
}
296-
level.Info(logger).Log("msg", "TLS client using provided certificate pool")
297-
} else {
298-
var err error
299-
certPool, err = x509.SystemCertPool()
300-
if err != nil {
301-
return nil, errors.Wrap(err, "reading system certificate pool")
302-
}
303-
level.Info(logger).Log("msg", "TLS client using system certificate pool")
304-
}
305-
306-
tlsCfg := &tls.Config{
307-
RootCAs: certPool,
308-
}
309-
310-
if serverName != "" {
311-
tlsCfg.ServerName = serverName
312-
}
313-
314-
if (key != "") != (cert != "") {
315-
return nil, errors.New("both client key and certificate must be provided")
316-
}
317-
318-
if cert != "" {
319-
cert, err := tls.LoadX509KeyPair(cert, key)
320-
if err != nil {
321-
return nil, errors.Wrap(err, "client credentials")
322-
}
323-
tlsCfg.Certificates = []tls.Certificate{cert}
324-
level.Info(logger).Log("msg", "TLS client authentication enabled")
325-
}
326-
return tlsCfg, nil
327-
}
328-
329-
func newStoreGRPCServer(logger log.Logger, reg prometheus.Registerer, tracer opentracing.Tracer, srv storepb.StoreServer, opts []grpc.ServerOption) *grpc.Server {
330-
met := grpc_prometheus.NewServerMetrics()
331-
met.EnableHandlingTimeHistogram(
332-
grpc_prometheus.WithHistogramBuckets([]float64{
333-
0.001, 0.01, 0.05, 0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4,
334-
}),
335-
)
336-
panicsTotal := prometheus.NewCounter(prometheus.CounterOpts{
337-
Name: "thanos_grpc_req_panics_recovered_total",
338-
Help: "Total number of gRPC requests recovered from internal panic.",
339-
})
340-
reg.MustRegister(met, panicsTotal)
341-
342-
grpcPanicRecoveryHandler := func(p interface{}) (err error) {
343-
panicsTotal.Inc()
344-
level.Error(logger).Log("msg", "recovered from panic", "panic", p, "stack", debug.Stack())
345-
return status.Errorf(codes.Internal, "%s", p)
346-
}
347-
opts = append(opts,
348-
grpc.MaxSendMsgSize(math.MaxInt32),
349-
grpc_middleware.WithUnaryServerChain(
350-
met.UnaryServerInterceptor(),
351-
tracing.UnaryServerInterceptor(tracer),
352-
grpc_recovery.UnaryServerInterceptor(grpc_recovery.WithRecoveryHandler(grpcPanicRecoveryHandler)),
353-
),
354-
grpc_middleware.WithStreamServerChain(
355-
met.StreamServerInterceptor(),
356-
tracing.StreamServerInterceptor(tracer),
357-
grpc_recovery.StreamServerInterceptor(grpc_recovery.WithRecoveryHandler(grpcPanicRecoveryHandler)),
358-
),
359-
)
360-
361-
s := grpc.NewServer(opts...)
362-
storepb.RegisterStoreServer(s, srv)
363-
met.InitializeMetrics(s)
364-
365-
return s
366-
}

0 commit comments

Comments
 (0)