From fa9d48d1e7a26723c6d1ec3849d29aeec3cd9162 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 15 Aug 2024 15:36:25 +0100 Subject: [PATCH 1/2] Refactor basic Prometheus metrics handling Refacor the basic Prometheus metrics handling to: - Allow a split between web and metrics endpoints, so general requests to the /alive and /healthz metrics in the metrics endpoint don't skew the metrics for the web endpoint. - Add support for response time summary and quantiles on a per-service basis, rather than per-method/path/status. - Add support for response size metrics. - Add support for counting in-flight metrics. --- internal/serve/metrics/main.go | 2 +- internal/serve/middleware/metrics.go | 2 +- internal/serve/middleware/prometheus.go | 169 +++++++++++------------- internal/serve/web/main.go | 2 +- 4 files changed, 81 insertions(+), 94 deletions(-) diff --git a/internal/serve/metrics/main.go b/internal/serve/metrics/main.go index 0e17b1d..deadeb8 100644 --- a/internal/serve/metrics/main.go +++ b/internal/serve/metrics/main.go @@ -28,7 +28,7 @@ func NewService() *Service { router := gin.New() router.Use(middleware.Logger()) - router.Use(middleware.Prometheus()) + router.Use(middleware.Prometheus("metrics")) router.Use(gin.Recovery()) proxies := viper.GetStringSlice("endpoints.proxies") diff --git a/internal/serve/middleware/metrics.go b/internal/serve/middleware/metrics.go index 774062b..2643f52 100644 --- a/internal/serve/middleware/metrics.go +++ b/internal/serve/middleware/metrics.go @@ -26,7 +26,7 @@ func NewMetrics(namespace string) *Metrics { Name: "request_duration_seconds", Help: "The latency of the HTTP requests.", //nolint:mnd // these are the building blocks for buckets - Buckets: prometheus.ExponentialBuckets(0.005, 2, 10), + Buckets: prometheus.ExponentialBuckets(0.0005, 2, 12), }, []string{"service", "handler", "method", "path", "status"}, ), diff --git a/internal/serve/middleware/prometheus.go b/internal/serve/middleware/prometheus.go index 8a94605..7ae0853 100644 --- a/internal/serve/middleware/prometheus.go +++ b/internal/serve/middleware/prometheus.go @@ -2,6 +2,8 @@ package middleware import ( "fmt" + "strings" + "time" "github.com/gin-gonic/gin" "github.com/prometheus/client_golang/prometheus" @@ -9,102 +11,87 @@ import ( ) var ( - PrometheusDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{ - Name: "http_duration_seconds", - Help: "Duration of HTTP requests.", - }, []string{"path"}) - - PrometheusCounter = promauto.NewCounterVec(prometheus.CounterOpts{ - Name: "http_requests_total", - Help: "Count of HTTP requests.", - }, []string{"path", "status"}) + summary = promauto.NewSummaryVec(prometheus.SummaryOpts{ + Subsystem: "http", + Name: "response_seconds", + Help: "Duration of HTTP requests.", + //nolint:mnd // ignore + MaxAge: 15 * time.Second, + //nolint:mnd // ignore + Objectives: map[float64]float64{0.25: 0.01, 0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, + }, []string{"service"}) + + duration = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Subsystem: "http", + Name: "response_endpoints_seconds", + Help: "Duration of HTTP requests.", + //nolint:mnd // ignore + Buckets: prometheus.ExponentialBucketsRange(0.00001, 2, 15), + }, []string{"service", "method", "path", "status"}) + + requests = promauto.NewCounterVec(prometheus.CounterOpts{ + Subsystem: "http", + Name: "request_total", + Help: "Count of HTTP requests.", + }, []string{"service", "method", "path", "status"}) + + requestSize = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Subsystem: "http", + Name: "request_size_bytes", + Help: "Size of the HTTP requests.", + //nolint:mnd // ignore + Buckets: prometheus.ExponentialBuckets(64, 2, 10), + }, []string{"service", "method", "path", "status"}) + + responseSize = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Subsystem: "http", + Name: "response_size_bytes", + Help: "Size of the HTTP responses.", + //nolint:mnd // ignore + Buckets: prometheus.ExponentialBuckets(2, 2, 16), + }, []string{"service", "method", "path", "status"}) + + active = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Subsystem: "http", + Name: "request_open", + Help: "Number of requests being actively handled.", + }, []string{"service"}) ) -// PrometheusMiddleware provides instrumentation for the API calls made to a -// connected service, counting both the number of requests being processed, the -// number requested in total, and the time taken to process those requests. -func Prometheus() gin.HandlerFunc { +// Prometheus provides instrumentation for the API calls made to a connected +// service, counting both the number of requests being processed, the number +// requested in total, and the time taken to process those requests. +func Prometheus(service string) gin.HandlerFunc { return func(c *gin.Context) { - var timer *prometheus.Timer + method := strings.ToUpper(c.Request.Method) + path := c.FullPath() - if c.FullPath() != "" { - timer = prometheus.NewTimer( - PrometheusDuration.WithLabelValues(c.FullPath()), - ) - } + active.WithLabelValues(service).Inc() + defer active.WithLabelValues(service).Dec() - c.Next() + timer := time.Now() + defer func(c *gin.Context, t time.Time) { + taken := time.Since(t).Seconds() - if timer != nil { - timer.ObserveDuration() - PrometheusCounter.WithLabelValues(c.FullPath(), fmt.Sprintf("%d", c.Writer.Status())).Inc() - } + status := fmt.Sprintf("%d", c.Writer.Status()) + if status == "0" { + status = "200" + } + + responseBytes := float64(c.Writer.Size()) + + requestBytes := float64(c.Request.ContentLength) + if requestBytes < 0 { + requestBytes = 0 + } + + requests.WithLabelValues(service, method, path, status).Inc() + duration.WithLabelValues(service, method, path, status).Observe(taken) + summary.WithLabelValues(service).Observe(taken) + requestSize.WithLabelValues(service, method, path, status).Observe(requestBytes) + responseSize.WithLabelValues(service, method, path, status).Observe(responseBytes) + }(c, timer) + + c.Next() } } - -// Measure abstracts the HTTP handler implementation by only requesting a reporter, this -// reporter will return the required data to be measured. -// it accepts a next function that will be called as the wrapped logic before and after -// measurement actions. -// func (m Middleware) Measure(handlerID string, reporter Reporter, next func()) { -// ctx := reporter.Context() -// -// // func (r *reporter) Method() string { return r.c.Request.Method } -// // -// // func (r *reporter) Context() context.Context { return r.c.Request.Context() } -// // -// // func (r *reporter) URLPath() string { return r.c.FullPath() } -// // -// // func (r *reporter) StatusCode() int { return r.c.Writer.Status() } -// // -// // func (r *reporter) BytesWritten() int64 { return int64(r.c.Writer.Size()) } -// -// // If there isn't predefined handler ID we -// // set that ID as the URL path. -// hid := handlerID -// if handlerID == "" { -// hid = reporter.URLPath() -// } -// -// // Measure inflights if required. -// if !m.cfg.DisableMeasureInflight { -// props := metrics.HTTPProperties{ -// Service: m.cfg.Service, -// ID: hid, -// } -// m.cfg.Recorder.AddInflightRequests(ctx, props, 1) -// defer m.cfg.Recorder.AddInflightRequests(ctx, props, -1) -// } -// -// // Start the timer and when finishing measure the duration. -// start := time.Now() -// defer func() { -// duration := time.Since(start) -// -// // If we need to group the status code, it uses the -// // first number of the status code because is the least -// // required identification way. -// var code string -// if m.cfg.GroupedStatus { -// code = fmt.Sprintf("%dxx", reporter.StatusCode()/100) -// } else { -// code = strconv.Itoa(reporter.StatusCode()) -// } -// -// props := metrics.HTTPReqProperties{ -// Service: m.cfg.Service, -// ID: hid, -// Method: reporter.Method(), -// Code: code, -// } -// m.cfg.Recorder.ObserveHTTPRequestDuration(ctx, props, duration) -// -// // Measure size of response if required. -// if !m.cfg.DisableMeasureSize { -// m.cfg.Recorder.ObserveHTTPResponseSize(ctx, props, reporter.BytesWritten()) -// } -// }() -// -// // Call the wrapped logic. -// next() -// }. diff --git a/internal/serve/web/main.go b/internal/serve/web/main.go index c215152..e2ae5b4 100644 --- a/internal/serve/web/main.go +++ b/internal/serve/web/main.go @@ -25,7 +25,7 @@ func NewService() *Service { router := gin.New() router.Use(middleware.Logger()) - router.Use(middleware.Prometheus()) + router.Use(middleware.Prometheus("web")) router.Use(gin.Recovery()) proxies := viper.GetStringSlice("endpoints.proxies") From d7aa62ed38df80444b77db2bd98d87b10d062461 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Thu, 15 Aug 2024 17:15:47 +0100 Subject: [PATCH 2/2] Disable metrics endpoint access logs by default Switch off the metrics endpoint access logs as it's unlikely they're going to be of significant value, but allow them to be re-enabled by the command-line or configuration file. --- config/serve.yaml | 1 + internal/cmd/serve.go | 4 ++++ internal/serve/metrics/main.go | 5 ++++- internal/serve/middleware/logger.go | 5 ----- schemas/serve.json | 11 ++++++++++- 5 files changed, 19 insertions(+), 7 deletions(-) diff --git a/config/serve.yaml b/config/serve.yaml index 4b1aa31..b30579a 100644 --- a/config/serve.yaml +++ b/config/serve.yaml @@ -11,4 +11,5 @@ endpoints: logging: json: true + metrics: false level: debug diff --git a/internal/cmd/serve.go b/internal/cmd/serve.go index d60d495..029bc0d 100644 --- a/internal/cmd/serve.go +++ b/internal/cmd/serve.go @@ -124,6 +124,10 @@ func init() { flags.Int("shutdown-timeout", shutdownTimeout, "Timeout (in seconds) to wait for requests to finish") _ = viper.BindPFlag("endpoints.timeouts.shutdown", flags.Lookup("shutdown-timeout")) + viper.SetDefault("logging.metrics", false) + flags.Bool("log-metrics", false, "Set whether to log metrics port requests") + _ = viper.BindPFlag("logging.metrics", flags.Lookup("log-metrics")) + rootCmd.AddCommand(serveCmd) } diff --git a/internal/serve/metrics/main.go b/internal/serve/metrics/main.go index deadeb8..c223f59 100644 --- a/internal/serve/metrics/main.go +++ b/internal/serve/metrics/main.go @@ -27,7 +27,10 @@ type Service struct { func NewService() *Service { router := gin.New() - router.Use(middleware.Logger()) + if viper.GetBool("logging.metrics") { + router.Use(middleware.Logger()) + } + router.Use(middleware.Prometheus("metrics")) router.Use(gin.Recovery()) diff --git a/internal/serve/middleware/logger.go b/internal/serve/middleware/logger.go index bc4e65a..56207f8 100644 --- a/internal/serve/middleware/logger.go +++ b/internal/serve/middleware/logger.go @@ -21,11 +21,6 @@ func Logger() gin.HandlerFunc { http.StatusUnauthorized, http.StatusNotFound, ), - // slogg.IgnorePath( - // "/alive", - // "/healthz", - // "/metrics", - // ), }, }, ) diff --git a/schemas/serve.json b/schemas/serve.json index 6eb88d8..4fdb12b 100644 --- a/schemas/serve.json +++ b/schemas/serve.json @@ -171,6 +171,9 @@ "level": { "$ref": "#/$defs/logging-level" }, + "metrics": { + "$ref": "#/$defs/logging-metrics" + }, "json": { "$ref": "#/$defs/logging-json" } @@ -181,9 +184,15 @@ "type": "string", "enum": ["debug", "info", "warning", "error"] }, + "logging-metrics": { + "description": "Set whether or not to log requests to the metrics port", + "type": "boolean", + "default": false + }, "logging-json": { "description": "Set whether or not to use JSON-based structured logging", - "type": "boolean" + "type": "boolean", + "default": false } }, "type": "object",