From c4a4b233003dcf59b801a3a7a33e06ac79f8900a Mon Sep 17 00:00:00 2001
From: Jonathan Wright
Date: Fri, 16 Aug 2024 22:54:25 +0100
Subject: [PATCH 1/2] Prepare for clustering with improved name support

Prepare for the setup of clustering by adding configuration support for
setting the cluster name. This will be used for logging and for metrics,
providing a differentiator.

Additionally, rename the endpoint label to component in order to avoid
conflicts with pre-defined Prometheus labels, which cause endpoint to be
renamed to exported_endpoint, just as happens with service.
---
 charts/dashboard/templates/deployment.yaml   |  1 +
 charts/dashboard/templates/stateful-set.yaml |  1 +
 config/serve.yaml                            |  3 ++
 internal/cmd/serve.go                        |  8 +++++
 internal/serve/metrics/main.go               | 12 ++++---
 internal/serve/middleware/prometheus.go      | 36 ++++++++++++--------
 internal/serve/web/main.go                   | 12 ++++---
 schemas/serve.json                           | 21 ++++++++++++
 8 files changed, 69 insertions(+), 25 deletions(-)

diff --git a/charts/dashboard/templates/deployment.yaml b/charts/dashboard/templates/deployment.yaml
index f17c7e9..46c4821 100644
--- a/charts/dashboard/templates/deployment.yaml
+++ b/charts/dashboard/templates/deployment.yaml
@@ -60,6 +60,7 @@ spec:
             - --log-level={{ .Values.pod.logging.level }}
             - --log-json={{ if .Values.pod.logging.json }}true{{ else }}false{{ end }}
             - --address=0.0.0.0
+            - --cluster-name={{ include "dashboard.fullname" . }}
             - --web-port={{ .Values.service.webPort }}
             - --metrics-port={{ .Values.service.metricsPort }}
             {{- range .Values.pod.extraArgs }}
diff --git a/charts/dashboard/templates/stateful-set.yaml b/charts/dashboard/templates/stateful-set.yaml
index 406a511..19a15b7 100644
--- a/charts/dashboard/templates/stateful-set.yaml
+++ b/charts/dashboard/templates/stateful-set.yaml
@@ -70,6 +70,7 @@ spec:
             - --log-level={{ .Values.pod.logging.level }}
             - --log-json={{ if .Values.pod.logging.json }}true{{ else }}false{{ end }}
             - --address=0.0.0.0
+            - --cluster-name={{ include "dashboard.fullname" . }}
             - --web-port={{ .Values.service.webPort }}
             - --metrics-port={{ .Values.service.metricsPort }}
             {{- range .Values.pod.extraArgs }}
diff --git a/config/serve.yaml b/config/serve.yaml
index b30579a..881b83a 100644
--- a/config/serve.yaml
+++ b/config/serve.yaml
@@ -1,4 +1,7 @@
 ---
+cluster:
+  name: dashboard
+
 endpoints:
   bind:
     address: 0.0.0.0
diff --git a/internal/cmd/serve.go b/internal/cmd/serve.go
index 029bc0d..70ca7f8 100644
--- a/internal/cmd/serve.go
+++ b/internal/cmd/serve.go
@@ -27,6 +27,10 @@ const (
 )
 
 var (
+	// name is the default name for the cluster when one or more dashboard
+	// instances operate together.
+	name = "dashboard"
+
 	// host is the hostname or IPv4/IPv6 address to bind the service to on
 	// startup.
host = "localhost" @@ -128,6 +132,10 @@ func init() { flags.Bool("log-metrics", false, "Set whether to log metrics port requests") _ = viper.BindPFlag("logging.metrics", flags.Lookup("log-metrics")) + viper.SetDefault("cluster.name", name) + flags.StringP("cluster-name", "c", name, "The name of the cluster") + _ = viper.BindPFlag("cluster.name", flags.Lookup("cluster-name")) + rootCmd.AddCommand(serveCmd) } diff --git a/internal/serve/metrics/main.go b/internal/serve/metrics/main.go index c223f59..147042b 100644 --- a/internal/serve/metrics/main.go +++ b/internal/serve/metrics/main.go @@ -27,11 +27,15 @@ type Service struct { func NewService() *Service { router := gin.New() + name := viper.GetString("cluster.name") + address := viper.GetString("endpoints.bind.address") + port := viper.GetString("endpoints.bind.port.metrics") + if viper.GetBool("logging.metrics") { router.Use(middleware.Logger()) } - router.Use(middleware.Prometheus("metrics")) + router.Use(middleware.Prometheus(name, "metrics")) router.Use(gin.Recovery()) proxies := viper.GetStringSlice("endpoints.proxies") @@ -48,9 +52,6 @@ func NewService() *Service { } } - address := viper.GetString("endpoints.bind.address") - port := viper.GetString("endpoints.bind.port.metrics") - service := &Service{ router: router, server: &http.Server{ @@ -66,7 +67,8 @@ func NewService() *Service { shuttingDown: false, attr: slog.Group( - "server", + "cluster", + slog.String("name", name), slog.String("service", "metrics"), slog.String("address", address), slog.String("port", port), diff --git a/internal/serve/middleware/prometheus.go b/internal/serve/middleware/prometheus.go index 1f6dcea..09d7455 100644 --- a/internal/serve/middleware/prometheus.go +++ b/internal/serve/middleware/prometheus.go @@ -19,49 +19,55 @@ var ( MaxAge: 15 * time.Second, //nolint:mnd // ignore Objectives: map[float64]float64{0.25: 0.01, 0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, - }, []string{"endpoint"}) + }, []string{"cluster", "component"}) duration = promauto.NewHistogramVec(prometheus.HistogramOpts{ Subsystem: "http", Name: "response_endpoints_seconds", Help: "Duration of HTTP requests.", //nolint:mnd // ignore + NativeHistogramMinResetDuration: 15 * time.Second, + //nolint:mnd // ignore Buckets: prometheus.ExponentialBucketsRange(0.00001, 2, 15), - }, []string{"endpoint", "method", "path", "status"}) + }, []string{"cluster", "component", "method", "path", "status"}) requests = promauto.NewCounterVec(prometheus.CounterOpts{ Subsystem: "http", Name: "request_total", Help: "Count of HTTP requests.", - }, []string{"endpoint", "method", "path", "status"}) + }, []string{"cluster", "component", "method", "path", "status"}) requestSize = promauto.NewHistogramVec(prometheus.HistogramOpts{ Subsystem: "http", Name: "request_size_bytes", Help: "Size of the HTTP requests.", //nolint:mnd // ignore + NativeHistogramMinResetDuration: 15 * time.Second, + //nolint:mnd // ignore Buckets: prometheus.ExponentialBuckets(64, 2, 10), - }, []string{"endpoint", "method", "path", "status"}) + }, []string{"cluster", "component", "method", "path", "status"}) responseSize = promauto.NewHistogramVec(prometheus.HistogramOpts{ Subsystem: "http", Name: "response_size_bytes", Help: "Size of the HTTP responses.", //nolint:mnd // ignore + NativeHistogramMinResetDuration: 15 * time.Second, + //nolint:mnd // ignore Buckets: prometheus.ExponentialBuckets(2, 2, 16), - }, []string{"endpoint", "method", "path", "status"}) + }, []string{"cluster", "component", "method", "path", "status"}) active = 
promauto.NewGaugeVec(prometheus.GaugeOpts{
 		Subsystem: "http",
 		Name:      "request_open",
 		Help:      "Number of requests being actively handled.",
-	}, []string{"endpoint"})
+	}, []string{"cluster", "component"})
 )
 
 // Prometheus provides instrumentation for the API calls made to a connected
-// endpoint, counting both the number of requests being processed, the number
+// component, counting the number of requests being processed, the number
 // requested in total, and the time taken to process those requests.
-func Prometheus(endpoint string) gin.HandlerFunc {
+func Prometheus(cluster, component string) gin.HandlerFunc {
 	return func(c *gin.Context) {
 		method := strings.ToUpper(c.Request.Method)
 
@@ -70,8 +76,8 @@ func Prometheus(endpoint string) gin.HandlerFunc {
 			path = "404"
 		}
 
-		active.WithLabelValues(endpoint).Inc()
-		defer active.WithLabelValues(endpoint).Dec()
+		active.WithLabelValues(cluster, component).Inc()
+		defer active.WithLabelValues(cluster, component).Dec()
 
 		timer := time.Now()
 		defer func(c *gin.Context, t time.Time) {
@@ -89,11 +95,11 @@ func Prometheus(endpoint string) gin.HandlerFunc {
 				requestBytes = 0
 			}
 
-			requests.WithLabelValues(endpoint, method, path, status).Inc()
-			duration.WithLabelValues(endpoint, method, path, status).Observe(taken)
-			summary.WithLabelValues(endpoint).Observe(taken)
-			requestSize.WithLabelValues(endpoint, method, path, status).Observe(requestBytes)
-			responseSize.WithLabelValues(endpoint, method, path, status).Observe(responseBytes)
+			requests.WithLabelValues(cluster, component, method, path, status).Inc()
+			duration.WithLabelValues(cluster, component, method, path, status).Observe(taken)
+			summary.WithLabelValues(cluster, component).Observe(taken)
+			requestSize.WithLabelValues(cluster, component, method, path, status).Observe(requestBytes)
+			responseSize.WithLabelValues(cluster, component, method, path, status).Observe(responseBytes)
 		}(c, timer)
 
 		c.Next()
diff --git a/internal/serve/web/main.go b/internal/serve/web/main.go
index e2ae5b4..fa48c81 100644
--- a/internal/serve/web/main.go
+++ b/internal/serve/web/main.go
@@ -24,8 +24,12 @@ type Service struct {
 func NewService() *Service {
 	router := gin.New()
 
+	name := viper.GetString("cluster.name")
+	address := viper.GetString("endpoints.bind.address")
+	port := viper.GetString("endpoints.bind.port.web")
+
 	router.Use(middleware.Logger())
-	router.Use(middleware.Prometheus("web"))
+	router.Use(middleware.Prometheus(name, "web"))
 	router.Use(gin.Recovery())
 
 	proxies := viper.GetStringSlice("endpoints.proxies")
@@ -42,9 +46,6 @@ func NewService() *Service {
 		}
 	}
 
-	address := viper.GetString("endpoints.bind.address")
-	port := viper.GetString("endpoints.bind.port.web")
-
 	service := &Service{
 		router: router,
 		server: &http.Server{
@@ -58,7 +59,8 @@ func NewService() *Service {
 		},
 
 		attr: slog.Group(
-			"server",
+			"cluster",
+			slog.String("name", name),
 			slog.String("service", "web"),
 			slog.String("address", address),
 			slog.String("port", port),
diff --git a/schemas/serve.json b/schemas/serve.json
index 4fdb12b..dd5aa4b 100644
--- a/schemas/serve.json
+++ b/schemas/serve.json
@@ -4,6 +4,24 @@
   "title": "dashboard serve Configuration File",
   "description": "The configuration file for the dashboard application when running as a service",
   "$defs": {
+    "cluster": {
+      "title": "Cluster Configuration",
+      "description": "The configuration for the cluster",
+      "type": "object",
+      "additionalProperties": false,
+      "properties": {
+        "name": {
+          "$ref": "#/$defs/cluster-name"
+        }
+      }
+    },
+    "cluster-name": {
+      "title": "Cluster Name",
"description": "The name of the cluster when connecting one or more dashboard instances", + "type": "string", + "format": "hostname", + "examples": ["dashboard", "dashboard.n3t.uk"] + }, "endpoints": { "title": "Application Endpoints Configuration", "description": "The configuration for the application endpoints", @@ -198,6 +216,9 @@ "type": "object", "additionalProperties": false, "properties": { + "cluster": { + "$ref": "#/$defs/cluster" + }, "endpoints": { "$ref": "#/$defs/endpoints" }, From 228071aff407282df5a5eb46712f222cbe6eb898 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 21 Aug 2024 22:54:42 +0100 Subject: [PATCH 2/2] Remove all references and configuration for internal store Remove all references, and all configuration, including the StatefulSet support, for managing state and storage for the application as the app will be going in another direction with nats.io and JetStream potentially. --- charts/dashboard/README.md | 4 - charts/dashboard/templates/deployment.yaml | 2 - charts/dashboard/templates/stateful-set.yaml | 157 ------------------- charts/dashboard/values.yaml | 13 -- internal/cmd/serve.go | 4 +- internal/logger/main.go | 2 + internal/serve/metrics/healthz/main.go | 75 +++++++-- internal/serve/metrics/main.go | 31 +++- internal/serve/middleware/metrics.go | 90 ----------- internal/serve/web/main.go | 14 +- 10 files changed, 99 insertions(+), 293 deletions(-) delete mode 100644 charts/dashboard/templates/stateful-set.yaml delete mode 100644 internal/serve/middleware/metrics.go diff --git a/charts/dashboard/README.md b/charts/dashboard/README.md index 4fe835a..2172b00 100644 --- a/charts/dashboard/README.md +++ b/charts/dashboard/README.md @@ -51,10 +51,6 @@ configure it through the `values.yaml` file. | deployment.revisionHistoryLimit | int | `10` | Set the number of deployments which should be kept to enable a rollback of the deployment in the event of any issues or failures | | deployment.annotations | object | `{}` | Set any additional annotations which should be added to the Deployment resource | | deployment.labels | object | `{}` | Set any additional labels which should be added to the Deployment resource | -| persistentVolumeClaim.create | bool | `false` | Set whether or not to create a PersistentVolumeClaim resource for the dashboard service and attach it to the Pods | -| persistentVolumeClaim.storageClassName | string | `nil` | Set the name of the StorageClass to use for the volumes in the PersistentVolumeClaim | -| persistentVolumeClaim.size | string | `"32Gi"` | Set the size of each PersistentVolumeClaim to be created | -| persistentVolumeClaim.accessModes | list | `["ReadWriteOnce"]` | Configure the access modes to be set on the PersistentVolumeClaim | | pod.image.repository | string | `"ghcr.io/n3tuk/dashboard"` | Set the URI for the container image to be deployed for the dashboard Deployment | | pod.image.pullPolicy | string | `"IfNotPresent"` | Set the pull policy for the host running each Pod of the deployment | | pod.annotations | object | `{}` | Set any additional annotations which should be added to the Ingress resource | diff --git a/charts/dashboard/templates/deployment.yaml b/charts/dashboard/templates/deployment.yaml index 46c4821..732e757 100644 --- a/charts/dashboard/templates/deployment.yaml +++ b/charts/dashboard/templates/deployment.yaml @@ -1,5 +1,4 @@ --- -{{- if (not .Values.persistentVolumeClaim.create) }} apiVersion: apps/v1 kind: Deployment metadata: @@ -141,4 +140,3 @@ spec: topologySpreadConstraints: {{- toYaml . 
| nindent 8 }} {{- end }} -{{- end }} diff --git a/charts/dashboard/templates/stateful-set.yaml b/charts/dashboard/templates/stateful-set.yaml deleted file mode 100644 index 19a15b7..0000000 --- a/charts/dashboard/templates/stateful-set.yaml +++ /dev/null @@ -1,157 +0,0 @@ ---- -{{- if .Values.persistentVolumeClaim.create }} -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: {{ include "dashboard.fullname" . }} - namespace: {{ .Release.Namespace }} - {{- with .Values.deployment.annotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} - labels: - {{- include "dashboard.labels" . | nindent 4 }} - {{- with .Values.deployment.labels }} - {{- toYaml . | nindent 4 }} - {{- end }} -spec: - replicas: {{ .Values.deployment.replicaCount }} - revisionHistoryLimit: {{ .Values.deployment.revisionHistoryLimit }} - selector: - matchLabels: - {{- include "dashboard.selectorLabels" . | nindent 6 }} - serviceName: {{ include "dashboard.fullname" . }} - updateStrategy: - type: RollingUpdate - volumeClaimTemplates: - - metadata: - name: {{ include "dashboard.fullname" . }}-vault - spec: - storageClassName: {{ .Values.persistentVolumeClaim.storageClassName }} - {{- with .Values.persistentVolumeClaim.accessModes }} - accessModes: - {{- toYaml . | nindent 10 }} - {{- end }} - resources: - requests: - storage: {{ .Values.persistentVolumeClaim.size }} - template: - metadata: - {{- with .Values.pod.annotations }} - annotations: - {{- toYaml . | nindent 8 }} - {{- end }} - labels: - {{- include "dashboard.labels" . | nindent 8 }} - {{- with .Values.pod.labels }} - {{- toYaml . | nindent 8 }} - {{- end }} - spec: - terminationGracePeriodSeconds: 45 - {{- if .Values.serviceAccount.create }} - serviceAccountName: {{ include "dashboard.serviceAccountName" . }} - {{- end }} - {{- with .Values.imagePullSecrets }} - imagePullSecrets: - {{- toYaml . | nindent 8 }} - {{- end }} - securityContext: - seccompProfile: - type: RuntimeDefault - runAsUser: 65534 - runAsGroup: 65534 - fsGroup: 65534 - containers: - - name: {{ .Chart.Name | quote }} - image: "{{ .Values.pod.image.repository }}:{{ .Values.pod.image.tag | default (printf "v%s" .Chart.AppVersion) }}" - imagePullPolicy: {{ .Values.pod.image.pullPolicy }} - args: - - serve - - --log-level={{ .Values.pod.logging.level }} - - --log-json={{ if .Values.pod.logging.json }}true{{ else }}false{{ end }} - - --address=0.0.0.0 - - --cluster-name={{ include "dashboard.fullname" . }} - - --web-port={{ .Values.service.webPort }} - - --metrics-port={{ .Values.service.metricsPort }} - {{- range .Values.pod.extraArgs }} - - {{ . }} - {{- end }} - volumeMounts: - - name: {{ include "dashboard.fullname" . 
}}-vault - mountPath: /data/vault - ports: - - containerPort: {{ .Values.service.webPort }} - name: web - - containerPort: {{ .Values.service.metricsPort }} - name: metrics - securityContext: - runAsNonRoot: true - allowPrivilegeEscalation: false - readOnlyRootFilesystem: true - capabilities: - drop: - - ALL - add: - - NET_BIND_SERVICE - runAsUser: 65534 - runAsGroup: 65534 - {{- if .Values.pod.probes.startup.create }} - startupProbe: - httpGet: - path: /alive - port: metrics - {{- with .Values.pod.probes.startup }} - periodSeconds: {{ .periodSeconds }} - initialDelaySeconds: {{ .initialDelaySeconds }} - timeoutSeconds: {{ .timeoutSeconds }} - successThreshold: {{ .successThreshold }} - failureThreshold: {{ .failureThreshold }} - {{- end }} - {{- end }} - {{- if .Values.pod.probes.liveness.create }} - livenessProbe: - httpGet: - path: /alive - port: metrics - {{- with .Values.pod.probes.liveness }} - periodSeconds: {{ .periodSeconds }} - initialDelaySeconds: {{ .initialDelaySeconds }} - timeoutSeconds: {{ .timeoutSeconds }} - successThreshold: {{ .successThreshold }} - failureThreshold: {{ .failureThreshold }} - {{- end }} - {{- end }} - {{- if .Values.pod.probes.readiness.create }} - readinessProbe: - httpGet: - path: /healthz - port: metrics - {{- with .Values.pod.probes.readiness }} - periodSeconds: {{ .periodSeconds }} - initialDelaySeconds: {{ .initialDelaySeconds }} - timeoutSeconds: {{ .timeoutSeconds }} - successThreshold: {{ .successThreshold }} - failureThreshold: {{ .failureThreshold }} - {{- end }} - {{- end }} - {{- with .Values.pod.resources }} - resources: -{{ toYaml . | indent 12 }} - {{- end }} - {{- with .Values.pod.nodeSelector }} - nodeSelector: - {{- toYaml . | indent 8 }} - {{- end }} - {{- with .Values.pod.tolerations }} - tolerations: - {{- toYaml . | indent 8 }} - {{- end }} - {{- with .Values.pod.affinity }} - affinity: - {{- toYaml . | indent 8 }} - {{- end }} - {{- with .Values.pod.topologySpreadConstraints }} - topologySpreadConstraints: - {{- toYaml . 
| nindent 8 }}
-      {{- end }}
-{{- end }}
diff --git a/charts/dashboard/values.yaml b/charts/dashboard/values.yaml
index c74c2be..fc24b75 100644
--- a/charts/dashboard/values.yaml
+++ b/charts/dashboard/values.yaml
@@ -100,19 +100,6 @@ deployment:
   # resource
   labels: {}
 
-persistentVolumeClaim:
-  # -- Set whether or not to create a PersistentVolumeClaim resource for the
-  # dashboard service and attach it to the Pods
-  create: false
-  # -- (string) Set the name of the StorageClass to use for the volumes in the
-  # PersistentVolumeClaim
-  storageClassName:
-  # -- Set the size of each PersistentVolumeClaim to be created
-  size: 32Gi
-  # -- Configure the access modes to be set on the PersistentVolumeClaim
-  accessModes:
-    - ReadWriteOnce
-
 pod:
   image:
     # -- Set the URI for the container image to be deployed for the dashboard
diff --git a/internal/cmd/serve.go b/internal/cmd/serve.go
index 70ca7f8..5b44f9c 100644
--- a/internal/cmd/serve.go
+++ b/internal/cmd/serve.go
@@ -133,7 +133,7 @@ func init() {
 	_ = viper.BindPFlag("logging.metrics", flags.Lookup("log-metrics"))
 
 	viper.SetDefault("cluster.name", name)
-	flags.StringP("cluster-name", "c", name, "The name of the cluster")
+	flags.StringP("cluster-name", "n", name, "The name of the cluster")
 	_ = viper.BindPFlag("cluster.name", flags.Lookup("cluster-name"))
 
 	rootCmd.AddCommand(serveCmd)
@@ -175,7 +175,7 @@ func runServe(_ *cobra.Command, _ []string) error {
 	// Start the web service first as the metrics service will report the health
 	// of the service, so we should be ready to receive requests before the
 	// service is reporting as healthy
-	go w.Start(e)
+	go w.Start(e, m.SetWebHealth)
 	go m.Start(e)
 
 	// Restore default behaviour on the interrupt signal and notify user of shutdown.
diff --git a/internal/logger/main.go b/internal/logger/main.go
index 8e35136..e0cff62 100644
--- a/internal/logger/main.go
+++ b/internal/logger/main.go
@@ -47,6 +47,8 @@ func Start(attrs *map[string]string) {
 			),
 		)
 	}
+
+	slog.SetLogLoggerLevel(level)
 }
 
 // getLevel retrieves the required logging level from either the defaults, the
diff --git a/internal/serve/metrics/healthz/main.go b/internal/serve/metrics/healthz/main.go
index 559c635..9e8bb87 100644
--- a/internal/serve/metrics/healthz/main.go
+++ b/internal/serve/metrics/healthz/main.go
@@ -9,12 +9,32 @@ import (
 	slogg "github.com/samber/slog-gin"
 )
 
-var shuttingDown *bool
+type Health struct {
+	Web         bool
+	Metrics     bool
+	Terminating bool
+}
+
+const (
+	healthy     = "healthy"
+	unhealthy   = "unhealthy"
+	terminating = "terminating"
+)
+
+var health *Health
+
+func NewHealth() *Health {
+	return &Health{
+		Web:         false,
+		Metrics:     false,
+		Terminating: false,
+	}
+}
 
 // Attach takes a reference to the Gin engine and attaches all the expected
-// endpoints which cam be used by clients through this package.
-func Attach(r *gin.Engine, shutdown *bool) {
-	shuttingDown = shutdown
+// endpoints which can be used by clients through this package.
+func Attach(r *gin.Engine, h *Health) {
+	health = h
 	r.GET("/healthz", healthz)
 }
@@ -24,19 +44,40 @@
 // on their overall status, allowing the service to be marked as unhealthy and
 // to stop processing further requests if there are known issues.
 func healthz(c *gin.Context) {
-	if shuttingDown == nil || *shuttingDown {
-		slogg.AddCustomAttributes(c, slog.Group("healthz", slog.String("status", "not-ok")))
-		c.JSON(http.StatusGone, gin.H{
-			"status":   "shutting-down",
-			"database": "unknown",
-			"queue":    "unknown",
-		})
-	} else {
-		slogg.AddCustomAttributes(c, slog.Group("healthz", slog.String("status", "ok")))
-		c.JSON(http.StatusOK, gin.H{
-			"status":   "healthy",
-			"database": "unknown",
-			"queue":    "unknown",
-		})
+	code := http.StatusOK
+	status := healthy
+	web := healthy
+	metrics := healthy
+
+	if !health.Web {
+		code = http.StatusServiceUnavailable
+		status = unhealthy
+		web = unhealthy
+	}
+
+	if !health.Metrics {
+		code = http.StatusServiceUnavailable
+		status = unhealthy
+		metrics = unhealthy
 	}
+
+	if health.Terminating {
+		code = http.StatusGone
+		status = terminating
+	}
+
+	slogg.AddCustomAttributes(c,
+		slog.Group("healthz",
+			slog.Int("code", code),
+			slog.String("status", status),
+			slog.String("web", web),
+			slog.String("metrics", metrics),
+		),
+	)
+
+	c.JSON(code, gin.H{
+		"status":  status,
+		"web":     web,
+		"metrics": metrics,
+	})
 }
diff --git a/internal/serve/metrics/main.go b/internal/serve/metrics/main.go
index 147042b..4dae248 100644
--- a/internal/serve/metrics/main.go
+++ b/internal/serve/metrics/main.go
@@ -18,12 +18,14 @@ import (
 )
 
 type Service struct {
-	attr         slog.Attr
-	router       *gin.Engine
-	server       *http.Server
-	shuttingDown bool
+	attr   slog.Attr
+	router *gin.Engine
+	server *http.Server
+	health *healthz.Health
 }
 
+var ErrServiceNotConfigured = errors.New("service not configured")
+
 func NewService() *Service {
 	router := gin.New()
 
@@ -64,7 +66,7 @@ func NewService() *Service {
 			Handler: router,
 		},
 
-		shuttingDown: false,
+		health: healthz.NewHealth(),
 
 		attr: slog.Group(
 			"cluster",
@@ -77,7 +79,7 @@ func NewService() *Service {
 
 	Attach(router)
 	alive.Attach(router)
-	healthz.Attach(router, &service.shuttingDown)
+	healthz.Attach(router, service.health)
 
 	// Set up the default 404 handler
 	router.NoRoute(notFound)
@@ -87,17 +89,22 @@
 
 func (s *Service) Start(e chan error) {
 	if s.server == nil {
+		s.health.Metrics = false
 		slog.Error(
 			"Failed to start metrics service",
 			slog.Group("error", slog.String("message", "service not configured")),
 			s.attr,
 		)
+		e <- ErrServiceNotConfigured
+
+		// There is no server to run, so return here rather than fall
+		// through and call ListenAndServe on a nil *http.Server
+		return
 	}
 
 	slog.Info("Starting dashboard metrics service", s.attr)
 
+	s.health.Metrics = true
+
 	err := s.server.ListenAndServe()
 	if err != nil && !errors.Is(err, http.ErrServerClosed) {
+		s.health.Metrics = false
 		slog.Error(
 			"Failed to start metrics service",
 			slog.Group("error", slog.String("message", err.Error())),
@@ -109,7 +116,15 @@ func (s *Service) PrepareShutdown() {
 	slog.Info("Preparing for metrics service for web service shutdown", s.attr)
 
-	s.shuttingDown = true
+	s.health.Terminating = true
+}
+
+func (s *Service) SetWebHealth(status bool) {
+	s.health.Web = status
+}
+
+func (s *Service) SetMetricsHealth(status bool) {
+	s.health.Metrics = status
 }
 
 func (s *Service) Shutdown(timeout time.Duration) error {
@@ -120,6 +135,8 @@ func (s *Service) Shutdown(timeout time.Duration) error {
 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
 	defer cancel()
 
+	s.SetMetricsHealth(false)
+
 	if err := s.server.Shutdown(ctx); err != nil {
 		return err
 	}
diff --git a/internal/serve/middleware/metrics.go b/internal/serve/middleware/metrics.go
deleted file mode 100644
index 2643f52..0000000
--- a/internal/serve/middleware/metrics.go
+++ /dev/null
@@ -1,90 +0,0 @@
-package
middleware - -import ( - "context" - "strconv" - "time" - - "github.com/prometheus/client_golang/prometheus" -) - -type Metrics struct { - duration *prometheus.HistogramVec - count *prometheus.CounterVec - size *prometheus.HistogramVec - active *prometheus.GaugeVec -} - -// NewMetrics returns a new metrics recorder that implements the recorder -// using Prometheus as the backend. -func NewMetrics(namespace string) *Metrics { - metrics := &Metrics{ - duration: prometheus.NewHistogramVec( - prometheus.HistogramOpts{ - Namespace: namespace, - Subsystem: "http", - Name: "request_duration_seconds", - Help: "The latency of the HTTP requests.", - //nolint:mnd // these are the building blocks for buckets - Buckets: prometheus.ExponentialBuckets(0.0005, 2, 12), - }, - []string{"service", "handler", "method", "path", "status"}, - ), - count: prometheus.NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Subsystem: "http", - Name: "requests_total", - Help: "The count of HTTP requests.", - }, - []string{"service", "handler", "method", "path", "status"}, - ), - size: prometheus.NewHistogramVec( - prometheus.HistogramOpts{ - Namespace: namespace, - Subsystem: "http", - Name: "response_size_bytes", - Help: "The size of the HTTP responses.", - //nolint:mnd // these are the building blocks for buckets - Buckets: prometheus.ExponentialBuckets(100, 2, 15), - }, - []string{"service", "handler", "method", "path", "status"}, - ), - active: prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: namespace, - Subsystem: "http", - Name: "requests_inflight", - Help: "The number of inflight requests being handled at the same time.", - }, - []string{"service", "handler"}, - ), - } - - prometheus.MustRegister(metrics.duration, metrics.count, metrics.size, metrics.active) - - return metrics -} - -//nolint:lll // acknowledged -func (m Metrics) Record(ctx context.Context, service, handler, method string, status int, duration time.Duration, size int64) { - m.Count(ctx, service, handler, method, status) - m.Duration(ctx, service, handler, method, status, duration) - m.Size(ctx, service, handler, method, status, size) -} - -func (m Metrics) Count(_ context.Context, service, handler, method string, status int) { - m.count.WithLabelValues(service, handler, method, strconv.Itoa(status)).Inc() -} - -func (m Metrics) Duration(_ context.Context, service, handler, method string, status int, duration time.Duration) { - m.duration.WithLabelValues(service, handler, method, strconv.Itoa(status)).Observe(duration.Seconds()) -} - -func (m Metrics) Size(_ context.Context, service, handler, method string, status int, size int64) { - m.size.WithLabelValues(service, handler, method, strconv.Itoa(status)).Observe(float64(size)) -} - -func (m Metrics) Active(_ context.Context, service, handler string, quantity int) { - m.active.WithLabelValues(service, handler).Add(float64(quantity)) -} diff --git a/internal/serve/web/main.go b/internal/serve/web/main.go index fa48c81..1f8d1c8 100644 --- a/internal/serve/web/main.go +++ b/internal/serve/web/main.go @@ -19,8 +19,11 @@ type Service struct { attr slog.Attr router *gin.Engine server *http.Server + health func(bool) } +var ErrServiceNotConfigured = errors.New("service not configured") + func NewService() *Service { router := gin.New() @@ -73,19 +76,26 @@ func NewService() *Service { return service } -func (s *Service) Start(e chan error) { +func (s *Service) Start(e chan error, health func(bool)) { + s.health = health + if s.server == nil { + health(false) slog.Error( "Failed 
to start web service",
 			slog.Group("error", slog.String("message", "service not configured")),
 			s.attr,
 		)
+		e <- ErrServiceNotConfigured
+
+		// There is no server to run, so return here rather than fall
+		// through and call ListenAndServe on a nil *http.Server
+		return
 	}
 
 	slog.Info("Starting dashboard web service", s.attr)
 
+	health(true)
+
 	err := s.server.ListenAndServe()
 	if err != nil && !errors.Is(err, http.ErrServerClosed) {
+		health(false)
 		slog.Error(
 			"Failed to start web service",
 			slog.Group("error", slog.String("message", err.Error())),
@@ -103,6 +113,8 @@ func (s *Service) Shutdown(timeout time.Duration) error {
 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
 	defer cancel()
 
+	s.health(false)
+
 	if err := s.server.Shutdown(ctx); err != nil {
 		return err
 	}
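
A note on the label rename in the first patch, for anyone reviewing the
series end-to-end: the sketch below is not part of the patches, but shows how
the renamed middleware is wired up and what the resulting series look like.
The middleware.Prometheus(cluster, component) signature and the cluster.name
key are taken from the diffs above; the module path github.com/n3tuk/dashboard
and the port are assumptions for illustration only.

    package main

    import (
    	"github.com/gin-gonic/gin"
    	"github.com/spf13/viper"

    	"github.com/n3tuk/dashboard/internal/serve/middleware"
    )

    func main() {
    	// cluster.name defaults to "dashboard", and can be overridden with
    	// the new --cluster-name flag or the cluster.name configuration key
    	viper.SetDefault("cluster.name", "dashboard")

    	router := gin.New()

    	// Every series now carries cluster and component labels, e.g.:
    	//   http_request_total{cluster="dashboard",component="web",
    	//                      method="GET",path="/healthz",status="200"}
    	// With the old endpoint label, a scrape configuration which attaches
    	// its own endpoint and service target labels (as the Prometheus
    	// Operator does) would store it as exported_endpoint instead
    	router.Use(middleware.Prometheus(viper.GetString("cluster.name"), "web"))

    	_ = router.Run(":8080") // illustrative port only
    }

With the rename in place, a query such as
sum by (cluster, component) (rate(http_request_total[5m])) aggregates cleanly,
with no exported_ prefixes to work around.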
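
Likewise for the health rework in the second patch, here is a minimal sketch
of how the shared Health value moves between the services and the /healthz
handler. The healthz types and functions are from the diffs above; the flag
assignments stand in for the calls made inside web.Start() and
metrics.Start(), and the module path is again an assumption.

    package main

    import (
    	"fmt"

    	"github.com/gin-gonic/gin"

    	"github.com/n3tuk/dashboard/internal/serve/metrics/healthz"
    )

    func main() {
    	router := gin.New()

    	// A single Health value is shared between the services and the
    	// /healthz handler attached to the metrics router
    	health := healthz.NewHealth()
    	healthz.Attach(router, health)

    	health.Metrics = true // set by metrics.Start() before ListenAndServe
    	health.Web = true     // set by web.Start() via the SetWebHealth callback

    	// /healthz returns 200 while both flags are set; a false Web or
    	// Metrics flag yields 503 Service Unavailable, and once
    	// PrepareShutdown() marks the service as terminating it returns
    	// 410 Gone so load-balancers stop routing new requests
    	health.Terminating = true

    	fmt.Printf("web=%t metrics=%t terminating=%t\n",
    		health.Web, health.Metrics, health.Terminating)
    }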