diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index a8267c2..04276e4 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -30,6 +30,9 @@ jobs: ext: '' steps: - uses: actions/checkout@v3 + with: + # We need all tags + fetch-depth: 0 - name: Set up Go uses: actions/setup-go@v4 with: @@ -37,7 +40,7 @@ jobs: check-latest: true - name: Build run: | - GOOS=${{ matrix.goos }} GOARCH=${{ matrix.goarch }} go build cmd/autometrics/main.go + GOOS=${{ matrix.goos }} GOARCH=${{ matrix.goarch }} ./scripts/build_generator mv main${{ matrix.ext }} autometrics${{ matrix.ext }} - name: Pack (Zip) diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..7ab9faf --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "autometrics-shared"] + path = configs/shared + url = https://github.com/autometrics-dev/autometrics-shared.git diff --git a/README.md b/README.md index 6a39b64..8e8f010 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,16 @@ trigger alerts directly from production usage: ![a Slack bot is posting an alert directly in the channel](./assets/slack-alert-example.png) A fully working use-case and example of library usage is available in the -[examples/web](./examples/web) subdirectory +[examples/web](./examples/web) subdirectory. You can build and run load on the +example server using: + +```console +git submodule update --init +docker compose -f docker-compose.prometheus-example.yaml up +``` + +And then explore the generated links by opening the [main +file](./examples/web/cmd/main.go) in your editor. 
## How to use @@ -43,14 +52,25 @@ In the main entrypoint of your program, you need to both add package ``` go import ( - amImpl "github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus" + autometrics "github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus" ) ``` And then in your main function initialize the metrics ``` go -amImpl.Init(nil, am.DefBuckets) + // Everything in BuildInfo is optional. + // You can also use any string variable whose value is + // injected at build time by ldflags. + autometrics.Init( + nil, + autometrics.DefBuckets, + autometrics.BuildInfo{ + Version: "0.4.0", + Commit: "anySHA", + Branch: "", + }, + ) ``` > **Warning** @@ -59,11 +79,11 @@ have the `--latency-ms` values to match the values given in your buckets. The values in the buckets are given in _seconds_. By default, the generator will error and tell you the valid default values if they don't match. If the default values do not match your use case, you can change the buckets in -the init call, and add a `-custom-latency` argument to the `//go:generate` invocation. +the init call, and add a `--custom-latency` argument to the `//go:generate` invocation. 
```patch -//go:generate autometrics -+//go:generate autometrics -custom-latency ++//go:generate autometrics --custom-latency ``` ### Add cookies in your code @@ -128,13 +148,21 @@ For Prometheus the shortest way is to add the handler code in your main entrypoi ``` go import ( - amImpl "github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus" + autometrics "github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" ) func main() { - amImpl.Init(nil, am.DefBuckets) + autometrics.Init( + nil, + autometrics.DefBuckets, + autometrics.BuildInfo{ + Version: "0.4.0", + Commit: "anySHA", + Branch: "", + }, + ) http.Handle("/metrics", promhttp.Handler()) } ``` @@ -154,11 +182,11 @@ func RouteHandler(args interface{}) (err error) { } ``` -Then **you need to add** the [bundled](./configs/autometrics.rules.yml) +Then **you need to add** the [bundled](./configs/shared/autometrics.rules.yml) recording rules to your prometheus configuration. The valid arguments for alert generation are: -- `--slo` (*MANDATORY*): name of the service for which the objective is relevant +- `--slo` (*MANDATORY* for alert generation): name of the service for which the objective is relevant - `--success-rate` : target success rate of the function, between 0 and 100 (you must name the `error` return value of the function for detection to work.) - `--latency-ms` : maximum latency allowed for the function, in milliseconds. @@ -168,7 +196,7 @@ The valid arguments for alert generation are: > **Warning** > The generator will error out if you use targets that are not -supported by the bundled [Alerting rules file](./configs/autometrics.rules.yml). +supported by the bundled [Alerting rules file](./configs/shared/autometrics.rules.yml). 
Support for custom target is planned but not present at the moment ## (OPTIONAL) OpenTelemetry Support @@ -176,27 +204,35 @@ Support for custom target is planned but not present at the moment Autometrics supports using OpenTelemetry with a prometheus exporter instead of using Prometheus to publish the metrics. The changes you need to make are: -- change where the `amImpl` import points to +- change where the `autometrics` import points to ```patch import ( -- amImpl "github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus" -+ amImpl "github.com/autometrics-dev/autometrics-go/pkg/autometrics/otel" +- autometrics "github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus" ++ autometrics "github.com/autometrics-dev/autometrics-go/pkg/autometrics/otel" ) ``` -- change the call to `amImpl.Init` to the new signature: instead of a registry, +- change the call to `autometrics.Init` to the new signature: instead of a registry, the `Init` function takes a meter name for the `otel_scope` label of the exported metric. You can use the name of the application or its version for example ``` patch -- amImpl.Init(nil, am.DefBuckets) -+ amImpl.Init("myApp/v2/prod", am.DefBuckets) + autometrics.Init( +- nil, ++ "myApp/v2/prod", + autometrics.DefBuckets, + autometrics.BuildInfo{ + Version: "2.1.37", + Commit: "anySHA", + Branch: "", + }, + ) ``` -- add the `-otel` flag to the `//go:generate` directive +- add the `--otel` flag to the `//go:generate` directive ```patch -//go:generate autometrics -+//go:generate autometrics -otel ++//go:generate autometrics --otel ``` ## (OPTIONAL) Git hook @@ -228,4 +264,4 @@ The alerting system for SLOs that Autometrics uses is based on [Sloth](https://github.com/slok/sloth), and it has native Go types for marshalling/unmarshalling rules, so it should be possible to provide an extra binary in this repository, that only takes care of generating a new [rules -file](./configs/autometrics.rules.yml) with custom objectives. 
+file](./configs/shared/autometrics.rules.yml) with custom objectives. diff --git a/cmd/am-alertsgen/main.go b/cmd/am-alertsgen/main.go deleted file mode 100644 index e6da4cb..0000000 --- a/cmd/am-alertsgen/main.go +++ /dev/null @@ -1,81 +0,0 @@ -package main - -import ( - "log" - - _ "github.com/slok/sloth/pkg/prometheus/api/v1" -) - -func main() { - // TODO Replicate these rules from autometrics-rs/autometrics-cli/src/sloth.rs - // - // With default values - // #[clap(long, default_values = &["90", "95", "99", "99.9"])] - // objectives: Vec, - /* - fn generate_success_rate_slo(objective: &Decimal) -> String { - let objective_fraction = (objective / Decimal::from(100)).normalize(); - let objective_no_decimal = objective.to_string().replace(".", ""); - - format!(" - name: success-rate-{objective_no_decimal} - objective: {objective} - description: Common SLO based on function success rates - sli: - events: - error_query: sum by (slo_name, objective) (rate(function_calls_count{{objective=\"{objective_fraction}\",result=\"error\"}}[{{{{.window}}}}])) - total_query: sum by (slo_name, objective) (rate(function_calls_count{{objective=\"{objective_fraction}\"}}[{{{{.window}}}}])) - alerting: - name: High Error Rate SLO - {objective}% - labels: - category: success-rate - annotations: - summary: \"High error rate on SLO: {{{{$labels.slo_name}}}}\" - page_alert: - labels: - severity: page - ticket_alert: - labels: - severity: ticket -") -} - -fn generate_latency_slo(objective: &Decimal) -> String { - let objective_fraction = (objective / Decimal::from(100)).normalize(); - let objective_no_decimal = objective.to_string().replace(".", ""); - - format!(" - name: latency-{objective_no_decimal} - objective: {objective} - description: Common SLO based on function latency - sli: - events: - error_query: > - sum by (slo_name, objective) (rate(function_calls_duration_bucket{{objective=\"{objective_fraction}\"}}[{{{{.window}}}}])) - - - (sum by (slo_name, objective) ( - 
label_join(rate(function_calls_duration_bucket{{objective=\"{objective_fraction}\"}}[{{{{.window}}}}]), \"autometrics_check_label_equality\", \"\", \"target_latency\") - and - label_join(rate(function_calls_duration_bucket{{objective=\"{objective_fraction}\"}}[{{{{.window}}}}]), \"autometrics_check_label_equality\", \"\", \"le\") - )) - total_query: sum by (slo_name, objective) (rate(function_calls_duration_bucket{{objective=\"{objective_fraction}\"}}[{{{{.window}}}}])) - alerting: - name: High Latency SLO - {objective}% - labels: - category: latency - annotations: - summary: \"High latency on SLO: {{{{$labels.slo_name}}}}\" - page_alert: - labels: - severity: page - ticket_alert: - labels: - severity: ticket -") -} - */ - - - // TODO: Once the sloth rules have been made, we should be able to call - // the "binary" part of the sloth dep to generate the prom rules directly. - - log.Fatalf("unimplemented") -} diff --git a/cmd/autometrics/doc.go b/cmd/autometrics/doc.go index 7e6e552..196e477 100644 --- a/cmd/autometrics/doc.go +++ b/cmd/autometrics/doc.go @@ -1,22 +1,36 @@ -// Autometrics runs as Go generator and updates a source file to add usage queries and metric collection to annotated functions. -// -// As a Go generator, it relies on the environment variables `GOFILE` and -// `GOPACKAGE` to find the target file to edit. +// Autometrics instruments annotated functions, and adds links in their doc comments to graphs of their live usage. // // By default, `autometrics` generates metric collection code for usage with the // [Prometheus client library]. If you want to use [OpenTelemetry metrics] -// instead (with a prometheus exporter for the metrics), pass the `-otel` flag +// instead (with a prometheus exporter for the metrics), pass the `--otel` flag // to the invocation. 
// // By default, when activating Service Level Objectives (SLOs) `autometrics` // does not allow to use latency targets that are outside the default latencies // defined in [autometrics.DefBuckets]. If you want to use custom latencies for -// your latency SLOs, pass the `-custom-latency` flag to the invocation. +// your latency SLOs, pass the `--custom-latency` flag to the invocation. +// +// It is meant to be run by `go generate`. As such, its mandatory arguments (`-f` and `-m`) fall back to the `GOFILE` and `GOPACKAGE` environment variables when the flags are omitted. +// You can also set the base URL of the Prometheus instance used in the generated doc comment links with the `AM_PROMETHEUS_URL` environment variable. +// Note: If you do not pass `--custom-latency`, the latencies (in seconds) allowed in latency SLOs are the ones in [autometrics.DefBuckets]. +// +// Check https://github.com/autometrics-dev/autometrics-go for more help (including examples) and information. +// Autometrics is built by Fiberplane -- https://autometrics.dev +// +// Usage: autometrics -f FILE_NAME -m MODULE_NAME [--prom_url PROMETHEUS_URL] [--otel] [--custom-latency] // -// By default, the generated links in the documentation point to a Prometheus -// instance at http://localhost:9090. You can use the environment variable -// `AM_PROMETHEUS_URL` to change the base URL in the documentation links. +// Options: +// -f FILE_NAME File to transform. [env: GOFILE] +// -m MODULE_NAME Module containing the file to transform. [env: GOPACKAGE] +// --prom_url PROMETHEUS_URL +// Base URL of the Prometheus instance to generate links to. [default: http://localhost:9090, env: AM_PROMETHEUS_URL] +// --otel Use [OpenTelemetry client library] to instrument code instead of default [Prometheus client library]. [default: false] +// --custom-latency Allow non-default latencies to be used in latency-based SLOs. 
[default: false] +// --help, -h display this help and exit +// --version display version and exit // // [Prometheus client library]: https://github.com/prometheus/client_golang +// [OpenTelemetry client library]: https://github.com/open-telemetry/opentelemetry-go // [OpenTelemetry metrics]: https://opentelemetry.io/docs/instrumentation/go/ +// [autometrics.DefBuckets]: https://godoc.org/github.com/autometrics-dev/autometrics-go/pkg/autometrics#DefBuckets package main diff --git a/cmd/autometrics/main.go b/cmd/autometrics/main.go index 75c6af8..1fac70f 100644 --- a/cmd/autometrics/main.go +++ b/cmd/autometrics/main.go @@ -1,51 +1,74 @@ package main import ( + "fmt" "log" - "os" + "strings" internal "github.com/autometrics-dev/autometrics-go/internal/autometrics" + "github.com/autometrics-dev/autometrics-go/internal/build" "github.com/autometrics-dev/autometrics-go/internal/generate" "github.com/autometrics-dev/autometrics-go/pkg/autometrics" + + arg "github.com/alexflint/go-arg" ) const ( - prometheusAddressEnvironmentVariable = "AM_PROMETHEUS_URL" - useOtelFlag = "-otel" - allowCustomLatencies = "-custom-latency" - DefaultPrometheusInstanceUrl = "http://localhost:9090/" + DefaultPrometheusInstanceUrl = "http://localhost:9090/" ) -func main() { - fileName := os.Getenv("GOFILE") - moduleName := os.Getenv("GOPACKAGE") - args := os.Args +type args struct { + FileName string `arg:"-f,--,required,env:GOFILE" placeholder:"FILE_NAME" help:"File to transform."` + ModuleName string `arg:"-m,--,required,env:GOPACKAGE" placeholder:"MODULE_NAME" help:"Module containing the file to transform."` + PrometheusUrl string `arg:"--prom_url,env:AM_PROMETHEUS_URL" placeholder:"PROMETHEUS_URL" default:"http://localhost:9090" help:"Base URL of the Prometheus instance to generate links to."` + UseOtel bool `arg:"--otel" default:"false" help:"Use OpenTelemetry client library to instrument code instead of default Prometheus."` + AllowCustomLatencies bool `arg:"--custom-latency" 
default:"false" help:"Allow non-default latencies to be used in latency-based SLOs."` +} - prometheusUrl, envVarExists := os.LookupEnv(prometheusAddressEnvironmentVariable) - if !envVarExists { - prometheusUrl = DefaultPrometheusInstanceUrl - } +func (args) Version() string { + var buf strings.Builder + + fmt.Fprintf(&buf, "Autometrics %s", build.Version) + + return buf.String() +} + +func (args) Description() string { + var buf strings.Builder + + fmt.Fprintf(&buf, + "Autometrics instruments annotated functions, and adds links in their doc comments to graphs of their live usage.\n\n") + + fmt.Fprintf(&buf, + "It is meant to be used in a Go generator context. As such, it takes mandatory arguments in the form of environment variables.\n"+ + "You can also control the base URL of the prometheus instance in doc comments with an environment variable.\n") + fmt.Fprintf(&buf, + "\tNote: If you do not use the custom latencies in the SLO, the allowed latencies (in seconds) are %v\n\n", + autometrics.DefBuckets) + + fmt.Fprintln(&buf, + "Check https://github.com/autometrics-dev/autometrics-go for more help (including examples) and information.") + fmt.Fprintf(&buf, + "Autometrics is built by Fiberplane -- https://autometrics.dev\n") + + return buf.String() +} + +func main() { + var args args + arg.MustParse(&args) implementation := autometrics.PROMETHEUS - if contains(args, useOtelFlag) { + if args.UseOtel { implementation = autometrics.OTEL } - ctx, err := internal.NewGeneratorContext(implementation, prometheusUrl, contains(args, allowCustomLatencies)) + ctx, err := internal.NewGeneratorContext(implementation, args.PrometheusUrl, args.AllowCustomLatencies) if err != nil { log.Fatalf("error initialising autometrics context: %s", err) } - if err := generate.TransformFile(ctx, fileName, moduleName); err != nil { - log.Fatalf("error transforming %s: %s", fileName, err) - } -} - -func contains[T comparable](s []T, e T) bool { - for _, v := range s { - if v == e { - return true - 
} + if err := generate.TransformFile(ctx, args.FileName, args.ModuleName); err != nil { + log.Fatalf("error transforming %s: %s", args.FileName, err) } - return false } diff --git a/examples/otel/alertmanager.yml b/configs/alertmanager.yml similarity index 100% rename from examples/otel/alertmanager.yml rename to configs/alertmanager.yml diff --git a/configs/autometrics.rules.yml b/configs/autometrics.rules.yml deleted file mode 100644 index 57d3038..0000000 --- a/configs/autometrics.rules.yml +++ /dev/null @@ -1,1594 +0,0 @@ - ---- -# Code generated by Sloth (v0.11.0): https://github.com/slok/sloth. -# DO NOT EDIT. - -groups: -- name: sloth-slo-sli-recordings-autometrics-success-rate-90 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[5m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[5m])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[30m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[30m])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[1h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[1h])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 1h - - record: 
slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[2h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[2h])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[6h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[6h])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[1d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[1d])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[3d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[3d])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-90", 
sloth_service="autometrics", sloth_slo="success-rate-90"}[30d]) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-success-rate-90 - rules: - - record: slo:objective:ratio - expr: vector(0.9) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:error_budget:ratio - expr: vector(1-0.9) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", - sloth_slo="success-rate-90"} - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: sloth_slo_info 
- expr: vector(1) - labels: - sloth_id: autometrics-success-rate-90 - sloth_mode: cli-gen-prom - sloth_objective: "90" - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-success-rate-90 - rules: - - alert: High Error Rate SLO - 90% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (14.4 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (14.4 * 0.1)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (6 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (6 * 0.1)) without (sloth_window) - ) - labels: - category: success-rate - severity: page - sloth_severity: page - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
- - alert: High Error Rate SLO - 90% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (3 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (3 * 0.1)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (1 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (1 * 0.1)) without (sloth_window) - ) - labels: - category: success-rate - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-success-rate-95 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[5m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[5m])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[30m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[30m])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[1h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[1h])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[2h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[2h])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[6h]))) - / - (sum by (objective_name, objective_percentile) 
(rate(function_calls_count{objective_percentile="95"}[6h])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[1d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[1d])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[3d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[3d])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"}[30d]) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-success-rate-95 - rules: - - record: slo:objective:ratio - expr: vector(0.95) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:error_budget:ratio - expr: vector(1-0.95) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:time_period:days - expr: vector(30) - labels: - 
sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", - sloth_slo="success-rate-95"} - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-success-rate-95 - sloth_mode: cli-gen-prom - sloth_objective: "95" - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-success-rate-95 - rules: - - alert: High Error Rate SLO - 95% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (14.4 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > 
(14.4 * 0.05)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (6 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (6 * 0.05)) without (sloth_window) - ) - labels: - category: success-rate - severity: page - sloth_severity: page - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Error Rate SLO - 95% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (3 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (3 * 0.05)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (1 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (1 * 0.05)) without (sloth_window) - ) - labels: - category: success-rate - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-success-rate-99 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[5m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[5m])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[30m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[30m])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[1h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[1h])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[2h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[2h])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[6h]))) - / - (sum by (objective_name, objective_percentile) 
(rate(function_calls_count{objective_percentile="99"}[6h])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[1d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[1d])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[3d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[3d])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"}[30d]) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-success-rate-99 - rules: - - record: slo:objective:ratio - expr: vector(0.99) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:error_budget:ratio - expr: vector(1-0.99) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:time_period:days - expr: vector(30) - labels: - 
sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", - sloth_slo="success-rate-99"} - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-success-rate-99 - sloth_mode: cli-gen-prom - sloth_objective: "99" - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-success-rate-99 - rules: - - alert: High Error Rate SLO - 99% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (14.4 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > 
(14.4 * 0.01)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (6 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (6 * 0.01)) without (sloth_window) - ) - labels: - category: success-rate - severity: page - sloth_severity: page - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Error Rate SLO - 99% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (3 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (3 * 0.01)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (1 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (1 * 0.01)) without (sloth_window) - ) - labels: - category: success-rate - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-success-rate-99_9 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[5m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[5m])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[30m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[30m])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[1h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[1h])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[2h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[2h])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[6h]))) - / - (sum by (objective_name, 
objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[6h])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[1d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[1d])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[3d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[3d])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"}[30d]) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-success-rate-99_9 - rules: - - record: slo:objective:ratio - expr: vector(0.9990000000000001) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: slo:error_budget:ratio - expr: vector(1-0.9990000000000001) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - 
sloth_slo: success-rate-99_9 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-success-rate-99_9", - sloth_service="autometrics", sloth_slo="success-rate-99_9"} - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_mode: cli-gen-prom - sloth_objective: "99.9" - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-success-rate-99_9 - rules: - - alert: High Error Rate SLO - 99.9% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (14.4 * 0.0009999999999999432)) 
without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (6 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (6 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - category: success-rate - severity: page - sloth_severity: page - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Error Rate SLO - 99.9% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (3 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (3 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (1 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (1 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - category: success-rate - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-latency-90 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[5m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[5m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[5m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[5m])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[30m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[30m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[30m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[30m])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[1h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[1h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - 
label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[1h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[1h])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[2h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[2h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[2h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[2h])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[6h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[6h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[6h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[6h])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) 
(rate(function_calls_duration_count{objective_percentile="90"}[1d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[1d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[1d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[1d])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[3d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[3d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[3d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[3d])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"}[30d]) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-latency-90 - rules: - - record: slo:objective:ratio - expr: 
vector(0.9) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:error_budget:ratio - expr: vector(1-0.9) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-latency-90", sloth_service="autometrics", - sloth_slo="latency-90"} - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-latency-90 - sloth_mode: cli-gen-prom - sloth_objective: "90" - sloth_service: autometrics - sloth_slo: latency-90 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-latency-90 - rules: - - alert: High Latency SLO - 90% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-90", 
sloth_service="autometrics", sloth_slo="latency-90"} > (14.4 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (14.4 * 0.1)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (6 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (6 * 0.1)) without (sloth_window) - ) - labels: - category: latency - severity: page - sloth_severity: page - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Latency SLO - 90% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (3 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (3 * 0.1)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (1 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (1 * 0.1)) without (sloth_window) - ) - labels: - category: latency - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-latency-95 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[5m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[5m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[5m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[5m])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[30m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[30m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[30m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[30m])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[1h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[1h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - 
label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[1h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[1h])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[2h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[2h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[2h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[2h])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[6h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[6h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[6h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[6h])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) 
(rate(function_calls_duration_count{objective_percentile="95"}[1d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[1d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[1d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[1d])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[3d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[3d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[3d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[3d])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"}[30d]) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-latency-95 - rules: - - record: slo:objective:ratio - expr: 
vector(0.95) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:error_budget:ratio - expr: vector(1-0.95) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-latency-95", sloth_service="autometrics", - sloth_slo="latency-95"} - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-latency-95 - sloth_mode: cli-gen-prom - sloth_objective: "95" - sloth_service: autometrics - sloth_slo: latency-95 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-latency-95 - rules: - - alert: High Latency SLO - 95% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-95", 
sloth_service="autometrics", sloth_slo="latency-95"} > (14.4 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (14.4 * 0.05)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (6 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (6 * 0.05)) without (sloth_window) - ) - labels: - category: latency - severity: page - sloth_severity: page - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Latency SLO - 95% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (3 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (3 * 0.05)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (1 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (1 * 0.05)) without (sloth_window) - ) - labels: - category: latency - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-latency-99 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[5m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[5m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[5m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[5m])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[30m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[30m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[30m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[30m])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[1h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[1h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - 
label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[1h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[1h])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[2h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[2h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[2h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[2h])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[6h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[6h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[6h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[6h])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) 
(rate(function_calls_duration_count{objective_percentile="99"}[1d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[1d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[1d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[1d])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[3d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[3d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[3d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[3d])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"}[30d]) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-latency-99 - rules: - - record: slo:objective:ratio - expr: 
vector(0.99) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:error_budget:ratio - expr: vector(1-0.99) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-latency-99", sloth_service="autometrics", - sloth_slo="latency-99"} - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-latency-99 - sloth_mode: cli-gen-prom - sloth_objective: "99" - sloth_service: autometrics - sloth_slo: latency-99 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-latency-99 - rules: - - alert: High Latency SLO - 99% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99", 
sloth_service="autometrics", sloth_slo="latency-99"} > (14.4 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (14.4 * 0.01)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (6 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (6 * 0.01)) without (sloth_window) - ) - labels: - category: latency - severity: page - sloth_severity: page - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Latency SLO - 99% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (3 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (3 * 0.01)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (1 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (1 * 0.01)) without (sloth_window) - ) - labels: - category: latency - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-latency-99_9 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[5m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[5m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[5m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[5m])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[30m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[30m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[30m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[30m])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[1h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[1h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - 
label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[1h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[1h])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[2h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[2h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[2h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[2h])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[6h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[6h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[6h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[6h])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by 
(objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[1d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[1d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[1d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[1d])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[3d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[3d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[3d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[3d])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"}[30d]) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 30d -- name: 
sloth-slo-meta-recordings-autometrics-latency-99_9 - rules: - - record: slo:objective:ratio - expr: vector(0.9990000000000001) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:error_budget:ratio - expr: vector(1-0.9990000000000001) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", - sloth_slo="latency-99_9"} - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-latency-99_9 - sloth_mode: cli-gen-prom - sloth_objective: "99.9" - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: 
sloth-slo-alerts-autometrics-latency-99_9 - rules: - - alert: High Latency SLO - 99.9% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (6 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (6 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - category: latency - severity: page - sloth_severity: page - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
- - alert: High Latency SLO - 99.9% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (3 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (3 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (1 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (1 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - category: latency - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
diff --git a/configs/shared b/configs/shared new file mode 160000 index 0000000..09d5384 --- /dev/null +++ b/configs/shared @@ -0,0 +1 @@ +Subproject commit 09d538449cef6a9af35900b4d91213f4f681d566 diff --git a/examples/web/docker-compose.yaml b/docker-compose.open-telemetry-example.yaml similarity index 71% rename from examples/web/docker-compose.yaml rename to docker-compose.open-telemetry-example.yaml index 335e10d..5582f85 100644 --- a/examples/web/docker-compose.yaml +++ b/docker-compose.open-telemetry-example.yaml @@ -20,8 +20,8 @@ services: container_name: alertmanager restart: unless-stopped volumes: - - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml - - ./slack_url.txt:/etc/alertmanager/slack_url + - ./examples/otel/configs/alertmanager.yml:/etc/alertmanager/alertmanager.yml + - ./examples/otel/configs/slack_url.txt:/etc/alertmanager/slack_url command: - '--config.file=/etc/alertmanager/alertmanager.yml' expose: @@ -34,8 +34,8 @@ services: container_name: prometheus restart: unless-stopped volumes: - - ./prometheus.yaml:/etc/prometheus/prometheus.yaml - - ./autometrics.rules.yml:/etc/prometheus/autometrics.rules.yml + - ./examples/otel/configs/prometheus.yaml:/etc/prometheus/prometheus.yaml + - ./examples/otel/configs/autometrics.rules.yml:/etc/prometheus/autometrics.rules.yml command: - '--config.file=/etc/prometheus/prometheus.yaml' expose: @@ -48,7 +48,9 @@ services: - web-server web-server: - build: . + build: + context: . 
+ dockerfile: examples/otel/Dockerfile container_name: web-server restart: unless-stopped expose: diff --git a/examples/otel/docker-compose.yaml b/docker-compose.prometheus-example.yaml similarity index 61% rename from examples/otel/docker-compose.yaml rename to docker-compose.prometheus-example.yaml index 335e10d..aaa805d 100644 --- a/examples/otel/docker-compose.yaml +++ b/docker-compose.prometheus-example.yaml @@ -20,8 +20,8 @@ services: container_name: alertmanager restart: unless-stopped volumes: - - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml - - ./slack_url.txt:/etc/alertmanager/slack_url + - ./examples/web/configs/alertmanager.yml:/etc/alertmanager/alertmanager.yml + - ./examples/web/configs/slack_url.txt:/etc/alertmanager/slack_url command: - '--config.file=/etc/alertmanager/alertmanager.yml' expose: @@ -34,8 +34,8 @@ services: container_name: prometheus restart: unless-stopped volumes: - - ./prometheus.yaml:/etc/prometheus/prometheus.yaml - - ./autometrics.rules.yml:/etc/prometheus/autometrics.rules.yml + - ./examples/web/configs/prometheus.yaml:/etc/prometheus/prometheus.yaml + - ./examples/web/configs/autometrics.rules.yml:/etc/prometheus/autometrics.rules.yml command: - '--config.file=/etc/prometheus/prometheus.yaml' expose: @@ -48,10 +48,23 @@ services: - web-server web-server: - build: . + build: + context: . + dockerfile: examples/web/Dockerfile container_name: web-server restart: unless-stopped expose: - 62086 ports: - "62086:62086" + + load-server: + build: + context: . 
+ dockerfile: examples/web/load.Dockerfile + environment: + TARGET_HOST: web-server + container_name: load-server + restart: unless-stopped + depends_on: + - web-server diff --git a/examples/otel/Dockerfile b/examples/otel/Dockerfile index 782f7a9..1a87b34 100644 --- a/examples/otel/Dockerfile +++ b/examples/otel/Dockerfile @@ -1,16 +1,24 @@ FROM golang:1.20-alpine MAINTAINER Fiberplane +ARG version=development -# Cannot really build the demo image from -# the examples subfolder because of -# relative imports shenanigans that go out of build context (i.e. upwards) -# -# Use -# GOOS=linux GOARCH=amd64 go build -o web-server ./cmd/main.go -# -# To build the web-server app +RUN apk update && apk add git -COPY web-server / +WORKDIR /app + +COPY . ./ + +RUN go mod download + +WORKDIR /app/examples/web + +RUN go generate cmd/main.go + +ENV VERSION="$version" + +RUN scripts/build_server + +RUN cp web-server / EXPOSE 62086 diff --git a/examples/otel/README.md b/examples/otel/README.md index 3e29afd..1f0ed30 100644 --- a/examples/otel/README.md +++ b/examples/otel/README.md @@ -7,7 +7,12 @@ The only difference is that the metrics implementation used is OpenTelemetry with a Prometheus exporter instead of using a Prometheus only client crate. You can notice the 3 differences that are mentionned in the top-level README: -- The amImpl import has been changed to `otel` -- The autometrics call in the Go generator has the `-otel` flag -- The `amImpl.Init` call uses a different first argument, with the name of the +- The autometrics import has been changed to `otel` +- The autometrics call in the Go generator has the `--otel` flag +- The `autometrics.Init` call uses a different first argument, with the name of the OpenTelemetry scope to use + +## Quickstart + +You can build and run the example by using the +`docker-compose.open-telemetry-example.yaml` file at the root of the repo. 
diff --git a/examples/otel/autometrics.rules.yml b/examples/otel/autometrics.rules.yml deleted file mode 100644 index 57d3038..0000000 --- a/examples/otel/autometrics.rules.yml +++ /dev/null @@ -1,1594 +0,0 @@ - ---- -# Code generated by Sloth (v0.11.0): https://github.com/slok/sloth. -# DO NOT EDIT. - -groups: -- name: sloth-slo-sli-recordings-autometrics-success-rate-90 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[5m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[5m])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[30m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[30m])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[1h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[1h])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[2h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[2h])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - 
sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[6h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[6h])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[1d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[1d])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[3d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[3d])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"}[30d]) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-success-rate-90 - rules: - - record: slo:objective:ratio - expr: vector(0.9) - labels: - sloth_id: 
autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:error_budget:ratio - expr: vector(1-0.9) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", - sloth_slo="success-rate-90"} - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-success-rate-90 - sloth_mode: cli-gen-prom - sloth_objective: "90" - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-success-rate-90 - rules: - - alert: High Error Rate SLO - 90% - expr: 
| - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (14.4 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (14.4 * 0.1)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (6 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (6 * 0.1)) without (sloth_window) - ) - labels: - category: success-rate - severity: page - sloth_severity: page - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Error Rate SLO - 90% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (3 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (3 * 0.1)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (1 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (1 * 0.1)) without (sloth_window) - ) - labels: - category: success-rate - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-success-rate-95 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[5m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[5m])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[30m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[30m])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[1h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[1h])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[2h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[2h])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[6h]))) - / - (sum by (objective_name, objective_percentile) 
(rate(function_calls_count{objective_percentile="95"}[6h])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[1d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[1d])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[3d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[3d])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"}[30d]) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-success-rate-95 - rules: - - record: slo:objective:ratio - expr: vector(0.95) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:error_budget:ratio - expr: vector(1-0.95) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:time_period:days - expr: vector(30) - labels: - 
sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", - sloth_slo="success-rate-95"} - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-success-rate-95 - sloth_mode: cli-gen-prom - sloth_objective: "95" - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-success-rate-95 - rules: - - alert: High Error Rate SLO - 95% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (14.4 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > 
(14.4 * 0.05)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (6 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (6 * 0.05)) without (sloth_window) - ) - labels: - category: success-rate - severity: page - sloth_severity: page - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Error Rate SLO - 95% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (3 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (3 * 0.05)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (1 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (1 * 0.05)) without (sloth_window) - ) - labels: - category: success-rate - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-success-rate-99 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[5m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[5m])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[30m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[30m])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[1h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[1h])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[2h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[2h])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[6h]))) - / - (sum by (objective_name, objective_percentile) 
(rate(function_calls_count{objective_percentile="99"}[6h])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[1d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[1d])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[3d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[3d])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"}[30d]) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-success-rate-99 - rules: - - record: slo:objective:ratio - expr: vector(0.99) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:error_budget:ratio - expr: vector(1-0.99) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:time_period:days - expr: vector(30) - labels: - 
sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", - sloth_slo="success-rate-99"} - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-success-rate-99 - sloth_mode: cli-gen-prom - sloth_objective: "99" - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-success-rate-99 - rules: - - alert: High Error Rate SLO - 99% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (14.4 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > 
(14.4 * 0.01)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (6 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (6 * 0.01)) without (sloth_window) - ) - labels: - category: success-rate - severity: page - sloth_severity: page - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Error Rate SLO - 99% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (3 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (3 * 0.01)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (1 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (1 * 0.01)) without (sloth_window) - ) - labels: - category: success-rate - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-success-rate-99_9 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[5m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[5m])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[30m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[30m])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[1h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[1h])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[2h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[2h])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[6h]))) - / - (sum by (objective_name, 
objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[6h])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[1d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[1d])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[3d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[3d])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"}[30d]) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-success-rate-99_9 - rules: - - record: slo:objective:ratio - expr: vector(0.9990000000000001) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: slo:error_budget:ratio - expr: vector(1-0.9990000000000001) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - 
sloth_slo: success-rate-99_9 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-success-rate-99_9", - sloth_service="autometrics", sloth_slo="success-rate-99_9"} - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_mode: cli-gen-prom - sloth_objective: "99.9" - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-success-rate-99_9 - rules: - - alert: High Error Rate SLO - 99.9% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (14.4 * 0.0009999999999999432)) 
without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (6 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (6 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - category: success-rate - severity: page - sloth_severity: page - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Error Rate SLO - 99.9% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (3 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (3 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (1 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (1 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - category: success-rate - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-latency-90 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[5m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[5m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[5m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[5m])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[30m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[30m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[30m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[30m])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[1h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[1h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - 
label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[1h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[1h])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[2h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[2h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[2h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[2h])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[6h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[6h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[6h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[6h])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) 
(rate(function_calls_duration_count{objective_percentile="90"}[1d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[1d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[1d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[1d])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[3d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[3d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[3d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[3d])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"}[30d]) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-latency-90 - rules: - - record: slo:objective:ratio - expr: 
vector(0.9) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:error_budget:ratio - expr: vector(1-0.9) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-latency-90", sloth_service="autometrics", - sloth_slo="latency-90"} - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-latency-90 - sloth_mode: cli-gen-prom - sloth_objective: "90" - sloth_service: autometrics - sloth_slo: latency-90 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-latency-90 - rules: - - alert: High Latency SLO - 90% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-90", 
sloth_service="autometrics", sloth_slo="latency-90"} > (14.4 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (14.4 * 0.1)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (6 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (6 * 0.1)) without (sloth_window) - ) - labels: - category: latency - severity: page - sloth_severity: page - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Latency SLO - 90% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (3 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (3 * 0.1)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (1 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (1 * 0.1)) without (sloth_window) - ) - labels: - category: latency - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-latency-95 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[5m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[5m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[5m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[5m])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[30m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[30m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[30m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[30m])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[1h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[1h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - 
label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[1h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[1h])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[2h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[2h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[2h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[2h])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[6h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[6h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[6h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[6h])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) 
(rate(function_calls_duration_count{objective_percentile="95"}[1d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[1d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[1d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[1d])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[3d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[3d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[3d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[3d])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"}[30d]) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-latency-95 - rules: - - record: slo:objective:ratio - expr: 
vector(0.95) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:error_budget:ratio - expr: vector(1-0.95) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-latency-95", sloth_service="autometrics", - sloth_slo="latency-95"} - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-latency-95 - sloth_mode: cli-gen-prom - sloth_objective: "95" - sloth_service: autometrics - sloth_slo: latency-95 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-latency-95 - rules: - - alert: High Latency SLO - 95% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-95", 
sloth_service="autometrics", sloth_slo="latency-95"} > (14.4 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (14.4 * 0.05)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (6 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (6 * 0.05)) without (sloth_window) - ) - labels: - category: latency - severity: page - sloth_severity: page - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Latency SLO - 95% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (3 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (3 * 0.05)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (1 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (1 * 0.05)) without (sloth_window) - ) - labels: - category: latency - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-latency-99 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[5m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[5m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[5m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[5m])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[30m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[30m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[30m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[30m])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[1h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[1h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - 
label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[1h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[1h])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[2h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[2h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[2h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[2h])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[6h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[6h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[6h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[6h])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) 
(rate(function_calls_duration_count{objective_percentile="99"}[1d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[1d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[1d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[1d])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[3d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[3d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[3d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[3d])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"}[30d]) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-latency-99 - rules: - - record: slo:objective:ratio - expr: 
vector(0.99) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:error_budget:ratio - expr: vector(1-0.99) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-latency-99", sloth_service="autometrics", - sloth_slo="latency-99"} - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-latency-99 - sloth_mode: cli-gen-prom - sloth_objective: "99" - sloth_service: autometrics - sloth_slo: latency-99 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-latency-99 - rules: - - alert: High Latency SLO - 99% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99", 
sloth_service="autometrics", sloth_slo="latency-99"} > (14.4 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (14.4 * 0.01)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (6 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (6 * 0.01)) without (sloth_window) - ) - labels: - category: latency - severity: page - sloth_severity: page - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Latency SLO - 99% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (3 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (3 * 0.01)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (1 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (1 * 0.01)) without (sloth_window) - ) - labels: - category: latency - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-latency-99_9 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[5m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[5m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[5m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[5m])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[30m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[30m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[30m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[30m])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[1h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[1h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - 
label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[1h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[1h])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[2h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[2h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[2h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[2h])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[6h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[6h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[6h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[6h])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by 
(objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[1d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[1d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[1d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[1d])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[3d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[3d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[3d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[3d])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"}[30d]) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 30d -- name: 
sloth-slo-meta-recordings-autometrics-latency-99_9 - rules: - - record: slo:objective:ratio - expr: vector(0.9990000000000001) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:error_budget:ratio - expr: vector(1-0.9990000000000001) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", - sloth_slo="latency-99_9"} - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-latency-99_9 - sloth_mode: cli-gen-prom - sloth_objective: "99.9" - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: 
sloth-slo-alerts-autometrics-latency-99_9 - rules: - - alert: High Latency SLO - 99.9% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (6 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (6 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - category: latency - severity: page - sloth_severity: page - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
- - alert: High Latency SLO - 99.9% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (3 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (3 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (1 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (1 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - category: latency - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. diff --git a/examples/otel/cmd/main.go b/examples/otel/cmd/main.go index ae84fef..424a6a6 100644 --- a/examples/otel/cmd/main.go +++ b/examples/otel/cmd/main.go @@ -8,18 +8,35 @@ import ( "net/http" "time" - amImpl "github.com/autometrics-dev/autometrics-go/pkg/autometrics/otel" + autometrics "github.com/autometrics-dev/autometrics-go/pkg/autometrics/otel" "github.com/prometheus/client_golang/prometheus/promhttp" ) // This should be `//go:generate autometrics` in practice. Those are hacks to get the example working, see // README -//go:generate go run ../../../cmd/autometrics/main.go -otel +//go:generate go run ../../../cmd/autometrics/main.go --otel + +var ( + Version = "development" + Commit = "n/a" + Branch string +) func main() { rand.Seed(time.Now().UnixNano()) - amImpl.Init("web-server", amImpl.DefBuckets) + // Everything in BuildInfo is optional. 
+ // You can also use any string variable whose value is + // injected at build time by ldflags. + autometrics.Init( + "web-server", + autometrics.DefBuckets, + autometrics.BuildInfo{ + Version: Version, + Commit: Commit, + Branch: Branch, + }, + ) http.HandleFunc("/", errorable(indexHandler)) http.HandleFunc("/random-error", errorable(randomErrorHandler)) @@ -51,20 +68,20 @@ func main() { // // autometrics:doc-end Generated documentation by Autometrics. // -// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Latency (95th and 99th percentiles)]: http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60indexHandler%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29%29%29&g0.tab=0 -// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60indexHandler%60+function%0A%0Asum+by+%28function%2C+module%29+function_calls_concurrent%7Bfunction%3D%22indexHandler%22%7D&g0.tab=0 -// [Request Rate Callee]: 
http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Error Ratio Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0 +// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 +// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0 +// [Latency (95th and 99th percentiles)]: 
http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60indexHandler%60+function%0A%0Alabel_replace%28histogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29%2C+%22percentile_latency%22%2C+%2299%22%2C+%22%22%2C+%22%22%29+or+label_replace%28histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29%2C%22percentile_latency%22%2C+%2295%22%2C+%22%22%2C+%22%22%29&g0.tab=0 +// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60indexHandler%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22indexHandler%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 +// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 +// [Error Ratio Callee]: 
http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0 // //autometrics:doc --slo "API" --latency-target 99 --latency-ms 250 func indexHandler(w http.ResponseWriter, _ *http.Request) error { - defer amImpl.Instrument(amImpl.PreInstrument(amImpl.NewContext( - amImpl.WithConcurrentCalls(true), - amImpl.WithCallerName(true), - amImpl.WithSloName("API"), - amImpl.WithAlertLatency(250000000*time.Nanosecond, 99), + defer autometrics.Instrument(autometrics.PreInstrument(autometrics.NewContext( + autometrics.WithConcurrentCalls(true), + autometrics.WithCallerName(true), + autometrics.WithSloName("API"), + autometrics.WithAlertLatency(250000000*time.Nanosecond, 99), )), nil) //autometrics:defer time.Sleep(time.Duration(rand.Intn(500)) * time.Millisecond) @@ -97,20 +114,20 @@ var handlerError = errors.New("failed to handle request") // // autometrics:doc-end Generated documentation by Autometrics. 
// -// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Latency (95th and 99th percentiles)]: http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60randomErrorHandler%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29%29%29&g0.tab=0 -// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60randomErrorHandler%60+function%0A%0Asum+by+%28function%2C+module%29+function_calls_concurrent%7Bfunction%3D%22randomErrorHandler%22%7D&g0.tab=0 -// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Error Ratio Callee]: 
http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0 +// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 +// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0 +// [Latency (95th and 99th percentiles)]: 
http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60randomErrorHandler%60+function%0A%0Alabel_replace%28histogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29%2C+%22percentile_latency%22%2C+%2299%22%2C+%22%22%2C+%22%22%29+or+label_replace%28histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29%2C%22percentile_latency%22%2C+%2295%22%2C+%22%22%2C+%22%22%29&g0.tab=0 +// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60randomErrorHandler%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22randomErrorHandler%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 +// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 +// [Error Ratio Callee]: 
http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0 // //autometrics:doc --slo "API" --success-target 90 func randomErrorHandler(w http.ResponseWriter, _ *http.Request) (err error) { - defer amImpl.Instrument(amImpl.PreInstrument(amImpl.NewContext( - amImpl.WithConcurrentCalls(true), - amImpl.WithCallerName(true), - amImpl.WithSloName("API"), - amImpl.WithAlertSuccess(90), + defer autometrics.Instrument(autometrics.PreInstrument(autometrics.NewContext( + autometrics.WithConcurrentCalls(true), + autometrics.WithCallerName(true), + autometrics.WithSloName("API"), + autometrics.WithAlertSuccess(90), )), &err) //autometrics:defer isErr := rand.Intn(2) == 0 diff --git a/examples/otel/configs/alertmanager.yml b/examples/otel/configs/alertmanager.yml new file mode 120000 index 0000000..2b1bf0c --- /dev/null +++ b/examples/otel/configs/alertmanager.yml @@ -0,0 +1 @@ +../../../configs/alertmanager.yml \ No newline at end of file diff --git a/examples/otel/configs/autometrics.rules.yml b/examples/otel/configs/autometrics.rules.yml new file mode 120000 index 0000000..2048540 --- /dev/null +++ b/examples/otel/configs/autometrics.rules.yml @@ -0,0 +1 @@ +../../../configs/shared/autometrics.rules.yml \ No newline at end of file diff --git a/examples/otel/prometheus.yaml b/examples/otel/configs/prometheus.yaml similarity index 100% rename 
from examples/otel/prometheus.yaml rename to examples/otel/configs/prometheus.yaml diff --git a/examples/otel/load.Dockerfile b/examples/otel/load.Dockerfile new file mode 100644 index 0000000..b20451d --- /dev/null +++ b/examples/otel/load.Dockerfile @@ -0,0 +1,8 @@ +FROM golang:1.20-alpine +MAINTAINER Fiberplane + +RUN apk update && apk add curl + +COPY examples/web/scripts/poll_server / + +CMD [ "/poll_server" ] diff --git a/examples/otel/scripts/build_server b/examples/otel/scripts/build_server new file mode 100755 index 0000000..132fe64 --- /dev/null +++ b/examples/otel/scripts/build_server @@ -0,0 +1,11 @@ +#!/bin/sh + +set -euo pipefail + +VERSION=${VERSION:-development} +COMMIT=`git log -1 --format="%H"` +BRANCH=`git branch --show-current` +SCRIPT_DIR="$( dirname -- "$( readlink -f -- "$0"; )"; )" + + +go build -v -ldflags="-X 'main.Version=${VERSION}' -X 'main.Commit=${COMMIT}' -X 'main.Branch=${BRANCH}'" -o web-server ${SCRIPT_DIR}/../cmd/main.go diff --git a/examples/otel/poll_server b/examples/otel/scripts/poll_server similarity index 50% rename from examples/otel/poll_server rename to examples/otel/scripts/poll_server index 2632d28..e395463 100755 --- a/examples/otel/poll_server +++ b/examples/otel/scripts/poll_server @@ -2,13 +2,15 @@ set -euo pipefail +TARGET_HOST="${TARGET_HOST:-localhost}" + while true do if [ "$(($RANDOM % 2))" == "0" ]; then - curl http://localhost:62086/random-error + curl "http://${TARGET_HOST}:62086/random-error" fi if [ "$(($RANDOM % 4))" == "0" ]; then - curl http://localhost:62086/ + curl "http://${TARGET_HOST}:62086/" fi - sleep 1 + sleep 0.2 done diff --git a/examples/web/Dockerfile b/examples/web/Dockerfile index 782f7a9..1a87b34 100644 --- a/examples/web/Dockerfile +++ b/examples/web/Dockerfile @@ -1,16 +1,24 @@ FROM golang:1.20-alpine MAINTAINER Fiberplane +ARG version=development -# Cannot really build the demo image from -# the examples subfolder because of -# relative imports shenanigans that go out of build context 
(i.e. upwards) -# -# Use -# GOOS=linux GOARCH=amd64 go build -o web-server ./cmd/main.go -# -# To build the web-server app +RUN apk update && apk add git -COPY web-server / +WORKDIR /app + +COPY . ./ + +RUN go mod download + +WORKDIR /app/examples/web + +RUN go generate cmd/main.go + +ENV VERSION="$version" + +RUN scripts/build_server + +RUN cp web-server / EXPOSE 62086 diff --git a/examples/web/README.md b/examples/web/README.md index c52d733..9cdb9ca 100644 --- a/examples/web/README.md +++ b/examples/web/README.md @@ -8,9 +8,12 @@ It shows the generator usage and sets up Prometheus to showcase the ## Quick start ``` sh -GOOS=linux GOARCH=amd64 go build -o web-server ./cmd/main.go -docker compose up -d -./poll_server +# Go to the root of the repo +cd ../.. +# Build all the images +docker compose -f docker-compose.prometheus-example.yaml build +# Run all the services +docker compose -f docker-compose.prometheus-example.yaml up ``` Then open [main](./cmd/main.go) in your editor and interact with the documentation links! @@ -21,7 +24,8 @@ Then open [main](./cmd/main.go) in your editor and interact with the documentati Optionnally, create a slack integration with an "incoming webhook" for one of your channels, and put the URL of the webhook (a secret!) in `slack_url.txt` in -the directory. That will enable alerting in Slack directly through Alertmanager. +the [configs](./configs) directory. That will enable alerting in Slack directly +through Alertmanager. You can see that the name of the service "API" comes directly from the annotation in the code. @@ -36,7 +40,6 @@ You can even monitor all the alerts triggering through Prometheus or Alertmanage In order to run this example you need: -- Go (at least 1.18) - Docker - Docker Compose @@ -66,20 +69,24 @@ The generator is idempotent. ### Building the docker image -Build the web-server for the image architecture: +Build the web-server in an image. 
There are 2 important things in the +image [recipe](./Dockerfile): +- The context of the image is the root of the repository, only so that this + example runs the `development` version of the code, and +- There is a specific [build script](./scripts/build_server) that uses + Go linker flags to inject build and version information in the binary. ```sh -GOOS=linux GOARCH=amd64 go build -o web-server ./cmd/main.go -docker compose build +docker build --build-arg version=1.0.0 -f Dockerfile -t web-server ../.. ``` ### Start the services -In one terminal you can launch the stack and the small helper script to poll the the server: +In one terminal you can launch the image and the small helper script to poll the server: ```sh -docker compose up -d -./poll_server +cd ../.. +docker compose -f docker-compose.prometheus-example.yaml up ``` ### Check the links on Prometheus @@ -104,9 +111,9 @@ configuration to the correct notification service: ![Alertmanager alerts dashboard showing the alerts firing](../../assets/alertmanager-alert-example.png) -This demo example has a [minimal configuration](./alertmanager.yml) for alerts +This demo example has a [minimal configuration](./configs/alertmanager.yml) for alerts that expects a file `slack_url.txt` to be passed in docker-compose context. 
-Create the file in the same folder as this README, and if the file exists, the +Create the file in the [configs folder](./configs), and if the file exists, the triggered alerts automatically go on Slack to the configured channel: ![a Slack bot is posting an alert directly in the channel](../../assets/slack-alert-example.png) diff --git a/examples/web/alertmanager.yml b/examples/web/alertmanager.yml deleted file mode 100644 index 92d01ab..0000000 --- a/examples/web/alertmanager.yml +++ /dev/null @@ -1,17 +0,0 @@ -global: - # Also possible to use the URL directly - # Ex: `slack_api_url: 'https://slack.com/...'` - slack_api_url_file: '/etc/alertmanager/slack_url' - -route: - receiver: 'slack-notifications' - group_by: [sloth_service, sloth_slo, objective_name] - -receivers: -- name: 'slack-notifications' - slack_configs: - # Channel is ignored when using a webhook. The webhook URL encodes the - # channel the alerts will be posted to. - - channel: '#alerts' - title: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}" - text: "{{ range .Alerts }}{{ .Annotations.description }}\n{{ end }}" diff --git a/examples/web/autometrics.rules.yml b/examples/web/autometrics.rules.yml deleted file mode 120000 index 9f80a00..0000000 --- a/examples/web/autometrics.rules.yml +++ /dev/null @@ -1 +0,0 @@ -../../configs/autometrics.rules.yml \ No newline at end of file diff --git a/examples/web/cmd/main.go b/examples/web/cmd/main.go index f0b1b4d..cab7efd 100644 --- a/examples/web/cmd/main.go +++ b/examples/web/cmd/main.go @@ -8,7 +8,7 @@ import ( "net/http" "time" - amImpl "github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus" + autometrics "github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" ) @@ -16,10 +16,27 @@ import ( // README //go:generate go run ../../../cmd/autometrics/main.go +var ( + Version = "development" + Commit = "n/a" + Branch string +) + func main() { 
rand.Seed(time.Now().UnixNano()) - amImpl.Init(nil, amImpl.DefBuckets) + // Everything in BuildInfo is optional. + // You can also use any string variable whose value is + // injected at build time by ldflags. + autometrics.Init( + nil, + autometrics.DefBuckets, + autometrics.BuildInfo{ + Version: Version, + Commit: Commit, + Branch: Branch, + }, + ) http.HandleFunc("/", errorable(indexHandler)) http.HandleFunc("/random-error", errorable(randomErrorHandler)) @@ -51,20 +68,20 @@ func main() { // // autometrics:doc-end Generated documentation by Autometrics. // -// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Latency (95th and 99th percentiles)]: http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60indexHandler%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29%29%29&g0.tab=0 -// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60indexHandler%60+function%0A%0Asum+by+%28function%2C+module%29+function_calls_concurrent%7Bfunction%3D%22indexHandler%22%7D&g0.tab=0 -// [Request Rate Callee]: 
http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Error Ratio Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0 +// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 +// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0 +// [Latency (95th and 99th percentiles)]: 
http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60indexHandler%60+function%0A%0Alabel_replace%28histogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29%2C+%22percentile_latency%22%2C+%2299%22%2C+%22%22%2C+%22%22%29+or+label_replace%28histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29%2C%22percentile_latency%22%2C+%2295%22%2C+%22%22%2C+%22%22%29&g0.tab=0 +// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60indexHandler%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22indexHandler%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 +// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 +// [Error Ratio Callee]: 
http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0 // //autometrics:doc --slo "API" --latency-target 99 --latency-ms 250 func indexHandler(w http.ResponseWriter, _ *http.Request) error { - defer amImpl.Instrument(amImpl.PreInstrument(amImpl.NewContext( - amImpl.WithConcurrentCalls(true), - amImpl.WithCallerName(true), - amImpl.WithSloName("API"), - amImpl.WithAlertLatency(250000000*time.Nanosecond, 99), + defer autometrics.Instrument(autometrics.PreInstrument(autometrics.NewContext( + autometrics.WithConcurrentCalls(true), + autometrics.WithCallerName(true), + autometrics.WithSloName("API"), + autometrics.WithAlertLatency(250000000*time.Nanosecond, 99), )), nil) //autometrics:defer time.Sleep(time.Duration(rand.Intn(500)) * time.Millisecond) @@ -97,20 +114,20 @@ var handlerError = errors.New("failed to handle request") // // autometrics:doc-end Generated documentation by Autometrics. 
// -// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Latency (95th and 99th percentiles)]: http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60randomErrorHandler%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29%29%29&g0.tab=0 -// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60randomErrorHandler%60+function%0A%0Asum+by+%28function%2C+module%29+function_calls_concurrent%7Bfunction%3D%22randomErrorHandler%22%7D&g0.tab=0 -// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Error Ratio Callee]: 
http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0 +// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 +// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0 +// [Latency (95th and 99th percentiles)]: 
http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60randomErrorHandler%60+function%0A%0Alabel_replace%28histogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29%2C+%22percentile_latency%22%2C+%2299%22%2C+%22%22%2C+%22%22%29+or+label_replace%28histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29%2C%22percentile_latency%22%2C+%2295%22%2C+%22%22%2C+%22%22%29&g0.tab=0 +// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60randomErrorHandler%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22randomErrorHandler%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 +// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 +// [Error Ratio Callee]: 
http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0 // //autometrics:doc --slo "API" --success-target 90 func randomErrorHandler(w http.ResponseWriter, _ *http.Request) (err error) { - defer amImpl.Instrument(amImpl.PreInstrument(amImpl.NewContext( - amImpl.WithConcurrentCalls(true), - amImpl.WithCallerName(true), - amImpl.WithSloName("API"), - amImpl.WithAlertSuccess(90), + defer autometrics.Instrument(autometrics.PreInstrument(autometrics.NewContext( + autometrics.WithConcurrentCalls(true), + autometrics.WithCallerName(true), + autometrics.WithSloName("API"), + autometrics.WithAlertSuccess(90), )), &err) //autometrics:defer isErr := rand.Intn(2) == 0 diff --git a/examples/web/cmd/main.go.orig b/examples/web/cmd/main.go.orig index 99e9f76..536c255 100644 --- a/examples/web/cmd/main.go.orig +++ b/examples/web/cmd/main.go.orig @@ -8,7 +8,7 @@ import ( "net/http" "time" - amImpl "github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus" + autometrics "github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" ) @@ -16,10 +16,27 @@ import ( // README //go:generate go run ../../../cmd/autometrics/main.go +var ( + Version = "development" + Commit = "n/a" + Branch string +) + func main() { rand.Seed(time.Now().UnixNano()) - amImpl.Init(nil, 
amImpl.DefBuckets) + // Everything in BuildInfo is optional. + // You can also use any string variable whose value is + // injected at build time by ldflags. + autometrics.Init( + nil, + autometrics.DefBuckets, + autometrics.BuildInfo{ + Version: Version, + Commit: Commit, + Branch: Branch, + }, + ) http.HandleFunc("/", errorable(indexHandler)) http.HandleFunc("/random-error", errorable(randomErrorHandler)) diff --git a/examples/web/configs/alertmanager.yml b/examples/web/configs/alertmanager.yml new file mode 120000 index 0000000..2b1bf0c --- /dev/null +++ b/examples/web/configs/alertmanager.yml @@ -0,0 +1 @@ +../../../configs/alertmanager.yml \ No newline at end of file diff --git a/examples/web/configs/autometrics.rules.yml b/examples/web/configs/autometrics.rules.yml new file mode 120000 index 0000000..2048540 --- /dev/null +++ b/examples/web/configs/autometrics.rules.yml @@ -0,0 +1 @@ +../../../configs/shared/autometrics.rules.yml \ No newline at end of file diff --git a/examples/web/prometheus.yaml b/examples/web/configs/prometheus.yaml similarity index 100% rename from examples/web/prometheus.yaml rename to examples/web/configs/prometheus.yaml diff --git a/examples/web/load.Dockerfile b/examples/web/load.Dockerfile new file mode 100644 index 0000000..b20451d --- /dev/null +++ b/examples/web/load.Dockerfile @@ -0,0 +1,8 @@ +FROM golang:1.20-alpine +MAINTAINER Fiberplane + +RUN apk update && apk add curl + +COPY examples/web/scripts/poll_server / + +CMD [ "/poll_server" ] diff --git a/examples/web/scripts/build_server b/examples/web/scripts/build_server new file mode 100755 index 0000000..132fe64 --- /dev/null +++ b/examples/web/scripts/build_server @@ -0,0 +1,11 @@ +#!/bin/sh + +set -euo pipefail + +VERSION=${VERSION:-development} +COMMIT=`git log -1 --format="%H"` +BRANCH=`git branch --show-current` +SCRIPT_DIR="$( dirname -- "$( readlink -f -- "$0"; )"; )" + + +go build -v -ldflags="-X 'main.Version=${VERSION}' -X 'main.Commit=${COMMIT}' -X 
'main.Branch=${BRANCH}'" -o web-server ${SCRIPT_DIR}/../cmd/main.go diff --git a/examples/web/poll_server b/examples/web/scripts/poll_server similarity index 50% rename from examples/web/poll_server rename to examples/web/scripts/poll_server index 2632d28..e395463 100755 --- a/examples/web/poll_server +++ b/examples/web/scripts/poll_server @@ -2,13 +2,15 @@ set -euo pipefail +TARGET_HOST="${TARGET_HOST:-localhost}" + while true do if [ "$(($RANDOM % 2))" == "0" ]; then - curl http://localhost:62086/random-error + curl "http://${TARGET_HOST}:62086/random-error" fi if [ "$(($RANDOM % 4))" == "0" ]; then - curl http://localhost:62086/ + curl "http://${TARGET_HOST}:62086/" fi - sleep 1 + sleep 0.2 done diff --git a/go.mod b/go.mod index 17ea8f2..1469e67 100644 --- a/go.mod +++ b/go.mod @@ -5,8 +5,8 @@ go 1.18 require github.com/prometheus/client_golang v1.14.0 require ( + github.com/alexflint/go-arg v1.4.3 github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 - github.com/slok/sloth v0.11.0 go.opentelemetry.io/otel v1.14.0 go.opentelemetry.io/otel/exporters/prometheus v0.37.0 go.opentelemetry.io/otel/metric v0.37.0 @@ -16,6 +16,7 @@ require ( ) require ( + github.com/alexflint/go-scalar v1.1.0 // indirect github.com/go-logr/logr v1.2.3 // indirect github.com/go-logr/stdr v1.2.2 // indirect go.opentelemetry.io/otel/trace v1.14.0 // indirect diff --git a/go.sum b/go.sum index 8b1ea40..6b9c1da 100644 --- a/go.sum +++ b/go.sum @@ -38,6 +38,10 @@ github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuy github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk541a8SKzHPHnH3zbiI+7dagKZ0cgpgrD7Fyho= +github.com/alexflint/go-arg v1.4.3 h1:9rwwEBpMXfKQKceuZfYcwuc/7YY7tWJbFsgG5cAU/uo= 
+github.com/alexflint/go-arg v1.4.3/go.mod h1:3PZ/wp/8HuqRZMUUgu7I+e1qcpUbvmS258mRXkFH4IA= +github.com/alexflint/go-scalar v1.1.0 h1:aaAouLLzI9TChcPXotr6gUhq+Scr8rl0P9P4PnltbhM= +github.com/alexflint/go-scalar v1.1.0/go.mod h1:LoFvNMqS1CPrMVltza4LvnGKhaSpc3oyLEBUZVhhS2o= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= @@ -202,8 +206,6 @@ github.com/sergi/go-diff v1.2.0 h1:XU+rvMAioB0UC3q1MFrIQy4Vo5/4VsRDQQXHsEya6xQ= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88= -github.com/slok/sloth v0.11.0 h1:0N3975hhO8izJoHIiPMBKZWxk6lxamuTd45MxYsOk04= -github.com/slok/sloth v0.11.0/go.mod h1:xE9zMDVvMb5ylMhkacDtC02vmRhZHNuqe5ez93OiDms= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= @@ -211,6 +213,7 @@ github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpE github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= 
github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8= diff --git a/internal/autometrics/doc.go b/internal/autometrics/prometheus_link_gen.go similarity index 75% rename from internal/autometrics/doc.go rename to internal/autometrics/prometheus_link_gen.go index 5a41ed1..d6f99e8 100644 --- a/internal/autometrics/doc.go +++ b/internal/autometrics/prometheus_link_gen.go @@ -46,22 +46,73 @@ func (p Prometheus) makePrometheusUrl(query, comment string) url.URL { return ret } +func addBuildInfoLabels() string { + return fmt.Sprintf("* on (instance, job) group_left(%s, %s) last_over_time(%s[1s])", + prometheus.VersionLabel, + prometheus.CommitLabel, + prometheus.BuildInfoName, + ) +} + func requestRateQuery(counterName, labelKey, labelValue string) string { - return fmt.Sprintf("sum by (%s, %s) (rate(%s{%s=\"%s\"}[5m]))", prometheus.FunctionLabel, prometheus.ModuleLabel, counterName, labelKey, labelValue) + return fmt.Sprintf("sum by (%s, %s, %s, %s) (rate(%s{%s=\"%s\"}[5m]) %s)", + prometheus.FunctionLabel, + prometheus.ModuleLabel, + prometheus.VersionLabel, + prometheus.CommitLabel, + counterName, + labelKey, + labelValue, + addBuildInfoLabels(), + ) } func errorRatioQuery(counterName, labelKey, labelValue string) string { - return fmt.Sprintf("sum by (%s, %s) (rate(%s{%s=\"%s\",%s=\"error\"}[5m]))", prometheus.FunctionLabel, prometheus.ModuleLabel, counterName, labelKey, labelValue, prometheus.ResultLabel) + return fmt.Sprintf("(sum by (%s, %s, %s, %s) (rate(%s{%s=\"%s\",%s=\"error\"}[5m]) %s)) / (%s)", + prometheus.FunctionLabel, + prometheus.ModuleLabel, + prometheus.VersionLabel, + prometheus.CommitLabel, + counterName, + labelKey, + labelValue, + prometheus.ResultLabel, + addBuildInfoLabels(), + requestRateQuery(counterName, labelKey, labelValue), + ) } func latencyQuery(bucketName, labelKey, labelValue string) string { - latency := fmt.Sprintf("sum by (le, %s, %s) (rate(%s_bucket{%s=\"%s\"}[5m]))", prometheus.FunctionLabel, 
prometheus.ModuleLabel, bucketName, labelKey, labelValue) + latency := fmt.Sprintf("sum by (le, %s, %s, %s, %s) (rate(%s_bucket{%s=\"%s\"}[5m]) %s)", + prometheus.FunctionLabel, + prometheus.ModuleLabel, + prometheus.VersionLabel, + prometheus.CommitLabel, + bucketName, + labelKey, + labelValue, + addBuildInfoLabels(), + ) - return fmt.Sprintf("histogram_quantile(0.99, %s) or histogram_quantile(0.95, %s)", latency, latency) + return fmt.Sprintf( + "label_replace(histogram_quantile(0.99, %s), \"percentile_latency\", \"99\", \"\", \"\") or "+ + "label_replace(histogram_quantile(0.95, %s),\"percentile_latency\", \"95\", \"\", \"\")", + latency, + latency, + ) } func concurrentCallsQuery(gaugeName, labelKey, labelValue string) string { - return fmt.Sprintf("sum by (%s, %s) %s{%s=\"%s\"}", prometheus.FunctionLabel, prometheus.ModuleLabel, gaugeName, labelKey, labelValue) + return fmt.Sprintf("sum by (%s, %s, %s, %s) (%s{%s=\"%s\"} %s)", + prometheus.FunctionLabel, + prometheus.ModuleLabel, + prometheus.VersionLabel, + prometheus.CommitLabel, + gaugeName, + labelKey, + labelValue, + addBuildInfoLabels(), + ) } func (p Prometheus) GenerateAutometricsComment(ctx GeneratorContext, funcName, moduleName string) []string { diff --git a/internal/build/build.go b/internal/build/build.go new file mode 100644 index 0000000..3600686 --- /dev/null +++ b/internal/build/build.go @@ -0,0 +1,4 @@ +package build // import "github.com/autometrics-dev/autometrics-go/internal/build" + +// Version is the version string of the build, when made available through ldflags. 
+var Version = "development" diff --git a/internal/generate/generate_test.go b/internal/generate/generate_test.go index 927950d..5936c29 100644 --- a/internal/generate/generate_test.go +++ b/internal/generate/generate_test.go @@ -62,12 +62,12 @@ func main() { "//\n" + "//\tautometrics:doc-end Generated documentation by Autometrics.\n" + "//\n" + - "// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29%29&g0.tab=0\n" + - "// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0\n" + - "// [Latency (95th and 99th percentiles)]: http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60main%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29%29%29&g0.tab=0\n" + - "// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60main%60+function%0A%0Asum+by+%28function%2C+module%29+function_calls_concurrent%7Bfunction%3D%22main%22%7D&g0.tab=0\n" + - "// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29%29&g0.tab=0\n" + - "// [Error Ratio Callee]: 
http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0\n" + + "// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0\n" + + "// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0\n" + + "// [Latency (95th and 99th percentiles)]: 
http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60main%60+function%0A%0Alabel_replace%28histogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29%2C+%22percentile_latency%22%2C+%2299%22%2C+%22%22%2C+%22%22%29+or+label_replace%28histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29%2C%22percentile_latency%22%2C+%2295%22%2C+%22%22%2C+%22%22%29&g0.tab=0\n" + + "// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60main%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22main%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0\n" + + "// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0\n" + + "// [Error Ratio Callee]: 
http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0\n" + "//\n" + "//autometrics:doc --slo \"Service Test\" --success-target 99\n" + "func main() {\n" + @@ -148,12 +148,12 @@ func main() { "//\n" + "//\tautometrics:doc-end Generated documentation by Autometrics.\n" + "//\n" + - "// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29%29&g0.tab=0\n" + - "// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0\n" + - "// [Latency (95th and 99th percentiles)]: http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60main%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29%29%29&g0.tab=0\n" + - "// [Concurrent Calls]: 
http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60main%60+function%0A%0Asum+by+%28function%2C+module%29+function_calls_concurrent%7Bfunction%3D%22main%22%7D&g0.tab=0\n" + - "// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29%29&g0.tab=0\n" + - "// [Error Ratio Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0\n" + + "// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0\n" + + "// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0\n" + + "// [Latency (95th and 99th percentiles)]: 
http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60main%60+function%0A%0Alabel_replace%28histogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29%2C+%22percentile_latency%22%2C+%2299%22%2C+%22%22%2C+%22%22%29+or+label_replace%28histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29%2C%22percentile_latency%22%2C+%2295%22%2C+%22%22%2C+%22%22%29&g0.tab=0\n" + + "// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60main%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22main%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0\n" + + "// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0\n" + + "// [Error Ratio Callee]: 
http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0\n" + "//\n" + "//autometrics:doc --slo \"API\" --latency-target 99.9 --latency-ms 500\n" + "func main() {\n" + @@ -626,8 +626,8 @@ func implementContextCodeGenTest(t *testing.T, contextToSerialize autometrics.Co sourceContext := internal.GeneratorContext{ RuntimeCtx: contextToSerialize, FuncCtx: internal.GeneratorFunctionContext{ - CommentIndex: -1, - ImplImportName: "autometrics", + CommentIndex: -1, + ImplImportName: "autometrics", }, } diff --git a/pkg/autometrics/global_state.go b/pkg/autometrics/global_state.go new file mode 100644 index 0000000..1e81ebd --- /dev/null +++ b/pkg/autometrics/global_state.go @@ -0,0 +1,38 @@ +package autometrics // import "github.com/autometrics-dev/autometrics-go/pkg/autometrics" + +// These variables are describing the state of the application being autometricized, +// _not_ the build information of the binary + +var version string +var commit string +var branch string + +// GetVersion returns the version of the codebase being instrumented. +func GetVersion() string { + return version +} + +// SetVersion sets the version of the codebase being instrumented. +func SetVersion(newVersion string) { + version = newVersion +} + +// GetCommit returns the commit of the codebase being instrumented. 
+func GetCommit() string { + return commit +} + +// SetCommit sets the commit of the codebase being instrumented. +func SetCommit(newCommit string) { + commit = newCommit +} + +// GetBranch returns the branch of the build of the codebase being instrumented. +func GetBranch() string { + return branch +} + +// SetBranch sets the branch of the build of the codebase being instrumented. +func SetBranch(newBranch string) { + branch = newBranch +} diff --git a/pkg/autometrics/main.go b/pkg/autometrics/main.go index 17bc033..0a3fd40 100644 --- a/pkg/autometrics/main.go +++ b/pkg/autometrics/main.go @@ -1,4 +1,4 @@ -package autometrics +package autometrics // import "github.com/autometrics-dev/autometrics-go/pkg/autometrics" import ( "context" @@ -38,19 +38,40 @@ type Context struct { TrackCallerName bool // AlertConf is an optional configuration to add alerting capabilities to the metrics. AlertConf *AlertConfiguration - // startTime is the start time of a single function execution. - // Only autometrics.Instrument should read this value. - // Only autometrics.PreInstrument should write this value. + // StartTime is the start time of a single function execution. + // Only amImpl.Instrument should read this value. + // Only amImpl.PreInstrument should write this value. // - // This value is only exported for the child packages "prometheus" and "otel" + // (amImpl is either the [Prometheus] or the [Open Telemetry] implementation) + // + // This value is only exported for the child packages [Prometheus] and [Open Telemetry] + // + // [Prometheus]: https://godoc.org/github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus + // [Open Telemetry]: https://godoc.org/github.com/autometrics-dev/autometrics-go/pkg/autometrics/otel StartTime time.Time - // callInfo contains all the relevant data for caller information. - // Only autometrics.Instrument should read this value. - // Only autometrics.PreInstrument should write/read this value. 
+ // CallInfo contains all the relevant data for caller information. + // Only amImpl.Instrument should read this value. + // Only amImpl.PreInstrument should write/read this value. + // + // (amImpl is either the [Prometheus] or the [Open Telemetry] implementation) // - // This value is only exported for the child packages "prometheus" and "otel" + // This value is only exported for the child packages [Prometheus] and [Open Telemetry] + // + // [Prometheus]: https://godoc.org/github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus + // [Open Telemetry]: https://godoc.org/github.com/autometrics-dev/autometrics-go/pkg/autometrics/otel CallInfo CallInfo - Context context.Context + // BuildInfo contains all the relevant data for build information. + // Only amImpl.Instrument and PreInstrument should read this value. + // Only amImpl.Init should write/read this value. + // + // (amImpl is either the [Prometheus] or the [Open Telemetry] implementation) + // + // This value is only exported for the child packages [Prometheus] and [Open Telemetry] + // + // [Prometheus]: https://godoc.org/github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus + // [Open Telemetry]: https://godoc.org/github.com/autometrics-dev/autometrics-go/pkg/autometrics/otel BuildInfo BuildInfo + Context context.Context } // CallInfo holds the information about the current function call and its parent names. @@ -65,6 +86,16 @@ type CallInfo struct { ParentModuleName string } +// BuildInfo holds the information about the current build of the instrumented code. +type BuildInfo struct { + // Commit is the commit of the code. + Commit string + // Version is the version of the code. + Version string + // Branch is the branch of the build of the codebase. 
+ Branch string +} + func NewContext() Context { return Context{ TrackConcurrentCalls: true, @@ -74,6 +105,12 @@ func NewContext() Context { } } +func (c *Context) FillBuildInfo() { + c.BuildInfo.Version = GetVersion() + c.BuildInfo.Commit = GetCommit() + c.BuildInfo.Branch = GetBranch() +} + func (c Context) Validate(allowCustomLatencies bool) error { if c.AlertConf != nil { if c.AlertConf.ServiceName == "" { diff --git a/pkg/autometrics/otel/instrument.go b/pkg/autometrics/otel/instrument.go index def0639..9217422 100644 --- a/pkg/autometrics/otel/instrument.go +++ b/pkg/autometrics/otel/instrument.go @@ -15,7 +15,7 @@ import ( // // The first argument SHOULD be a call to PreInstrument so that // the "concurrent calls" gauge is correctly setup. -func Instrument(ctx *autometrics.Context, err *error) { +func Instrument(ctx *autometrics.Context, err *error) { result := "ok" if err != nil && *err != nil { @@ -49,6 +49,9 @@ func Instrument(ctx *autometrics.Context, err *error) { attribute.Key(ResultLabel).String(result), attribute.Key(TargetSuccessRateLabel).String(successObjective), attribute.Key(SloNameLabel).String(sloName), + attribute.Key(CommitLabel).String(ctx.BuildInfo.Commit), + attribute.Key(VersionLabel).String(ctx.BuildInfo.Version), + attribute.Key(BranchLabel).String(ctx.BuildInfo.Branch), }...) functionCallsDuration.Record(ctx.Context, time.Since(ctx.StartTime).Seconds(), []attribute.KeyValue{ @@ -58,6 +61,9 @@ func Instrument(ctx *autometrics.Context, err *error) { attribute.Key(TargetLatencyLabel).String(latencyTarget), attribute.Key(TargetSuccessRateLabel).String(latencyObjective), attribute.Key(SloNameLabel).String(sloName), + attribute.Key(CommitLabel).String(ctx.BuildInfo.Commit), + attribute.Key(VersionLabel).String(ctx.BuildInfo.Version), + attribute.Key(BranchLabel).String(ctx.BuildInfo.Branch), }...) 
if ctx.TrackConcurrentCalls { @@ -66,6 +72,9 @@ func Instrument(ctx *autometrics.Context, err *error) { attribute.Key(FunctionLabel).String(ctx.CallInfo.FuncName), attribute.Key(ModuleLabel).String(ctx.CallInfo.ModuleName), attribute.Key(CallerLabel).String(callerLabel), + attribute.Key(CommitLabel).String(ctx.BuildInfo.Commit), + attribute.Key(VersionLabel).String(ctx.BuildInfo.Version), + attribute.Key(BranchLabel).String(ctx.BuildInfo.Branch), }...) } } @@ -74,8 +83,9 @@ func Instrument(ctx *autometrics.Context, err *error) { // // It is meant to be called as the first argument to Instrument in a // defer call. -func PreInstrument(ctx *autometrics.Context) *autometrics.Context { +func PreInstrument(ctx *autometrics.Context) *autometrics.Context { ctx.CallInfo = autometrics.CallerInfo() + ctx.FillBuildInfo() ctx.Context = context.Background() var callerLabel string @@ -89,6 +99,9 @@ func PreInstrument(ctx *autometrics.Context) *autometrics.Context { attribute.Key(FunctionLabel).String(ctx.CallInfo.FuncName), attribute.Key(ModuleLabel).String(ctx.CallInfo.ModuleName), attribute.Key(CallerLabel).String(callerLabel), + attribute.Key(CommitLabel).String(ctx.BuildInfo.Commit), + attribute.Key(VersionLabel).String(ctx.BuildInfo.Version), + attribute.Key(BranchLabel).String(ctx.BuildInfo.Branch), }...) 
} diff --git a/pkg/autometrics/otel/otel.go b/pkg/autometrics/otel/otel.go index bd971ff..7fe82dd 100644 --- a/pkg/autometrics/otel/otel.go +++ b/pkg/autometrics/otel/otel.go @@ -1,10 +1,12 @@ package otel // import "github.com/autometrics-dev/autometrics-go/pkg/autometrics/otel" import ( + "context" "fmt" "github.com/autometrics-dev/autometrics-go/pkg/autometrics" + "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/exporters/prometheus" "go.opentelemetry.io/otel/metric/instrument" "go.opentelemetry.io/otel/sdk/instrumentation" @@ -16,6 +18,7 @@ var ( functionCallsCount instrument.Int64UpDownCounter functionCallsDuration instrument.Float64Histogram functionCallsConcurrent instrument.Int64UpDownCounter + buildInfo instrument.Int64UpDownCounter DefBuckets = autometrics.DefBuckets ) @@ -26,6 +29,8 @@ const ( FunctionCallsDurationName = "function.calls.duration" // FunctionCallsConcurrentName is the name of the openTelemetry metric for the number of simulateneously active calls to specific functions. FunctionCallsConcurrentName = "function.calls.concurrent" + // BuildInfoName is the name of the openTelemetry metric for the build information of the monitored codebase. + BuildInfoName = "build_info" // FunctionLabel is the openTelemetry attribute that describes the function name. // @@ -56,20 +61,36 @@ const ( TargetSuccessRateLabel = "objective.percentile" // SloLabelName is the openTelemetry attribute that describes the name of the Service Level Objective. SloNameLabel = "objective.name" -) + // CommitLabel is the openTelemetry attribute that describes the commit of the monitored codebase. + CommitLabel = "commit" + // VersionLabel is the openTelemetry attribute that describes the version of the monitored codebase. + VersionLabel = "version" + // BranchLabel is the openTelemetry attribute that describes the branch of the build of the monitored codebase. 
+ BranchLabel = "branch" +) func completeMeterName(meterName string) string { return fmt.Sprintf("autometrics/%v", meterName) } +// BuildInfo holds meta information about the build of the instrumented code. +// +// This is a reexport of the autometrics type to allow [Init] to work with only +// the current (otel) package imported at the call site. +type BuildInfo = autometrics.BuildInfo + // Init sets up the metrics required for autometrics' decorated functions and registers // them to the Prometheus exporter // // Make sure that all the latency targets you want to use for SLOs are // present in the histogramBuckets array, otherwise the alerts will fail // to work (they will never trigger.) -func Init(meterName string, histogramBuckets []float64) error { +func Init(meterName string, histogramBuckets []float64, buildInformation BuildInfo) error { + autometrics.SetCommit(buildInformation.Commit) + autometrics.SetVersion(buildInformation.Version) + autometrics.SetBranch(buildInformation.Branch) + exporter, err := prometheus.New( // The units are removed from the exporter so that the names of the // exported metrics after the View rename are consistent with the @@ -114,5 +135,17 @@ func Init(meterName string, histogramBuckets []float64) error { return fmt.Errorf("error initializing %v metric: %w", FunctionCallsConcurrentName, err) } + buildInfo, err = meter.Int64UpDownCounter(BuildInfoName, instrument.WithDescription("The information of the current build.")) + if err != nil { + return fmt.Errorf("error initializing %v metric: %w", BuildInfoName, err) + } + + buildInfo.Add(context.Background(), 1, + []attribute.KeyValue{ + attribute.Key(CommitLabel).String(buildInformation.Commit), + attribute.Key(VersionLabel).String(buildInformation.Version), + attribute.Key(BranchLabel).String(buildInformation.Branch), + }...) 
+ return nil } diff --git a/pkg/autometrics/prometheus/instrument.go b/pkg/autometrics/prometheus/instrument.go index 1d8b803..42b8ca3 100644 --- a/pkg/autometrics/prometheus/instrument.go +++ b/pkg/autometrics/prometheus/instrument.go @@ -14,7 +14,7 @@ import ( // // The first argument SHOULD be a call to PreInstrument so that // the "concurrent calls" gauge is correctly setup. -func Instrument(ctx *autometrics.Context, err *error) { +func Instrument(ctx *autometrics.Context, err *error) { result := "ok" if err != nil && *err != nil { @@ -47,6 +47,9 @@ func Instrument(ctx *autometrics.Context, err *error) { ResultLabel: result, TargetSuccessRateLabel: successObjective, SloNameLabel: sloName, + BranchLabel: ctx.BuildInfo.Branch, + CommitLabel: ctx.BuildInfo.Commit, + VersionLabel: ctx.BuildInfo.Version, }).Inc() functionCallsDuration.With(prometheus.Labels{ FunctionLabel: ctx.CallInfo.FuncName, @@ -55,13 +58,19 @@ func Instrument(ctx *autometrics.Context, err *error) { TargetLatencyLabel: latencyTarget, TargetSuccessRateLabel: latencyObjective, SloNameLabel: sloName, + BranchLabel: ctx.BuildInfo.Branch, + CommitLabel: ctx.BuildInfo.Commit, + VersionLabel: ctx.BuildInfo.Version, }).Observe(time.Since(ctx.StartTime).Seconds()) if ctx.TrackConcurrentCalls { functionCallsConcurrent.With(prometheus.Labels{ - FunctionLabel: ctx.CallInfo.FuncName, - ModuleLabel: ctx.CallInfo.ModuleName, - CallerLabel: callerLabel, + FunctionLabel: ctx.CallInfo.FuncName, + ModuleLabel: ctx.CallInfo.ModuleName, + CallerLabel: callerLabel, + BranchLabel: ctx.BuildInfo.Branch, + CommitLabel: ctx.BuildInfo.Commit, + VersionLabel: ctx.BuildInfo.Version, }).Dec() } } @@ -70,8 +79,9 @@ func Instrument(ctx *autometrics.Context, err *error) { // // It is meant to be called as the first argument to Instrument in a // defer call. 
-func PreInstrument(ctx *autometrics.Context) *autometrics.Context { +func PreInstrument(ctx *autometrics.Context) *autometrics.Context { ctx.CallInfo = autometrics.CallerInfo() + ctx.FillBuildInfo() var callerLabel string if ctx.TrackCallerName { @@ -80,9 +90,12 @@ func PreInstrument(ctx *autometrics.Context) *autometrics.Context { if ctx.TrackConcurrentCalls { functionCallsConcurrent.With(prometheus.Labels{ - FunctionLabel: ctx.CallInfo.FuncName, - ModuleLabel: ctx.CallInfo.ModuleName, - CallerLabel: callerLabel, + FunctionLabel: ctx.CallInfo.FuncName, + ModuleLabel: ctx.CallInfo.ModuleName, + CallerLabel: callerLabel, + BranchLabel: ctx.BuildInfo.Branch, + CommitLabel: ctx.BuildInfo.Commit, + VersionLabel: ctx.BuildInfo.Version, }).Inc() } diff --git a/pkg/autometrics/prometheus/prometheus.go b/pkg/autometrics/prometheus/prometheus.go index aae8ccc..26fed54 100644 --- a/pkg/autometrics/prometheus/prometheus.go +++ b/pkg/autometrics/prometheus/prometheus.go @@ -9,6 +9,7 @@ var ( functionCallsCount *prometheus.CounterVec functionCallsDuration *prometheus.HistogramVec functionCallsConcurrent *prometheus.GaugeVec + buildInfo *prometheus.GaugeVec DefBuckets = autometrics.DefBuckets ) @@ -19,6 +20,8 @@ const ( FunctionCallsDurationName = "function_calls_duration" // FunctionCallsConcurrentName is the name of the prometheus metric for the number of simulateneously active calls to specific functions. FunctionCallsConcurrentName = "function_calls_concurrent" + // BuildInfoName is the name of the prometheus metric for the build information of the monitored codebase. + BuildInfoName = "build_info" // FunctionLabel is the prometheus label that describes the function name. // @@ -47,10 +50,23 @@ const ( // In the case of success objectives, it describes the percentage of calls // that must be successful (i.e. have their [ResultLabel] be 'ok'). TargetSuccessRateLabel = "objective_percentile" - // SloLabelName is the prometheus label that describes the name of the Service Level Objective. 
+ // SloNameLabel is the prometheus label that describes the name of the Service Level Objective. SloNameLabel = "objective_name" + + // CommitLabel is the prometheus label that describes the commit of the monitored codebase. + CommitLabel = "commit" + // VersionLabel is the prometheus label that describes the version of the monitored codebase. + VersionLabel = "version" + // BranchLabel is the prometheus label that describes the branch of the build of the monitored codebase. + BranchLabel = "branch" ) +// BuildInfo holds meta information about the build of the instrumented code. +// +// This is a reexport of the autometrics type to allow [Init] to work with only +// the current (prometheus) package imported at the call site. +type BuildInfo = autometrics.BuildInfo + // Init sets up the metrics required for autometrics' decorated functions and registers // them to the argument registry. // @@ -60,29 +76,45 @@ const ( // Make sure that all the latency targets you want to use for SLOs are // present in the histogramBuckets array, otherwise the alerts will fail // to work (they will never trigger.) 
-func Init(reg *prometheus.Registry, histogramBuckets []float64) error { +func Init(reg *prometheus.Registry, histogramBuckets []float64, buildInformation BuildInfo) error { + autometrics.SetCommit(buildInformation.Commit) + autometrics.SetVersion(buildInformation.Version) + autometrics.SetBranch(buildInformation.Branch) + functionCallsCount = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: FunctionCallsCountName, - }, []string{FunctionLabel, ModuleLabel, CallerLabel, ResultLabel, TargetSuccessRateLabel, SloNameLabel}) + }, []string{FunctionLabel, ModuleLabel, CallerLabel, ResultLabel, TargetSuccessRateLabel, SloNameLabel, CommitLabel, VersionLabel, BranchLabel}) functionCallsDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Name: FunctionCallsDurationName, Buckets: histogramBuckets, - }, []string{FunctionLabel, ModuleLabel, CallerLabel, TargetLatencyLabel, TargetSuccessRateLabel, SloNameLabel}) + }, []string{FunctionLabel, ModuleLabel, CallerLabel, TargetLatencyLabel, TargetSuccessRateLabel, SloNameLabel, CommitLabel, VersionLabel, BranchLabel}) functionCallsConcurrent = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: FunctionCallsConcurrentName, - }, []string{FunctionLabel, ModuleLabel, CallerLabel}) + }, []string{FunctionLabel, ModuleLabel, CallerLabel, CommitLabel, VersionLabel, BranchLabel}) + + buildInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: BuildInfoName, + }, []string{CommitLabel, VersionLabel, BranchLabel}) if reg != nil { reg.MustRegister(functionCallsCount) reg.MustRegister(functionCallsDuration) reg.MustRegister(functionCallsConcurrent) + reg.MustRegister(buildInfo) } else { prometheus.DefaultRegisterer.MustRegister(functionCallsCount) prometheus.DefaultRegisterer.MustRegister(functionCallsDuration) prometheus.DefaultRegisterer.MustRegister(functionCallsConcurrent) + prometheus.DefaultRegisterer.MustRegister(buildInfo) } + buildInfo.With(prometheus.Labels{ + CommitLabel: buildInformation.Commit, + VersionLabel: 
buildInformation.Version, + BranchLabel: buildInformation.Branch, + }).Set(1) + return nil } diff --git a/scripts/build_generator b/scripts/build_generator new file mode 100755 index 0000000..8dd5121 --- /dev/null +++ b/scripts/build_generator @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +set -euo pipefail + +BUILD_PACK="github.com/autometrics-dev/autometrics-go/internal/build" +VERSION=`git describe --tags` +SCRIPT_DIR="$( dirname -- "$( readlink -f -- "$0"; )"; )" + +go build -v -ldflags="-X '${BUILD_PACK}.Version=${VERSION}'" ${SCRIPT_DIR}/../cmd/autometrics/main.go