From 2670345b936c7adb47ba6764c105d3514ee10a00 Mon Sep 17 00:00:00 2001 From: Gerry Agbobada Date: Mon, 24 Apr 2023 17:50:20 +0200 Subject: [PATCH] Reorganize projects to use build info in metrics --- .gitmodules | 3 + README.md | 11 +- {examples/otel => configs}/alertmanager.yml | 0 configs/autometrics.rules.yml | 1594 ----------------- configs/shared | 1 + ...docker-compose.open-telemetry-example.yaml | 12 +- ... => docker-compose.prometheus-example.yaml | 23 +- examples/otel/Dockerfile | 26 +- examples/otel/README.md | 5 + examples/otel/autometrics.rules.yml | 1594 ----------------- examples/otel/configs/alertmanager.yml | 1 + examples/otel/configs/autometrics.rules.yml | 1 + examples/otel/{ => configs}/prometheus.yaml | 0 examples/otel/load.Dockerfile | 8 + examples/otel/scripts/build_server | 11 + examples/{web => otel/scripts}/poll_server | 8 +- examples/web/Dockerfile | 26 +- examples/web/README.md | 33 +- examples/web/alertmanager.yml | 17 - examples/web/autometrics.rules.yml | 1 - examples/web/cmd/main.go | 34 +- examples/web/cmd/main.go.orig | 10 +- examples/web/configs/alertmanager.yml | 1 + examples/web/configs/autometrics.rules.yml | 1 + examples/web/{ => configs}/prometheus.yaml | 0 examples/web/load.Dockerfile | 8 + examples/web/scripts/build_server | 11 + examples/{otel => web/scripts}/poll_server | 8 +- internal/autometrics/doc.go | 3 +- internal/generate/generate_test.go | 8 +- 30 files changed, 182 insertions(+), 3277 deletions(-) create mode 100644 .gitmodules rename {examples/otel => configs}/alertmanager.yml (100%) delete mode 100644 configs/autometrics.rules.yml create mode 160000 configs/shared rename examples/web/docker-compose.yaml => docker-compose.open-telemetry-example.yaml (71%) rename examples/otel/docker-compose.yaml => docker-compose.prometheus-example.yaml (61%) delete mode 100644 examples/otel/autometrics.rules.yml create mode 120000 examples/otel/configs/alertmanager.yml create mode 120000 
examples/otel/configs/autometrics.rules.yml rename examples/otel/{ => configs}/prometheus.yaml (100%) create mode 100644 examples/otel/load.Dockerfile create mode 100755 examples/otel/scripts/build_server rename examples/{web => otel/scripts}/poll_server (50%) delete mode 100644 examples/web/alertmanager.yml delete mode 120000 examples/web/autometrics.rules.yml create mode 120000 examples/web/configs/alertmanager.yml create mode 120000 examples/web/configs/autometrics.rules.yml rename examples/web/{ => configs}/prometheus.yaml (100%) create mode 100644 examples/web/load.Dockerfile create mode 100755 examples/web/scripts/build_server rename examples/{otel => web/scripts}/poll_server (50%) diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..7ab9faf --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "autometrics-shared"] + path = configs/shared + url = https://github.com/autometrics-dev/autometrics-shared.git diff --git a/README.md b/README.md index 9b5f823..a006b06 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,16 @@ trigger alerts directly from production usage: ![a Slack bot is posting an alert directly in the channel](./assets/slack-alert-example.png) A fully working use-case and example of library usage is available in the -[examples/web](./examples/web) subdirectory +[examples/web](./examples/web) subdirectory. You can build and run load on the +example server using: + +```console +git submodule update --init +docker compose -f docker-compose.prometheus-example.yaml up +``` + +And then explore the generated links by opening the [main +file](./examples/web/cmd/main.go). 
## How to use diff --git a/examples/otel/alertmanager.yml b/configs/alertmanager.yml similarity index 100% rename from examples/otel/alertmanager.yml rename to configs/alertmanager.yml diff --git a/configs/autometrics.rules.yml b/configs/autometrics.rules.yml deleted file mode 100644 index 57d3038..0000000 --- a/configs/autometrics.rules.yml +++ /dev/null @@ -1,1594 +0,0 @@ - ---- -# Code generated by Sloth (v0.11.0): https://github.com/slok/sloth. -# DO NOT EDIT. - -groups: -- name: sloth-slo-sli-recordings-autometrics-success-rate-90 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[5m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[5m])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[30m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[30m])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[1h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[1h])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[2h]))) - / - (sum by 
(objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[2h])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[6h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[6h])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[1d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[1d])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[3d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[3d])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"}[30d]) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 30d -- name: 
sloth-slo-meta-recordings-autometrics-success-rate-90 - rules: - - record: slo:objective:ratio - expr: vector(0.9) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:error_budget:ratio - expr: vector(1-0.9) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", - sloth_slo="success-rate-90"} - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-success-rate-90 - sloth_mode: cli-gen-prom - sloth_objective: "90" - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_spec: 
prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-success-rate-90 - rules: - - alert: High Error Rate SLO - 90% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (14.4 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (14.4 * 0.1)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (6 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (6 * 0.1)) without (sloth_window) - ) - labels: - category: success-rate - severity: page - sloth_severity: page - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
- - alert: High Error Rate SLO - 90% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (3 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (3 * 0.1)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (1 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (1 * 0.1)) without (sloth_window) - ) - labels: - category: success-rate - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-success-rate-95 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[5m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[5m])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[30m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[30m])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[1h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[1h])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[2h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[2h])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[6h]))) - / - (sum by (objective_name, objective_percentile) 
(rate(function_calls_count{objective_percentile="95"}[6h])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[1d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[1d])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[3d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[3d])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"}[30d]) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-success-rate-95 - rules: - - record: slo:objective:ratio - expr: vector(0.95) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:error_budget:ratio - expr: vector(1-0.95) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:time_period:days - expr: vector(30) - labels: - 
sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", - sloth_slo="success-rate-95"} - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-success-rate-95 - sloth_mode: cli-gen-prom - sloth_objective: "95" - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-success-rate-95 - rules: - - alert: High Error Rate SLO - 95% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (14.4 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > 
(14.4 * 0.05)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (6 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (6 * 0.05)) without (sloth_window) - ) - labels: - category: success-rate - severity: page - sloth_severity: page - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Error Rate SLO - 95% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (3 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (3 * 0.05)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (1 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (1 * 0.05)) without (sloth_window) - ) - labels: - category: success-rate - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-success-rate-99 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[5m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[5m])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[30m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[30m])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[1h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[1h])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[2h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[2h])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[6h]))) - / - (sum by (objective_name, objective_percentile) 
(rate(function_calls_count{objective_percentile="99"}[6h])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[1d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[1d])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[3d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[3d])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"}[30d]) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-success-rate-99 - rules: - - record: slo:objective:ratio - expr: vector(0.99) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:error_budget:ratio - expr: vector(1-0.99) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:time_period:days - expr: vector(30) - labels: - 
sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", - sloth_slo="success-rate-99"} - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-success-rate-99 - sloth_mode: cli-gen-prom - sloth_objective: "99" - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-success-rate-99 - rules: - - alert: High Error Rate SLO - 99% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (14.4 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > 
(14.4 * 0.01)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (6 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (6 * 0.01)) without (sloth_window) - ) - labels: - category: success-rate - severity: page - sloth_severity: page - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Error Rate SLO - 99% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (3 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (3 * 0.01)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (1 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (1 * 0.01)) without (sloth_window) - ) - labels: - category: success-rate - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-success-rate-99_9 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[5m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[5m])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[30m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[30m])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[1h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[1h])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[2h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[2h])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[6h]))) - / - (sum by (objective_name, 
objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[6h])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[1d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[1d])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[3d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[3d])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"}[30d]) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-success-rate-99_9 - rules: - - record: slo:objective:ratio - expr: vector(0.9990000000000001) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: slo:error_budget:ratio - expr: vector(1-0.9990000000000001) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - 
sloth_slo: success-rate-99_9 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-success-rate-99_9", - sloth_service="autometrics", sloth_slo="success-rate-99_9"} - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_mode: cli-gen-prom - sloth_objective: "99.9" - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-success-rate-99_9 - rules: - - alert: High Error Rate SLO - 99.9% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (14.4 * 0.0009999999999999432)) 
without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (6 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (6 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - category: success-rate - severity: page - sloth_severity: page - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Error Rate SLO - 99.9% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (3 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (3 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (1 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (1 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - category: success-rate - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-latency-90 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[5m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[5m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[5m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[5m])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[30m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[30m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[30m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[30m])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[1h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[1h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - 
label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[1h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[1h])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[2h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[2h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[2h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[2h])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[6h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[6h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[6h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[6h])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) 
(rate(function_calls_duration_count{objective_percentile="90"}[1d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[1d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[1d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[1d])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[3d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[3d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[3d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[3d])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"}[30d]) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-latency-90 - rules: - - record: slo:objective:ratio - expr: 
vector(0.9) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:error_budget:ratio - expr: vector(1-0.9) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-latency-90", sloth_service="autometrics", - sloth_slo="latency-90"} - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-latency-90 - sloth_mode: cli-gen-prom - sloth_objective: "90" - sloth_service: autometrics - sloth_slo: latency-90 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-latency-90 - rules: - - alert: High Latency SLO - 90% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-90", 
sloth_service="autometrics", sloth_slo="latency-90"} > (14.4 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (14.4 * 0.1)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (6 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (6 * 0.1)) without (sloth_window) - ) - labels: - category: latency - severity: page - sloth_severity: page - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Latency SLO - 90% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (3 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (3 * 0.1)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (1 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (1 * 0.1)) without (sloth_window) - ) - labels: - category: latency - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-latency-95 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[5m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[5m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[5m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[5m])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[30m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[30m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[30m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[30m])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[1h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[1h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - 
label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[1h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[1h])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[2h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[2h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[2h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[2h])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[6h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[6h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[6h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[6h])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) 
(rate(function_calls_duration_count{objective_percentile="95"}[1d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[1d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[1d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[1d])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[3d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[3d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[3d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[3d])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"}[30d]) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-latency-95 - rules: - - record: slo:objective:ratio - expr: 
vector(0.95) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:error_budget:ratio - expr: vector(1-0.95) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-latency-95", sloth_service="autometrics", - sloth_slo="latency-95"} - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-latency-95 - sloth_mode: cli-gen-prom - sloth_objective: "95" - sloth_service: autometrics - sloth_slo: latency-95 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-latency-95 - rules: - - alert: High Latency SLO - 95% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-95", 
sloth_service="autometrics", sloth_slo="latency-95"} > (14.4 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (14.4 * 0.05)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (6 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (6 * 0.05)) without (sloth_window) - ) - labels: - category: latency - severity: page - sloth_severity: page - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Latency SLO - 95% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (3 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (3 * 0.05)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (1 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (1 * 0.05)) without (sloth_window) - ) - labels: - category: latency - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-latency-99 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[5m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[5m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[5m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[5m])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[30m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[30m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[30m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[30m])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[1h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[1h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - 
label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[1h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[1h])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[2h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[2h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[2h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[2h])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[6h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[6h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[6h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[6h])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) 
(rate(function_calls_duration_count{objective_percentile="99"}[1d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[1d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[1d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[1d])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[3d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[3d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[3d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[3d])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"}[30d]) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-latency-99 - rules: - - record: slo:objective:ratio - expr: 
vector(0.99) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:error_budget:ratio - expr: vector(1-0.99) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-latency-99", sloth_service="autometrics", - sloth_slo="latency-99"} - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-latency-99 - sloth_mode: cli-gen-prom - sloth_objective: "99" - sloth_service: autometrics - sloth_slo: latency-99 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-latency-99 - rules: - - alert: High Latency SLO - 99% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99", 
sloth_service="autometrics", sloth_slo="latency-99"} > (14.4 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (14.4 * 0.01)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (6 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (6 * 0.01)) without (sloth_window) - ) - labels: - category: latency - severity: page - sloth_severity: page - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Latency SLO - 99% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (3 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (3 * 0.01)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (1 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (1 * 0.01)) without (sloth_window) - ) - labels: - category: latency - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-latency-99_9 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[5m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[5m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[5m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[5m])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[30m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[30m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[30m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[30m])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[1h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[1h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - 
label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[1h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[1h])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[2h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[2h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[2h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[2h])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[6h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[6h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[6h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[6h])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by 
(objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[1d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[1d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[1d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[1d])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[3d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[3d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[3d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[3d])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"}[30d]) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 30d -- name: 
sloth-slo-meta-recordings-autometrics-latency-99_9 - rules: - - record: slo:objective:ratio - expr: vector(0.9990000000000001) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:error_budget:ratio - expr: vector(1-0.9990000000000001) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", - sloth_slo="latency-99_9"} - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-latency-99_9 - sloth_mode: cli-gen-prom - sloth_objective: "99.9" - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: 
sloth-slo-alerts-autometrics-latency-99_9 - rules: - - alert: High Latency SLO - 99.9% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (6 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (6 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - category: latency - severity: page - sloth_severity: page - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
- - alert: High Latency SLO - 99.9% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (3 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (3 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (1 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (1 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - category: latency - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
diff --git a/configs/shared b/configs/shared new file mode 160000 index 0000000..fd4aa1e --- /dev/null +++ b/configs/shared @@ -0,0 +1 @@ +Subproject commit fd4aa1e7fa3aaa7a736f778ee782e522df73b336 diff --git a/examples/web/docker-compose.yaml b/docker-compose.open-telemetry-example.yaml similarity index 71% rename from examples/web/docker-compose.yaml rename to docker-compose.open-telemetry-example.yaml index 335e10d..5582f85 100644 --- a/examples/web/docker-compose.yaml +++ b/docker-compose.open-telemetry-example.yaml @@ -20,8 +20,8 @@ services: container_name: alertmanager restart: unless-stopped volumes: - - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml - - ./slack_url.txt:/etc/alertmanager/slack_url + - ./examples/otel/configs/alertmanager.yml:/etc/alertmanager/alertmanager.yml + - ./examples/otel/configs/slack_url.txt:/etc/alertmanager/slack_url command: - '--config.file=/etc/alertmanager/alertmanager.yml' expose: @@ -34,8 +34,8 @@ services: container_name: prometheus restart: unless-stopped volumes: - - ./prometheus.yaml:/etc/prometheus/prometheus.yaml - - ./autometrics.rules.yml:/etc/prometheus/autometrics.rules.yml + - ./examples/otel/configs/prometheus.yaml:/etc/prometheus/prometheus.yaml + - ./examples/otel/configs/autometrics.rules.yml:/etc/prometheus/autometrics.rules.yml command: - '--config.file=/etc/prometheus/prometheus.yaml' expose: @@ -48,7 +48,9 @@ services: - web-server web-server: - build: . + build: + context: . 
+ dockerfile: examples/otel/Dockerfile container_name: web-server restart: unless-stopped expose: diff --git a/examples/otel/docker-compose.yaml b/docker-compose.prometheus-example.yaml similarity index 61% rename from examples/otel/docker-compose.yaml rename to docker-compose.prometheus-example.yaml index 335e10d..aaa805d 100644 --- a/examples/otel/docker-compose.yaml +++ b/docker-compose.prometheus-example.yaml @@ -20,8 +20,8 @@ services: container_name: alertmanager restart: unless-stopped volumes: - - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml - - ./slack_url.txt:/etc/alertmanager/slack_url + - ./examples/web/configs/alertmanager.yml:/etc/alertmanager/alertmanager.yml + - ./examples/web/configs/slack_url.txt:/etc/alertmanager/slack_url command: - '--config.file=/etc/alertmanager/alertmanager.yml' expose: @@ -34,8 +34,8 @@ services: container_name: prometheus restart: unless-stopped volumes: - - ./prometheus.yaml:/etc/prometheus/prometheus.yaml - - ./autometrics.rules.yml:/etc/prometheus/autometrics.rules.yml + - ./examples/web/configs/prometheus.yaml:/etc/prometheus/prometheus.yaml + - ./examples/web/configs/autometrics.rules.yml:/etc/prometheus/autometrics.rules.yml command: - '--config.file=/etc/prometheus/prometheus.yaml' expose: @@ -48,10 +48,23 @@ services: - web-server web-server: - build: . + build: + context: . + dockerfile: examples/web/Dockerfile container_name: web-server restart: unless-stopped expose: - 62086 ports: - "62086:62086" + + load-server: + build: + context: . 
+ dockerfile: examples/web/load.Dockerfile + environment: + TARGET_HOST: web-server + container_name: load-server + restart: unless-stopped + depends_on: + - web-server diff --git a/examples/otel/Dockerfile b/examples/otel/Dockerfile index 782f7a9..1a87b34 100644 --- a/examples/otel/Dockerfile +++ b/examples/otel/Dockerfile @@ -1,16 +1,24 @@ FROM golang:1.20-alpine MAINTAINER Fiberplane +ARG version=development -# Cannot really build the demo image from -# the examples subfolder because of -# relative imports shenanigans that go out of build context (i.e. upwards) -# -# Use -# GOOS=linux GOARCH=amd64 go build -o web-server ./cmd/main.go -# -# To build the web-server app +RUN apk update && apk add git -COPY web-server / +WORKDIR /app + +COPY . ./ + +RUN go mod download + +WORKDIR /app/examples/otel + +RUN go generate cmd/main.go + +ENV VERSION="$version" + +RUN scripts/build_server + +RUN cp web-server / EXPOSE 62086 diff --git a/examples/otel/README.md b/examples/otel/README.md index 3e29afd..8cb369f 100644 --- a/examples/otel/README.md +++ b/examples/otel/README.md @@ -11,3 +11,8 @@ You can notice the 3 differences that are mentionned in the top-level README: - The autometrics call in the Go generator has the `-otel` flag - The `amImpl.Init` call uses a different first argument, with the name of the OpenTelemetry scope to use + +## Quickstart + +You can build and run the example by using the +`docker-compose.open-telemetry-example.yaml` file at the root of the repo. diff --git a/examples/otel/autometrics.rules.yml b/examples/otel/autometrics.rules.yml deleted file mode 100644 index 57d3038..0000000 --- a/examples/otel/autometrics.rules.yml +++ /dev/null @@ -1,1594 +0,0 @@ - ---- -# Code generated by Sloth (v0.11.0): https://github.com/slok/sloth. -# DO NOT EDIT.
- -groups: -- name: sloth-slo-sli-recordings-autometrics-success-rate-90 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[5m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[5m])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[30m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[30m])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[1h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[1h])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[2h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[2h])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[6h]))) - / - (sum by (objective_name, objective_percentile) 
(rate(function_calls_count{objective_percentile="90"}[6h])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[1d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[1d])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[3d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[3d])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"}[30d]) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-success-rate-90 - rules: - - record: slo:objective:ratio - expr: vector(0.9) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:error_budget:ratio - expr: vector(1-0.9) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:time_period:days - expr: vector(30) - labels: - 
sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", - sloth_slo="success-rate-90"} - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-success-rate-90 - sloth_mode: cli-gen-prom - sloth_objective: "90" - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-success-rate-90 - rules: - - alert: High Error Rate SLO - 90% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (14.4 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > 
(14.4 * 0.1)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (6 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (6 * 0.1)) without (sloth_window) - ) - labels: - category: success-rate - severity: page - sloth_severity: page - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Error Rate SLO - 90% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (3 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (3 * 0.1)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (1 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (1 * 0.1)) without (sloth_window) - ) - labels: - category: success-rate - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-success-rate-95 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[5m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[5m])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[30m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[30m])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[1h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[1h])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[2h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[2h])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[6h]))) - / - (sum by (objective_name, objective_percentile) 
(rate(function_calls_count{objective_percentile="95"}[6h])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[1d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[1d])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[3d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[3d])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"}[30d]) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-success-rate-95 - rules: - - record: slo:objective:ratio - expr: vector(0.95) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:error_budget:ratio - expr: vector(1-0.95) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:time_period:days - expr: vector(30) - labels: - 
sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", - sloth_slo="success-rate-95"} - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-success-rate-95 - sloth_mode: cli-gen-prom - sloth_objective: "95" - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-success-rate-95 - rules: - - alert: High Error Rate SLO - 95% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (14.4 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > 
(14.4 * 0.05)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (6 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (6 * 0.05)) without (sloth_window) - ) - labels: - category: success-rate - severity: page - sloth_severity: page - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Error Rate SLO - 95% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (3 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (3 * 0.05)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (1 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (1 * 0.05)) without (sloth_window) - ) - labels: - category: success-rate - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-success-rate-99 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[5m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[5m])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[30m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[30m])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[1h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[1h])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[2h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[2h])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[6h]))) - / - (sum by (objective_name, objective_percentile) 
(rate(function_calls_count{objective_percentile="99"}[6h])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[1d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[1d])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[3d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[3d])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"}[30d]) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-success-rate-99 - rules: - - record: slo:objective:ratio - expr: vector(0.99) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:error_budget:ratio - expr: vector(1-0.99) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:time_period:days - expr: vector(30) - labels: - 
sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", - sloth_slo="success-rate-99"} - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-success-rate-99 - sloth_mode: cli-gen-prom - sloth_objective: "99" - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-success-rate-99 - rules: - - alert: High Error Rate SLO - 99% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (14.4 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > 
(14.4 * 0.01)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (6 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (6 * 0.01)) without (sloth_window) - ) - labels: - category: success-rate - severity: page - sloth_severity: page - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Error Rate SLO - 99% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (3 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (3 * 0.01)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (1 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (1 * 0.01)) without (sloth_window) - ) - labels: - category: success-rate - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-success-rate-99_9 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[5m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[5m])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[30m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[30m])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[1h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[1h])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[2h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[2h])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[6h]))) - / - (sum by (objective_name, 
objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[6h])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[1d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[1d])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[3d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[3d])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"}[30d]) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-success-rate-99_9 - rules: - - record: slo:objective:ratio - expr: vector(0.9990000000000001) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: slo:error_budget:ratio - expr: vector(1-0.9990000000000001) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - 
sloth_slo: success-rate-99_9 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-success-rate-99_9", - sloth_service="autometrics", sloth_slo="success-rate-99_9"} - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_mode: cli-gen-prom - sloth_objective: "99.9" - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-success-rate-99_9 - rules: - - alert: High Error Rate SLO - 99.9% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (14.4 * 0.0009999999999999432)) 
without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (6 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (6 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - category: success-rate - severity: page - sloth_severity: page - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Error Rate SLO - 99.9% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (3 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (3 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (1 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (1 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - category: success-rate - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-latency-90 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[5m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[5m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[5m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[5m])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[30m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[30m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[30m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[30m])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[1h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[1h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - 
label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[1h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[1h])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[2h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[2h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[2h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[2h])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[6h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[6h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[6h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[6h])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) 
(rate(function_calls_duration_count{objective_percentile="90"}[1d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[1d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[1d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[1d])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[3d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[3d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[3d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[3d])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"}[30d]) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-latency-90 - rules: - - record: slo:objective:ratio - expr: 
vector(0.9) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:error_budget:ratio - expr: vector(1-0.9) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-latency-90", sloth_service="autometrics", - sloth_slo="latency-90"} - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-latency-90 - sloth_mode: cli-gen-prom - sloth_objective: "90" - sloth_service: autometrics - sloth_slo: latency-90 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-latency-90 - rules: - - alert: High Latency SLO - 90% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-90", 
sloth_service="autometrics", sloth_slo="latency-90"} > (14.4 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (14.4 * 0.1)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (6 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (6 * 0.1)) without (sloth_window) - ) - labels: - category: latency - severity: page - sloth_severity: page - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Latency SLO - 90% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (3 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (3 * 0.1)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (1 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (1 * 0.1)) without (sloth_window) - ) - labels: - category: latency - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-latency-95 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[5m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[5m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[5m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[5m])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[30m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[30m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[30m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[30m])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[1h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[1h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - 
label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[1h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[1h])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[2h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[2h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[2h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[2h])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[6h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[6h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[6h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[6h])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) 
(rate(function_calls_duration_count{objective_percentile="95"}[1d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[1d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[1d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[1d])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[3d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[3d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[3d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[3d])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"}[30d]) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-latency-95 - rules: - - record: slo:objective:ratio - expr: 
vector(0.95) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:error_budget:ratio - expr: vector(1-0.95) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-latency-95", sloth_service="autometrics", - sloth_slo="latency-95"} - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-latency-95 - sloth_mode: cli-gen-prom - sloth_objective: "95" - sloth_service: autometrics - sloth_slo: latency-95 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-latency-95 - rules: - - alert: High Latency SLO - 95% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-95", 
sloth_service="autometrics", sloth_slo="latency-95"} > (14.4 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (14.4 * 0.05)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (6 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (6 * 0.05)) without (sloth_window) - ) - labels: - category: latency - severity: page - sloth_severity: page - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Latency SLO - 95% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (3 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (3 * 0.05)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (1 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (1 * 0.05)) without (sloth_window) - ) - labels: - category: latency - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-latency-99 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[5m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[5m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[5m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[5m])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[30m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[30m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[30m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[30m])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[1h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[1h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - 
label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[1h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[1h])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[2h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[2h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[2h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[2h])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[6h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[6h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[6h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[6h])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) 
(rate(function_calls_duration_count{objective_percentile="99"}[1d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[1d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[1d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[1d])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[3d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[3d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[3d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[3d])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"}[30d]) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-latency-99 - rules: - - record: slo:objective:ratio - expr: 
vector(0.99) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:error_budget:ratio - expr: vector(1-0.99) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-latency-99", sloth_service="autometrics", - sloth_slo="latency-99"} - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-latency-99 - sloth_mode: cli-gen-prom - sloth_objective: "99" - sloth_service: autometrics - sloth_slo: latency-99 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-latency-99 - rules: - - alert: High Latency SLO - 99% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99", 
sloth_service="autometrics", sloth_slo="latency-99"} > (14.4 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (14.4 * 0.01)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (6 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (6 * 0.01)) without (sloth_window) - ) - labels: - category: latency - severity: page - sloth_severity: page - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Latency SLO - 99% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (3 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (3 * 0.01)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (1 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (1 * 0.01)) without (sloth_window) - ) - labels: - category: latency - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-latency-99_9 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[5m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[5m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[5m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[5m])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[30m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[30m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[30m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[30m])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[1h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[1h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - 
label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[1h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[1h])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[2h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[2h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[2h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[2h])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[6h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[6h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[6h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[6h])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by 
(objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[1d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[1d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[1d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[1d])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[3d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[3d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[3d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[3d])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"}[30d]) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 30d -- name: 
sloth-slo-meta-recordings-autometrics-latency-99_9 - rules: - - record: slo:objective:ratio - expr: vector(0.9990000000000001) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:error_budget:ratio - expr: vector(1-0.9990000000000001) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", - sloth_slo="latency-99_9"} - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-latency-99_9 - sloth_mode: cli-gen-prom - sloth_objective: "99.9" - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: 
sloth-slo-alerts-autometrics-latency-99_9 - rules: - - alert: High Latency SLO - 99.9% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (6 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (6 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - category: latency - severity: page - sloth_severity: page - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
- - alert: High Latency SLO - 99.9% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (3 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (3 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (1 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (1 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - category: latency - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
diff --git a/examples/otel/configs/alertmanager.yml b/examples/otel/configs/alertmanager.yml new file mode 120000 index 0000000..2b1bf0c --- /dev/null +++ b/examples/otel/configs/alertmanager.yml @@ -0,0 +1 @@ +../../../configs/alertmanager.yml \ No newline at end of file diff --git a/examples/otel/configs/autometrics.rules.yml b/examples/otel/configs/autometrics.rules.yml new file mode 120000 index 0000000..2048540 --- /dev/null +++ b/examples/otel/configs/autometrics.rules.yml @@ -0,0 +1 @@ +../../../configs/shared/autometrics.rules.yml \ No newline at end of file diff --git a/examples/otel/prometheus.yaml b/examples/otel/configs/prometheus.yaml similarity index 100% rename from examples/otel/prometheus.yaml rename to examples/otel/configs/prometheus.yaml diff --git a/examples/otel/load.Dockerfile b/examples/otel/load.Dockerfile new file mode 100644 index 0000000..b20451d --- /dev/null +++ b/examples/otel/load.Dockerfile @@ -0,0 +1,8 @@ +FROM golang:1.20-alpine +MAINTAINER Fiberplane + +RUN apk update && apk add curl + +COPY examples/web/scripts/poll_server / + +CMD [ "/poll_server" ] diff --git a/examples/otel/scripts/build_server b/examples/otel/scripts/build_server new file mode 100755 index 0000000..865bd73 --- /dev/null +++ b/examples/otel/scripts/build_server @@ -0,0 +1,11 @@ +#!/bin/sh + +set -euo pipefail + +VERSION=${VERSION:-development} +COMMIT=`git log -1 --format="%H"` +BUILD_TIME=`date -Iseconds` +SCRIPT_DIR="$( dirname -- "$( readlink -f -- "$0"; )"; )" + + +go build -v -ldflags="-X 'main.Version=${VERSION}' -X 'main.Commit=${COMMIT}' -X 'main.BuildTime=${BUILD_TIME}'" -o web-server ${SCRIPT_DIR}/../cmd/main.go diff --git a/examples/web/poll_server b/examples/otel/scripts/poll_server similarity index 50% rename from examples/web/poll_server rename to examples/otel/scripts/poll_server index 2632d28..e395463 100755 --- a/examples/web/poll_server +++ b/examples/otel/scripts/poll_server @@ -2,13 +2,15 @@ set -euo pipefail 
+TARGET_HOST="${TARGET_HOST:-localhost}" + while true do if [ "$(($RANDOM % 2))" == "0" ]; then - curl http://localhost:62086/random-error + curl "http://${TARGET_HOST}:62086/random-error" fi if [ "$(($RANDOM % 4))" == "0" ]; then - curl http://localhost:62086/ + curl "http://${TARGET_HOST}:62086/" fi - sleep 1 + sleep 0.2 done diff --git a/examples/web/Dockerfile b/examples/web/Dockerfile index 782f7a9..1a87b34 100644 --- a/examples/web/Dockerfile +++ b/examples/web/Dockerfile @@ -1,16 +1,24 @@ FROM golang:1.20-alpine MAINTAINER Fiberplane +ARG version=development -# Cannot really build the demo image from -# the examples subfolder because of -# relative imports shenanigans that go out of build context (i.e. upwards) -# -# Use -# GOOS=linux GOARCH=amd64 go build -o web-server ./cmd/main.go -# -# To build the web-server app +RUN apk update && apk add git -COPY web-server / +WORKDIR /app + +COPY . ./ + +RUN go mod download + +WORKDIR /app/examples/web + +RUN go generate cmd/main.go + +ENV VERSION="$version" + +RUN scripts/build_server + +RUN cp web-server / EXPOSE 62086 diff --git a/examples/web/README.md b/examples/web/README.md index c52d733..9cdb9ca 100644 --- a/examples/web/README.md +++ b/examples/web/README.md @@ -8,9 +8,12 @@ It shows the generator usage and sets up Prometheus to showcase the ## Quick start ``` sh -GOOS=linux GOARCH=amd64 go build -o web-server ./cmd/main.go -docker compose up -d -./poll_server +# Go to the root of the repo +cd ../.. +# Build all the images +docker compose -f docker-compose.prometheus-example.yaml build +# Run all the services +docker compose -f docker-compose.prometheus-example.yaml up ``` Then open [main](./cmd/main.go) in your editor and interact with the documentation links! @@ -21,7 +24,8 @@ Then open [main](./cmd/main.go) in your editor and interact with the documentati Optionnally, create a slack integration with an "incoming webhook" for one of your channels, and put the URL of the webhook (a secret!) 
in `slack_url.txt` in -the directory. That will enable alerting in Slack directly through Alertmanager. +the [configs](./configs) directory. That will enable alerting in Slack directly +through Alertmanager. You can see that the name of the service "API" comes directly from the annotation in the code. @@ -36,7 +40,6 @@ You can even monitor all the alerts triggering through Prometheus or Alertmanage In order to run this example you need: -- Go (at least 1.18) - Docker - Docker Compose @@ -66,20 +69,24 @@ The generator is idempotent. ### Building the docker image -Build the web-server for the image architecture: +Build the web-server in an image. There are 2 important things in the +image [recipe](./Dockerfile): +- The context of the image is the root of the repository, only so that this + example runs the `development` version of the code, and +- There is a specific [build script](./scripts/build_server) that uses + Go linker flags to inject build and version information in the binary. ```sh -GOOS=linux GOARCH=amd64 go build -o web-server ./cmd/main.go -docker compose build +docker build -f Dockerfile --build-arg version=1.0.0 -t web-server ../.. ``` ### Start the services -In one terminal you can launch the stack and the small helper script to poll the the server: +In one terminal you can launch the image and the small helper script to poll the server: ```sh -docker compose up -d -./poll_server +cd ../.. +docker compose -f docker-compose.prometheus-example.yaml up ``` ### Check the links on Prometheus @@ -104,9 +111,9 @@ configuration to the correct notification service: ![Alertmanager alerts dashboard showing the alerts firing](../../assets/alertmanager-alert-example.png) -This demo example has a [minimal configuration](./alertmanager.yml) for alerts +This demo example has a [minimal configuration](./configs/alertmanager.yml) for alerts that expects a file `slack_url.txt` to be passed in docker-compose context.
-Create the file in the same folder as this README, and if the file exists, the +Create the file in the [configs folder](./configs), and if the file exists, the triggered alerts automatically go on Slack to the configured channel: ![a Slack bot is posting an alert directly in the channel](../../assets/slack-alert-example.png) diff --git a/examples/web/alertmanager.yml b/examples/web/alertmanager.yml deleted file mode 100644 index 92d01ab..0000000 --- a/examples/web/alertmanager.yml +++ /dev/null @@ -1,17 +0,0 @@ -global: - # Also possible to use the URL directly - # Ex: `slack_api_url: 'https://slack.com/...'` - slack_api_url_file: '/etc/alertmanager/slack_url' - -route: - receiver: 'slack-notifications' - group_by: [sloth_service, sloth_slo, objective_name] - -receivers: -- name: 'slack-notifications' - slack_configs: - # Channel is ignored when using a webhook. The webhook URL encodes the - # channel the alerts will be posted to. - - channel: '#alerts' - title: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}" - text: "{{ range .Alerts }}{{ .Annotations.description }}\n{{ end }}" diff --git a/examples/web/autometrics.rules.yml b/examples/web/autometrics.rules.yml deleted file mode 120000 index 9f80a00..0000000 --- a/examples/web/autometrics.rules.yml +++ /dev/null @@ -1 +0,0 @@ -../../configs/autometrics.rules.yml \ No newline at end of file diff --git a/examples/web/cmd/main.go b/examples/web/cmd/main.go index e9260ca..7a5a7c3 100644 --- a/examples/web/cmd/main.go +++ b/examples/web/cmd/main.go @@ -16,6 +16,10 @@ import ( // README //go:generate go run ../../../cmd/autometrics/main.go +var Version = "development" +var Commit = "n/a" +var BuildTime string + func main() { rand.Seed(time.Now().UnixNano()) @@ -26,9 +30,9 @@ func main() { nil, amImpl.DefBuckets, amImpl.BuildInfo{ - Version: "0.4.0", - Commit: "anySHA", - BuildTime: "", + Version: Version, + Commit: Commit, + BuildTime: BuildTime, }, ) @@ -62,12 +66,12 @@ func main() { // // 
autometrics:doc-end Generated documentation by Autometrics. // -// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Latency (95th and 99th percentiles)]: http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60indexHandler%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29%29%29&g0.tab=0 -// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60indexHandler%60+function%0A%0Asum+by+%28function%2C+module%29+function_calls_concurrent%7Bfunction%3D%22indexHandler%22%7D&g0.tab=0 -// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Error Ratio Callee]: 
http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0 +// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0 +// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0 +// [Latency (95th and 99th percentiles)]: 
http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60indexHandler%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0 +// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60indexHandler%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22indexHandler%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0 +// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0 +// [Error Ratio Callee]: 
http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0 // //autometrics:doc --slo "API" --latency-target 99 --latency-ms 250 func indexHandler(w http.ResponseWriter, _ *http.Request) error { @@ -108,12 +112,12 @@ var handlerError = errors.New("failed to handle request") // // autometrics:doc-end Generated documentation by Autometrics. // -// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Latency (95th and 99th percentiles)]: 
http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60randomErrorHandler%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29%29%29&g0.tab=0 -// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60randomErrorHandler%60+function%0A%0Asum+by+%28function%2C+module%29+function_calls_concurrent%7Bfunction%3D%22randomErrorHandler%22%7D&g0.tab=0 -// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Error Ratio Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0 +// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0 +// [Error Ratio]: 
http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0 +// [Latency (95th and 99th percentiles)]: http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60randomErrorHandler%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0 +// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60randomErrorHandler%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22randomErrorHandler%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0 +// [Request Rate Callee]: 
http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0 +// [Error Ratio Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0 // //autometrics:doc --slo "API" --success-target 90 func randomErrorHandler(w http.ResponseWriter, _ *http.Request) (err error) { diff --git a/examples/web/cmd/main.go.orig b/examples/web/cmd/main.go.orig index 2fe7a85..86057f7 100644 --- a/examples/web/cmd/main.go.orig +++ b/examples/web/cmd/main.go.orig @@ -16,6 +16,10 @@ import ( // README //go:generate go run ../../../cmd/autometrics/main.go +var Version = "development" +var Commit = "n/a" +var BuildTime string + func main() { rand.Seed(time.Now().UnixNano()) @@ -26,9 +30,9 @@ func main() { nil, amImpl.DefBuckets, amImpl.BuildInfo{ - Version: "0.4.0", - Commit: "anySHA", - BuildTime: "", + Version: Version, + Commit: Commit, + BuildTime: BuildTime, }, ) diff --git a/examples/web/configs/alertmanager.yml b/examples/web/configs/alertmanager.yml new file mode 120000 index 0000000..2b1bf0c --- /dev/null +++ b/examples/web/configs/alertmanager.yml @@ -0,0 +1 @@ 
+../../../configs/alertmanager.yml \ No newline at end of file diff --git a/examples/web/configs/autometrics.rules.yml b/examples/web/configs/autometrics.rules.yml new file mode 120000 index 0000000..2048540 --- /dev/null +++ b/examples/web/configs/autometrics.rules.yml @@ -0,0 +1 @@ +../../../configs/shared/autometrics.rules.yml \ No newline at end of file diff --git a/examples/web/prometheus.yaml b/examples/web/configs/prometheus.yaml similarity index 100% rename from examples/web/prometheus.yaml rename to examples/web/configs/prometheus.yaml diff --git a/examples/web/load.Dockerfile b/examples/web/load.Dockerfile new file mode 100644 index 0000000..b20451d --- /dev/null +++ b/examples/web/load.Dockerfile @@ -0,0 +1,8 @@ +FROM golang:1.20-alpine +MAINTAINER Fiberplane + +RUN apk update && apk add curl + +COPY examples/web/scripts/poll_server / + +CMD [ "/poll_server" ] diff --git a/examples/web/scripts/build_server b/examples/web/scripts/build_server new file mode 100755 index 0000000..865bd73 --- /dev/null +++ b/examples/web/scripts/build_server @@ -0,0 +1,11 @@ +#!/bin/sh + +set -euo pipefail + +VERSION=${VERSION:-development} +COMMIT=`git log -1 --format="%H"` +BUILD_TIME=`date -Iseconds` +SCRIPT_DIR="$( dirname -- "$( readlink -f -- "$0"; )"; )" + + +go build -v -ldflags="-X 'main.Version=${VERSION}' -X 'main.Commit=${COMMIT}' -X 'main.BuildTime=${BUILD_TIME}'" -o web-server ${SCRIPT_DIR}/../cmd/main.go diff --git a/examples/otel/poll_server b/examples/web/scripts/poll_server similarity index 50% rename from examples/otel/poll_server rename to examples/web/scripts/poll_server index 2632d28..e395463 100755 --- a/examples/otel/poll_server +++ b/examples/web/scripts/poll_server @@ -2,13 +2,15 @@ set -euo pipefail +TARGET_HOST="${TARGET_HOST:-localhost}" + while true do if [ "$(($RANDOM % 2))" == "0" ]; then - curl http://localhost:62086/random-error + curl "http://${TARGET_HOST}:62086/random-error" fi if [ "$(($RANDOM % 4))" == "0" ]; then - curl 
http://localhost:62086/ + curl "http://${TARGET_HOST}:62086/" fi - sleep 1 + sleep 0.2 done diff --git a/internal/autometrics/doc.go b/internal/autometrics/doc.go index 6efa99c..7c963e5 100644 --- a/internal/autometrics/doc.go +++ b/internal/autometrics/doc.go @@ -68,7 +68,7 @@ func requestRateQuery(counterName, labelKey, labelValue string) string { } func errorRatioQuery(counterName, labelKey, labelValue string) string { - return fmt.Sprintf("sum by (%s, %s, %s, %s) (rate(%s{%s=\"%s\",%s=\"error\"}[5m]) %s)", + return fmt.Sprintf("(sum by (%s, %s, %s, %s) (rate(%s{%s=\"%s\",%s=\"error\"}[5m]) %s)) / (%s)", prometheus.FunctionLabel, prometheus.ModuleLabel, prometheus.VersionLabel, @@ -78,6 +78,7 @@ func errorRatioQuery(counterName, labelKey, labelValue string) string { labelValue, prometheus.ResultLabel, addBuildInfoLabels(), + requestRateQuery(counterName, labelKey, labelValue), ) } diff --git a/internal/generate/generate_test.go b/internal/generate/generate_test.go index 42be489..bf3bbca 100644 --- a/internal/generate/generate_test.go +++ b/internal/generate/generate_test.go @@ -63,11 +63,11 @@ func main() { "//\tautometrics:doc-end Generated documentation by Autometrics.\n" + "//\n" + "// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + - "// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + + "// [Error Ratio]: 
http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0\n" + "// [Latency (95th and 99th percentiles)]: http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60main%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0\n" + "// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60main%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22main%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + "// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + - "// [Error Ratio 
Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + + "// [Error Ratio Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0\n" + "//\n" + "//autometrics:doc --slo \"Service Test\" --success-target 99\n" + "func main() {\n" + @@ -149,11 +149,11 @@ func main() { "//\tautometrics:doc-end Generated documentation by Autometrics.\n" + "//\n" + "// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + - "// [Error Ratio]: 
http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + + "// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0\n" + "// [Latency (95th and 99th percentiles)]: http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60main%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0\n" + "// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60main%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22main%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + "// 
[Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + - "// [Error Ratio Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + + "// [Error Ratio Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0\n" + "//\n" + "//autometrics:doc --slo \"API\" --latency-target 99.9 --latency-ms 500\n" + "func main() {\n" +