From 8e1391ce8c7b909e623d19f37b49ae338e79a9a0 Mon Sep 17 00:00:00 2001 From: Luca Joss <43531661+ljoss17@users.noreply.github.com> Date: Tue, 28 Nov 2023 11:21:18 +0100 Subject: [PATCH] Add `broadcast_errors` metric (#3710) * Add metric for broadcast errors * Update the guide with the new broadcast error metric * Add changelog entry --- .../3708-add-broadcast-errors.md | 3 +++ crates/relayer/src/chain/cosmos/retry.rs | 14 +++++++++++ crates/telemetry/src/state.rs | 24 +++++++++++++++++++ .../src/documentation/telemetry/operators.md | 1 + 4 files changed, 42 insertions(+) create mode 100644 .changelog/unreleased/features/ibc-telemetry/3708-add-broadcast-errors.md diff --git a/.changelog/unreleased/features/ibc-telemetry/3708-add-broadcast-errors.md b/.changelog/unreleased/features/ibc-telemetry/3708-add-broadcast-errors.md new file mode 100644 index 0000000000..089736a3e7 --- /dev/null +++ b/.changelog/unreleased/features/ibc-telemetry/3708-add-broadcast-errors.md @@ -0,0 +1,3 @@ +- Add a new metric `broadcast_errors` which + records the errors observed when broadcasting Txs + ([\#3708](https://github.com/informalsystems/hermes/issues/3708)) \ No newline at end of file diff --git a/crates/relayer/src/chain/cosmos/retry.rs b/crates/relayer/src/chain/cosmos/retry.rs index 4d0091fc92..9f9a5daa8a 100644 --- a/crates/relayer/src/chain/cosmos/retry.rs +++ b/crates/relayer/src/chain/cosmos/retry.rs @@ -107,6 +107,13 @@ async fn do_send_tx_with_account_sequence_retry( refreshing account sequence number and retrying once" ); + telemetry!( + broadcast_errors, + &account.address.to_string(), + response.code.into(), + &response.log, + ); + refresh_account_and_retry_send_tx_with_account_sequence( rpc_client, config, key_pair, account, tx_memo, messages, ) @@ -147,6 +154,13 @@ async fn do_send_tx_with_account_sequence_retry( "failed to broadcast tx with unrecoverable error" ); + telemetry!( + broadcast_errors, + &account.address.to_string(), + code.into(), + &response.log + ); + 
Ok(response) } } diff --git a/crates/telemetry/src/state.rs b/crates/telemetry/src/state.rs index fc4680b1be..4abdb93740 100644 --- a/crates/telemetry/src/state.rs +++ b/crates/telemetry/src/state.rs @@ -197,6 +197,9 @@ pub struct TelemetryState { /// Sum of rewarded fees over the past FEE_LIFETIME seconds period_fees: ObservableGauge, + + /// Number of errors observed by Hermes when broadcasting a Tx + broadcast_errors: Counter, } impl TelemetryState { @@ -371,6 +374,13 @@ impl TelemetryState { .u64_observable_gauge("ics29_period_fees") .with_description("Amount of ICS29 fees rewarded over the past 7 days") .init(), + + broadcast_errors: meter + .u64_counter("broadcast_errors") + .with_description( + "Number of errors observed by Hermes when broadcasting a Tx", + ) + .init(), } } @@ -1069,6 +1079,20 @@ impl TelemetryState { pub fn add_visible_fee_address(&self, address: String) { self.visible_fee_addresses.insert(address); } + + /// Add an error and its description to the list of errors observed after broadcasting + /// a Tx with a specific account. + pub fn broadcast_errors(&self, address: &String, error_code: u32, error_description: &String) { + let cx = Context::current(); + + let labels = &[ + KeyValue::new("account", address.to_string()), + KeyValue::new("error_code", error_code.to_string()), + KeyValue::new("error_description", error_description.to_string()), + ]; + + self.broadcast_errors.add(&cx, 1, labels); + } } use std::sync::Arc; diff --git a/guide/src/documentation/telemetry/operators.md b/guide/src/documentation/telemetry/operators.md index af59e1590c..c550232beb 100644 --- a/guide/src/documentation/telemetry/operators.md +++ b/guide/src/documentation/telemetry/operators.md @@ -142,6 +142,7 @@ the `backlog_oldest_sequence` that is blocked. 
| `tx_latency_submitted` | Latency for all transactions submitted to a chain (i.e., difference between the moment when Hermes received an event until the corresponding transaction(s) were submitted), per chain, counterparty chain, channel and port | `u64` ValueRecorder | None | | `cleared_send_packet_count_total`  | Number of SendPacket events received during the initial and periodic clearing, per chain, counterparty chain, channel and port | `u64` Counter | Packet workers enabled, and periodic packet clearing or clear on start enabled | | `cleared_acknowledgment_count_total` | Number of WriteAcknowledgement events received during the initial and periodic clearing, per chain, counterparty chain, channel and port | `u64` Counter | Packet workers enabled, and periodic packet clearing or clear on start enabled | +| `broadcast_errors_total` | Number of errors observed by Hermes when broadcasting a Tx, per error type and account | `u64` Counter | Packet workers enabled | Notes: - The two metrics `cleared_send_packet_count_total` and `cleared_acknowledgment_count_total` are only populated if `tx_confirmation = true`.