Include healthcheck logic for helper scripts running as sidecars #1842

Draft · wants to merge 15 commits into base: alpha
Changes from 10 commits
262 changes: 230 additions & 32 deletions files/docker/node/addons/healthcheck.sh
@@ -1,43 +1,241 @@
#!/usr/bin/env bash
#!/bin/bash
# shellcheck source=/dev/null
# shellcheck disable=SC2317
######################################
# User Variables - Change as desired #
# Common variables set in env file #
######################################

source /opt/cardano/cnode/scripts/env
ENTRYPOINT_PROCESS="${ENTRYPOINT_PROCESS:-cnode.sh}" # Get the script from ENTRYPOINT_PROCESS or default to "cnode.sh" if not set
HEALTHCHECK_CPU_THRESHOLD="${HEALTHCHECK_CPU_THRESHOLD:-80}" # The CPU threshold to warn about if the sidecar process exceeds this for more than 60 seconds, defaults to 80%.
HEALTHCHECK_RETRIES="${HEALTHCHECK_RETRIES:-20}" # The number of retries if tip is not incrementing, or cpu usage is over the threshold
HEALTHCHECK_RETRY_WAIT="${HEALTHCHECK_RETRY_WAIT:-3}" # The time (in seconds) to wait between retries
DB_SYNC_ALLOWED_DRIFT="${DB_SYNC_ALLOWED_DRIFT:-3600}" # The allowed drift in seconds for the DB to be considered in sync
CNCLI_DB_ALLOWED_DRIFT="${CNCLI_DB_ALLOWED_DRIFT:-300}" # The allowed drift in slots for the CNCLI DB to be considered in sync

CCLI=$(which cardano-cli)
######################################
# Do NOT modify code below #
######################################

if [[ "$NETWORK" == "guild-mainnet" ]]; then NETWORK=mainnet; fi
[[ ${0} != '-bash' ]] && PARENT="$(dirname $0)" || PARENT="$(pwd)"
# Check if env file is missing in current folder (no update checks as will mostly run as daemon), source env if present
[[ ! -f "${PARENT}"/env ]] && echo -e "\nCommon env file missing in \"${PARENT}\", please ensure latest guild-deploy.sh was run and this script is being run from ${CNODE_HOME}/scripts folder! \n" && exit 1
. "${PARENT}"/env offline

# For querying tip, the seperation of testnet-magic vs mainnet as argument is optional
# Define a mapping of scripts to their corresponding health check functions
declare -A PROCESS_TO_HEALTHCHECK
PROCESS_TO_HEALTHCHECK=(
["dbsync.sh"]="check_db_sync"
["cnode.sh"]="check_node"
["cncli.sh"]="check_cncli"
)

FIRST=$($CCLI query tip --testnet-magic ${NWMAGIC} | jq .block)
# FUNCTIONS
check_cncli() {
cncli_pid=$(pgrep -f "${ENTRYPOINT_PROCESS}")
cncli_subcmd=$(ps -p "${cncli_pid}" -o cmd= | awk '{print $NF}')

if [[ "${ENABLE_KOIOS}" == "N" ]] || [[ -z "${KOIOS_API}" ]]; then
# when KOIOS is not enabled or KOIOS_API is unset, use default behavior
sleep 60;
SECOND=$($CCLI query tip --testnet-magic ${NWMAGIC} | jq .block)
if [[ "$FIRST" -ge "$SECOND" ]]; then
echo "there is a problem"
exit 1
if [[ "${cncli_subcmd}" != "ptsendtip" ]]; then
if check_cncli_db ; then
return 0
else
return 1
fi
else
echo "we're healthy - node: $FIRST -> node: $SECOND"
if check_cncli_send_tip; then
return 0
else
return 1
fi
fi
else
# else leverage koios and only require the node is on tip
CURL=$(which curl)
JQ=$(which jq)
URL="${KOIOS_API}/tip"
SECOND=$($CURL -s "${URL}" | $JQ '.[0].block_no')
for (( CHECK=1; CHECK<=20; CHECK++ )); do
if [[ "$FIRST" -eq "$SECOND" ]]; then
echo "we're healthy - node: $FIRST == koios: $SECOND"
exit 0
elif [[ "$FIRST" -lt "$SECOND" ]]; then
sleep 3
FIRST=$($CCLI query tip --testnet-magic ${NWMAGIC} | jq .block)
elif [[ "$FIRST" -gt "$SECOND" ]]; then
sleep 3
SECOND=$($CURL "${KOIOS_URL}" | $JQ '.[0].block_no')
}


check_cncli_db() {
CCLI=$(which cardano-cli)
SQLITE=$(which sqlite3)
# Check if the DB is in sync
CNCLI_SLOT=$(${SQLITE} "${CNODE_HOME}/guild-db/cncli/cncli.db" 'select slot_number from chain order by id desc limit 1;')
NODE_SLOT=$(${CCLI} query tip --testnet-magic "${NWMAGIC}" | jq .slot)
if check_tip "${NODE_SLOT}" "${CNCLI_SLOT}" "${CNCLI_DB_ALLOWED_DRIFT}" ; then
echo "We're healthy - DB is in sync"
return 0
else
echo "Error: DB is not in sync"
return 1
fi
}


# Function to check if the tip is successfully being sent to Pooltool
check_cncli_send_tip() {
# Timeout in seconds for capturing the log entry
log_entry_timeout=60

# Get the process ID of cncli
process_id=$(pgrep -of cncli) || {
echo "Error: cncli process not found."
return 1 # Return 1 if the process is not found
}

# Loop through the retries
for (( CHECK=0; CHECK<=HEALTHCHECK_RETRIES; CHECK++ )); do

# Define the error suffix message for retries
if [ "$HEALTHCHECK_RETRIES" -ne 0 ]; then
error_message_suffix="Attempt $((CHECK + 1)). Retrying in $HEALTHCHECK_RETRY_WAIT seconds."
else error_message_suffix="Retries disabled (HEALTHCHECK_RETRIES=0)"
fi

# Capture the next output from cncli that is related to Pooltool
pt_log_entry=$(timeout $log_entry_timeout cat /proc/$process_id/fd/1 | grep -i --line-buffered "pooltool" | head -n 1)
if [ -z "$pt_log_entry" ]; then
echo "Unable to capture cncli output within $log_entry_timeout seconds. $error_message_suffix"
sleep $HEALTHCHECK_RETRY_WAIT # Wait n seconds then retry
continue # Retry if the output capture fails
fi

Collaborator Author (TrevorBenson):

This is the part I was mentioning earlier about "fail early". The reason is that we have compounded retries occurring, which reduces observability. I want to apologize (profusely) in advance: this comment is going to be pretty verbose, to break down what I mean by compounding retries and reduced observability.


First, I can definitely see where my earlier statements about using the query tip for 60 seconds and relying on the 3 "retries" were confusing. What I was referring to is this portion of the current healthcheck node query tip:

if [[ "${ENABLE_KOIOS}" == "N" ]] || [[ -z "${KOIOS_API}" ]]; then
# when KOIOS is not enabled or KOIOS_API is unset, use default behavior
sleep 60;
SECOND=$($CCLI query tip --testnet-magic ${NWMAGIC} | jq .block)
if [[ "$FIRST" -ge "$SECOND" ]]; then
echo "there is a problem"
exit 1
else
echo "we're healthy - node: $FIRST -> node: $SECOND"
fi

It sleeps 60 seconds between the first block query and the second, and has zero retries. It only relies on the --healthcheck-retries of the container configuration. This results in a minimum of 15 minutes and a maximum of 24 minutes before the container actually shows an unhealthy status. It does not compound retries, which would make the status change from healthy -> unhealthy take even longer.

I'll clarify what I mean by compounding retries:

  1. The container's own HEALTHCHECK defined in the Dockerfile sets the container automatically to use --health-start-period 5m --healthcheck-interval 5m; without being defined, it also gets --healthcheck-retries 3 from the default number of retries. Here is the container inspect output, with times in nanoseconds.

    podman container inspect preview-ccio-pool | jq '.[].Config.Healthcheck'
    {
      "Test": [
        "CMD-SHELL",
        "/home/guild/.scripts/healthcheck.sh"
      ],
      "StartPeriod": 300000000000,
      "Interval": 300000000000,
      "Timeout": 100000000000
    }
    
  2. The check for ptsendtip does a loop with HEALTHCHECK_RETRIES (default 20) and log_entry_timeout (set to 60); if it fails, we wait HEALTHCHECK_RETRY_WAIT (default 3) and start again.

    • The check for ptsendtip has the potential to run for 21 minutes (60-second timeout per loop + 3-second wait, times 20).
    • The container healthcheck timeout is set to 100 seconds. Exceeding the timeout counts as a failed check.
    • The status changing from healthy to unhealthy requires exhausting the container's retry value, multiplied by the Interval value.

That is a minimum of 15 and maximum of almost 27 minutes (including runtime).
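A rough way to reproduce those bounds (my own back-of-the-envelope, assuming the 5m interval, 3 retries and 100s timeout shown above; not authoritative):

    # best case: the failure starts just before a scheduled check
    echo "min: $(( 3 * 300 )) seconds (15 minutes)"            # retries x interval
    # worst case: the failure starts just after a passing check,
    # and every failing check runs up to the 100s container timeout
    echo "max: $(( 4 * (300 + 100) )) seconds (~27 minutes)"   # (retries + 1) x (interval + timeout)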

I see multiple options as a solution:

  1. Drop the retry logic from check_cncli_send_tip(). Let it run once for 60 seconds and exit 1 if it doesn't see a block. This makes it similar to the query tip in its timeout, and to all other checks in having no retries (except the KOIOS version of check_node).

  2. Keep the retry logic in check_cncli_send_tip() and set HEALTHCHECK_RETRIES to default to 0.

  3. Add retry logic to every check function in the healthcheck script and set HEALTHCHECK_RETRIES to default to 0.

Each option results in a 15-21 minute window before knowing the container is unhealthy when the container configuration for healthcheck intervals & retries and all script variables are left at current/default values.

I'm torn between 3 and 1 as my favorite option. Improving observability even further requires either the operator to adjust the startup, interval, timeout and retries when creating the container OR for us to come to a consensus about adjusting the Dockerfile HEALTHCHECK statement when building the container image.
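For the operator-side route, this is roughly what the override would look like at container creation (the --health-* flags are the standard docker/podman run healthcheck options; the values and image name here are only illustrative):

    docker run -d --name cnode \
      --health-start-period 5m \
      --health-interval 2m \
      --health-timeout 100s \
      --health-retries 2 \
      cardanocommunity/cardano-node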

Contributor @adamsthws (Jan 15, 2025):

Great, thank you for the thorough explanation.

Drop the retry logic from check_cncli_send_tip()

My gut feeling is that this would be the preferred approach, minimising the code to add/maintain.

Let it run once for 60 seconds

To test how often it times out, I'm in the process of running the function 100 times with varying timeout values...

  • timeout=60 - Result: x% timeout frequency (x-fail/x-success).
  • timeout=90 - Result: x% timeout frequency (x-fail/x-success).
  • timeout=120 - Result: x% timeout frequency (x-fail/x-success).
    (Results TBD)

After removing retries from the function, if it times out, it then must wait another 5 minutes (--healthcheck-interval 5m) before retrying... I propose to change the timeout within the function to something larger, maybe 90s-120s (depending on the results of my testing).

E.g. timeout=90 (rather than timeout=60) increases the container time to failure by 30s while decreasing the container time to success by a relatively larger 4.5 min (by avoiding unnecessary timeouts and retries).

What would be the maximum time you would expect between tips?

Collaborator Author (TrevorBenson):

What would be the maximum time you would expect between tips?

Anecdotally I have seen Tip (diff) reach 60 seconds quite often, and occasionally a bit higher at 100-120 seconds. The KOIOS addition was because the original query tip had, twice over a few months, hit 60 seconds on repeated retries and marked the node unhealthy, causing some investigations. After adding KOIOS, or even disabling KOIOS and using the original query tip, I don't remember seeing the node go unhealthy in the last year.


In rare, extreme events, it has exceeded 5 minutes. Node 1.35, or 1.34, had one event long ago that exceeded 5 minutes. I'm not sure if this linked issue/comment showing an 8-minute delay between blocks is the one, or something else, as I didn't read the whole thread. The time frame seems like it could be the one I'm thinking of, and I don't recall at the moment another lasting longer than 5.

IntersectMBO/cardano-node#4421 (comment)

Again, these are rare "extreme" events, and the healthcheck wouldn't even surface them with the current configuration anyway, as long as the event lasts less than the --healthcheck-retries multiplied by the interval. In theory we would only see this in Grafana/Prometheus, and the container runtimes would still show a healthy container even with timeout=60.

Contributor @adamsthws:

To test how often it times out, I'm in the process of running the function 100 times with varying timeout values...

Here are the results:
timeout=60 - Result: 9% timeout frequency (9-fail/91-success).
timeout=90 - Result: 4% timeout frequency (4-fail/96-success).
timeout=120 - Result: 2% timeout frequency (2-fail/98-success).

I had twice over a few months hit 60 seconds on repeated retries and marked it unhealthy causing some investigations

In this limited testing sample, it has already failed successively (twice in a row) with the lower timeout of 60s... So I feel using a higher timeout would mean less chance of failing three times in a row and causing unnecessary investigations.

...How would you feel about making timeout=120?
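For rough intuition only (this treats the runs as independent, which an interval-based retest would need to confirm), the chance of three timeouts in a row scales with the cube of the single-run timeout rate:

    # p^3 for the measured single-run timeout rates, assuming independence
    for p in 0.09 0.04 0.02; do
      awk -v p="$p" 'BEGIN { printf "p=%.2f -> three in a row ~ %.4f%%\n", p, p*p*p*100 }'
    done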

Collaborator Author @TrevorBenson (Jan 17, 2025):

Do you happen to have details about consecutive failures/timeouts? For example, based on the 300-second interval, if it's never observed that we have even 1 consecutive failure over 100 attempts, then it still results in 0 status changes to unhealthy.

To bump the timeout even higher I'd probably want to see 2, possibly 3, consecutive timeouts reached based on the interval being tested. If it reaches 50-66% of the required consecutive failures to cause the node to be marked unhealthy, then I would definitely consider bumping it up.

However, if the results were similar to KOIOS, where there was never more than 1 failure and 0 consecutive failures, I would prefer leaving the internal timeout for the check at 60. Raising it to 120 would also require an increase of the Dockerfile HEALTHCHECK settings timeout (aka --healthcheck-timeout), currently at 100.

If you don't have the details on how many consecutive failures were observed, this gist contains the POC I did to gather the details for KOIOS. I won't get time to set up a ptsendtip until sometime this weekend, which would push my ability to run this test out to Sunday or sometime next week.
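If the retest does justify a larger per-check timeout, the corresponding Dockerfile change would look something like this (150s is an arbitrary illustrative value, chosen only to stay above a hypothetical 120s internal timeout):

    HEALTHCHECK --start-period=5m --interval=5m --timeout=150s CMD /opt/cardano/cnode/scripts/healthcheck.sh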

Contributor @adamsthws:

Do you happen to have details about consecutive failures/timeouts? For example, based on the 300-second interval, if it's never observed that we have even 1 consecutive failure over 100 attempts, then it still results in 0 status changes to unhealthy.

These tests had no interval... I didn't feel adding an interval would affect the results in any meaningful way.

  • timeout=60 - Result: 9% timeout frequency (9-fail/91-success).

    • failed consecutively 1 time. (interval=0)
  • timeout=90 - Result: 4% timeout frequency (4-fail/96-success).

    • failed consecutively 0 times. (interval=0)
  • timeout=120 - Result: 2% timeout frequency (2-fail/98-success).

    • failed consecutively 0 times. (interval=0)

To bump the timeout even higher I'd probably want to see 2, possibly 3, consecutive timeouts reached based on the interval being tested. If it reaches 50-66% of the required consecutive failures to cause the node to be marked unhealthy, then I would definitely consider bumping it up.

I'm in the process of re-running it with timeout=60, interval=300s... I'll keep you posted.

Raising it to 120 would also require an increase of the Dockerfile HEALTHCHECK settings timeout (aka --healthcheck-timeout) value currently at 100.

Oh yes, thanks for pointing that out. Let's see how the re-test goes.

If you don't have the details on how many consecutive failures were observed this gist contains the POC I did to gather the details for KOIOS.

This looks great. Nice use of sqlite!
Here's my test, it's not as elegant but it gets the job done:
https://gist.github.com/adamsthws/dce0263bea2302047660b9c8ea458cbd
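(For anyone reproducing this, the shape of such a test is roughly the following, derived from the log-capture line in the diff above; the variable names and counts are illustrative and not the contents of either gist:)

    t=60; runs=100; fails=0
    pid=$(pgrep -of cncli)
    for ((i=1; i<=runs; i++)); do
      line=$(timeout "$t" cat /proc/"$pid"/fd/1 | grep -i -m1 "pooltool")
      [[ -z "$line" ]] && fails=$((fails+1))
    done
    echo "timeout=${t}s -> ${fails}/${runs} timed out"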

# Define the success message to check for
success_status='.*"success":true.*'
failure_status='.*"success":false.*'

# Check if the success message exists in the captured log
if echo "$pt_log_entry" | grep -q "$success_status"; then
echo "Healthy: Tip is being sent to Pooltool."
return 0 # Return 0 if the success message is found
elif echo "$pt_log_entry" | grep -q "$failure_status"; then
failure_message=$(echo "$pt_log_entry" | grep -oP '"message":"\K[^"]+')
echo "Failed to send tip. $failure_message"
return 1 # Return 1 if the failure message is found
fi
done
echo "there is a problem"
exit 1

echo "Error: Max retries reached."
return 1 # Return 1 if retries are exhausted
}


check_db_sync() {
# Check if the DB is in sync
[[ -z "${PGPASSFILE}" ]] && PGPASSFILE="${CNODE_HOME}/priv/.pgpass"
if [[ ! -f "${PGPASSFILE}" ]]; then
echo "ERROR: The PGPASSFILE (${PGPASSFILE}) not found, please ensure you've followed the instructions on guild-operators website!" && exit 1
return 1
Contributor @rdlrt:

For docker/podman, there aren't currently any instructions - we possibly need to attach a sample reference compose file in which PGPASSFILE can be added (or add instructions to make it part of the priv folder).
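(For context: the healthcheck parses a standard PostgreSQL password file, hostname:port:database:username:password, so a sample to document might look like the placeholder below; the actual values depend on the operator's postgres setup:)

    # ${CNODE_HOME}/priv/.pgpass  (hostname:port:database:username:password)
    /var/run/postgresql:5432:cexplorer:postgres:CHANGE_ME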

Collaborator Author (TrevorBenson):

Is the current Guild Ops DBSync documentation, which mentions this, considered insufficient in this case? It documents it at the top of the page, and lower down shows examples of using the priv folder as the path.

(screenshot of the DBSync documentation omitted)

Also, this healthcheck won't impose any new requirements for using dbsync.sh in a container that don't already exist when using it on bare metal.

  • Line 120 comes from line 42 of set_defaults() function:

    set_defaults() {
    if [[ -z "${DBSYNCBIN}" ]]; then
    [[ -f "${HOME}/.local/bin/cardano-db-sync" ]] && DBSYNCBIN="${HOME}/.local/bin/cardano-db-sync" || DBSYNCBIN="$(command -v cardano-db-sync)"
    fi
    [[ -z "${PGPASSFILE}" ]] && PGPASSFILE="${CNODE_HOME}/priv/.pgpass"
    [[ -z "${DBSYNC_CONFIG}" ]] && DBSYNC_CONFIG="${CNODE_HOME}/files/dbsync.json"
    [[ -z "${DBSYNC_SCHEMA_DIR}" ]] && DBSYNC_SCHEMA_DIR="${CNODE_HOME}/guild-db/schema"
    [[ -z "${DBSYNC_STATE_DIR}" ]] && DBSYNC_STATE_DIR="${CNODE_HOME}/guild-db/ledger-state"
    [[ -z "${SYSTEMD_PGNAME}" ]] && SYSTEMD_PGNAME="postgresql"
    }

  • Lines 121-122 come from lines 52-53 of check_defaults() function:

    check_defaults() {
    if [[ -z "${DBSYNCBIN}" ]]; then
    echo "ERROR: DBSYNCBIN variable is not set, please set full path to cardano-db-sync binary!" && exit 1
    elif [[ ! -f "${PGPASSFILE}" ]]; then
    echo "ERROR: The PGPASSFILE (${PGPASSFILE}) not found, please ensure you've followed the instructions on guild-operators website!" && exit 1
    exit 1
    elif [[ ! -f "${DBSYNC_CONFIG}" ]]; then
    echo "ERROR: Could not find the dbsync config file: ${DBSYNC_CONFIG} . Please ensure you've run guild-deploy.sh and/or edit the DBSYNC_CONFIG variable if using a custom file." && exit 1
    elif [[ ! -d "${DBSYNC_SCHEMA_DIR}" ]]; then
    echo "ERROR: The schema directory (${DBSYNC_SCHEMA_DIR}) does not exist. Please ensure you've follow the instructions on guild-operators website" && exit 1
    fi
    }

A compose file example is definitely an option. However, I find it can also be a crutch by allowing users to skip reading documentation. If you consider it a "must have" then I think it should be part of another PR and have its own separate testing and review.

Contributor @rdlrt (Jan 15, 2025):

Is the current Guild Ops DBSync documentation which mentions this considered insufficient in this case? It documents it at the top of the page, and lower down shows examples of using the priv folder as the path.

I look at (at least a big % of) docker users as spoon-fed users - not intended in any demeaning way, but their primary requirement for using docker [besides virtual env isolation] would be simplicity. As we're not talking about slightly more experienced/advanced users already taking the next step and deploying in professional environments, their expectation will be not having to go through documentation pages beyond the docker reference ones.

If someone running a manual setup is moving to docker, I don't expect them to have as many queries. It does kinda beg the question whether docker users would also share the pre-reqs on the home page - in my experience of reading GitHub issues on IO/IntersectMBO, a very large section does not.

If you consider it a "must have" then I think it should be part of another PR and have its own separate testing and review.

Ye - I agree. That - if added - does not need to be part of this PR indeed

Contributor @rdlrt (Jan 15, 2025):

Talking about dbsync in particular, I would think this would likely be a separate docker image altogether (which is why I mentioned earlier there is some work to be done):

  • Would lighten size (by removing binaries) from current cardano-node image [and vice versa]
  • Add more steps in build, like having schema directory specific to dbsync version, ledger-state directory
  • Above mentioned PGPASSFILE will be part of its package
  • A dedicated page for operations for dbsync docker image.
  • Deploy pg_bech32 as part of image
  • Have better provision for snapshot restoration
  • Have script modifications to allow for any type of snapshot restore to compare config options (as users may otherwise end up with a lot of wasted hours to find out incompatibility)

In theory, the same can be done for smaller components, but there is little value as their setups are mostly the same. For instance, for something like ogmios we don't need a custom image - the official image is perfectly usable.

Once the above is available, a [docker|podman]-compose will essentially just give a reference for sharing volumes between the pods|containers

Collaborator Author @TrevorBenson (Jan 15, 2025):

Ye - I agree. That - if added - does not need to be part of this PR indeed

Sounds good. After this and some other documentation issues/PRs are resolved, I'll open one and start a compose example.

Talking about dbsync in particular, I would think this would likely be a seperate docker image altogether [...]

Nice. So far I've been running the upstream db-sync image, as well as ogmios, and guild ops for node, cncli, and mithril. I'm hereby offering to assist with a new container if any help is needed.

In theory , same can be done for smaller components, but there is little value as their setups are mostly the same. For instance, for something like ogmios - we dont need a custom image, official image is perfectly usable.

Yep. The socket, configs (files) and just about every other subdir of CNODE_HOME lend themselves to volume usage. Not scripts, though. Making scripts a shared volume wasn't worth the excessive time I initially spent to get it working. I've found that a container environment file with all vars, plus a completely stock scripts directory and env file inside the container, is the simplest way to run separate containers per process with shared vars.
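As a sketch of that pattern (volume names, paths and image tag here are illustrative assumptions, not a supported reference):

    # sidecar sharing the node's socket/db volumes; ENTRYPOINT_PROCESS tells healthcheck.sh which check to run
    docker run -d --name cncli \
      --env-file ./cnode-vars.env \
      -e ENTRYPOINT_PROCESS=cncli.sh \
      -v cnode-sockets:/opt/cardano/cnode/sockets \
      -v cnode-guild-db:/opt/cardano/cnode/guild-db \
      cardanocommunity/cardano-node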

else
# parse the password from the pgpass file
IFS=':' read -r PGHOST PGPORT _ PGUSER PGPASSWORD < "${PGPASSFILE}"
PGDATABASE=cexplorer
export PGHOST PGPORT PGDATABASE PGUSER PGPASSWORD
fi
CURRENT_TIME=$(date +%s)
LATEST_BLOCK_TIME=$(date --date="$(psql -qt -c 'select time from block order by id desc limit 1;')" +%s)
if check_tip "${CURRENT_TIME}" "${LATEST_BLOCK_TIME}" "${DB_SYNC_ALLOWED_DRIFT}"; then
echo "We're healthy - DB is in sync"
return 0
else
echo "Error: DB is not in sync"
return 1
fi
}


# Function to check if the node is running and is on tip
check_node() {
CCLI=$(which cardano-cli)

# Adjust NETWORK variable if needed
if [[ "$NETWORK" == "guild-mainnet" ]]; then NETWORK=mainnet; fi

FIRST=$($CCLI query tip --testnet-magic "${NWMAGIC}" | jq .block)

if [[ "${ENABLE_KOIOS}" == "N" ]] || [[ -z "${KOIOS_API}" ]]; then
sleep 60
SECOND=$($CCLI query tip --testnet-magic "${NWMAGIC}" | jq .block)
if [[ "$FIRST" -ge "$SECOND" ]]; then
echo "There is a problem"
return 1
else
echo "We're healthy - node: $FIRST -> node: $SECOND"
return 0
fi
else
CURL=$(which curl)
JQ=$(which jq)
URL="${KOIOS_API}/tip"
SECOND=$($CURL -s "${URL}" | $JQ '.[0].block_no')

for (( CHECK=0; CHECK<=HEALTHCHECK_RETRIES; CHECK++ )); do
if [[ "$FIRST" -eq "$SECOND" ]]; then
echo "We're healthy - node: $FIRST == koios: $SECOND"
return 0
elif [[ "$FIRST" -lt "$SECOND" ]]; then
sleep "$HEALTHCHECK_RETRY_WAIT"
FIRST=$($CCLI query tip --testnet-magic "${NWMAGIC}" | jq .block)
elif [[ "$FIRST" -gt "$SECOND" ]]; then
sleep "$HEALTHCHECK_RETRY_WAIT"
SECOND=$($CURL -s "${URL}" | $JQ '.[0].block_no')
fi
done
echo "There is a problem"
return 1
fi
}

# Function to check if a process is running and its CPU usage
check_process() {
local process_name="$1"
local cpu_threshold="$2"

for (( CHECK=0; CHECK<=HEALTHCHECK_RETRIES; CHECK++ )); do
# Check CPU usage of the process
CPU_USAGE=$(ps -C "$process_name" -o %cpu= | awk '{s+=$1} END {print s}')

# Check if CPU usage exceeds threshold
if (( CPU_USAGE > cpu_threshold )); then
echo "Warning: High CPU usage detected for '$process_name' ($CPU_USAGE%)"
sleep "$HEALTHCHECK_RETRY_WAIT" # Retry after a pause
continue
fi

if ! pgrep -x "$process_name" > /dev/null && ! pgrep -x "sleep" > /dev/null; then
echo "Error: '$process_name' is not running, and no 'sleep' process found"
return 3 # Return 3 if the process is not running and sleep is not found
fi

echo "We're healthy - $process_name"
return 0 # Return 0 if the process is healthy
done

echo "Max retries reached for $process_name"
return 1 # Return 1 if retries are exhausted
}


check_tip() {
TIP=$1
DB_TIP=$2
ALLOWED_DRIFT=$3

if [[ $(( TIP - DB_TIP )) -lt ${ALLOWED_DRIFT} ]]; then
return 0
else
return 1
fi
}


# MAIN
if [[ -n "${PROCESS_TO_HEALTHCHECK[$ENTRYPOINT_PROCESS]}" ]]; then
echo "Checking health for $ENTRYPOINT_PROCESS"
eval "${PROCESS_TO_HEALTHCHECK[$ENTRYPOINT_PROCESS]}"
exit $?
else
# When the entrypoint process has no dedicated healthcheck function,
# determine the process name or binary to check instead
if [[ -n "${SCRIPT_TO_BINARY_MAP[$ENTRYPOINT_PROCESS]}" ]]; then
process="${SCRIPT_TO_BINARY_MAP[$ENTRYPOINT_PROCESS]}"
fi
echo "Checking health for process: $process"
check_process "$process" "$HEALTHCHECK_CPU_THRESHOLD"
exit $?
fi
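A quick way to exercise the script against a running sidecar when testing this PR (the container name is illustrative; the path matches the updated HEALTHCHECK below, and the -e flags show how the env-driven defaults at the top of the script can be overridden):

    docker exec -e ENTRYPOINT_PROCESS=cncli.sh -e HEALTHCHECK_RETRIES=0 \
      cncli /opt/cardano/cnode/scripts/healthcheck.sh; echo "exit: $?"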
8 changes: 4 additions & 4 deletions files/docker/node/dockerfile_bin
@@ -94,9 +94,9 @@ RUN curl -sL -H "Accept: application/vnd.github.everest-preview+json" -H "Conte

# ENTRY SCRIPT
ADD https://raw.githubusercontent.com/${G_ACCOUNT}/guild-operators/${GUILD_DEPLOY_BRANCH}/files/docker/node/addons/banner.txt \
https://raw.githubusercontent.com/${G_ACCOUNT}/guild-operators/${GUILD_DEPLOY_BRANCH}/files/docker/node/addons/block_watcher.sh \
https://raw.githubusercontent.com/${G_ACCOUNT}/guild-operators/${GUILD_DEPLOY_BRANCH}/files/docker/node/addons/healthcheck.sh /home/guild/.scripts/
ADD https://raw.githubusercontent.com/${G_ACCOUNT}/guild-operators/${GUILD_DEPLOY_BRANCH}/scripts/cnode-helper-scripts/guild-deploy.sh \
https://raw.githubusercontent.com/${G_ACCOUNT}/guild-operators/${GUILD_DEPLOY_BRANCH}/files/docker/node/addons/block_watcher.sh /home/guild/.scripts/
ADD https://raw.githubusercontent.com/${G_ACCOUNT}/guild-operators/${GUILD_DEPLOY_BRANCH}/files/docker/node/addons/healthcheck.sh \
https://raw.githubusercontent.com/${G_ACCOUNT}/guild-operators/${GUILD_DEPLOY_BRANCH}/scripts/cnode-helper-scripts/guild-deploy.sh \
https://raw.githubusercontent.com/${G_ACCOUNT}/guild-operators/${GUILD_DEPLOY_BRANCH}/scripts/cnode-helper-scripts/mithril-client.sh \
https://raw.githubusercontent.com/${G_ACCOUNT}/guild-operators/${GUILD_DEPLOY_BRANCH}/scripts/cnode-helper-scripts/mithril-signer.sh \
https://raw.githubusercontent.com/${G_ACCOUNT}/guild-operators/${GUILD_DEPLOY_BRANCH}/scripts/cnode-helper-scripts/mithril-relay.sh /opt/cardano/cnode/scripts/
@@ -105,7 +105,7 @@ ADD https://raw.githubusercontent.com/${G_ACCOUNT}/guild-operators/${GUILD_DEPLO
RUN sudo chmod -R a+rx /home/guild/.scripts/*.sh /opt/cardano/cnode/scripts/*.sh /home/guild/entrypoint.sh /conf \
&& sudo chown -R guild:guild /home/guild/.* $CNODE_HOME /conf

HEALTHCHECK --start-period=5m --interval=5m --timeout=100s CMD /home/guild/.scripts/healthcheck.sh
HEALTHCHECK --start-period=5m --interval=5m --timeout=100s CMD /opt/cardano/cnode/scripts/healthcheck.sh

ENTRYPOINT ["./entrypoint.sh"]
