From d752a5d89d6b384dda76407d8fd6f790dba90e34 Mon Sep 17 00:00:00 2001 From: Dorian Villet Date: Wed, 15 Nov 2023 21:47:26 +0100 Subject: [PATCH] Fix node provisioning ssh.dial timeout handling. --- provisioner/internal/docker.go | 12 ++++++------ provisioner/openstack/node.go | 26 ++++++++++++-------------- provisioner/openstack/provisioner.go | 2 +- 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/provisioner/internal/docker.go b/provisioner/internal/docker.go index a074cf8..c082acf 100644 --- a/provisioner/internal/docker.go +++ b/provisioner/internal/docker.go @@ -169,10 +169,10 @@ func RunContainer( // Always wait 1 second before running the health check, and potentially more between retries time.Sleep(lo.Ternary(i > 0, interval, 1*time.Second)) - healthCheckLog := serviceLog.With(slog.Group("retry", "attempt", i+1, "interval", interval)) + healthCheckCmd := append([]string{service.Health.Cmd}, service.Health.Args...) exec, err := docker.ContainerExecCreate(ctx, containerId, types.ExecConfig{ - Cmd: append([]string{service.Health.Cmd}, service.Health.Args...), + Cmd: healthCheckCmd, Env: serviceEnv[service.Name], AttachStdout: true, // We are piping stdout to io.Discard to "wait" for completion }) @@ -183,7 +183,7 @@ func RunContainer( execCtx, cancel := context.WithTimeout(ctx, timeout) defer cancel() - healthCheckLog.Debug("Running health check") + serviceLog.Debug("Running health check", "cmd", healthCheckCmd, "attempt", i+1, "interval", interval) attach, err := docker.ContainerExecAttach(execCtx, exec.ID, types.ExecStartCheck{}) if err != nil { serviceErrors <- fmt.Errorf("failed to attach docker exec for service '%s': %w", service.Name, err) @@ -209,13 +209,13 @@ func RunContainer( return } if inspect.ExitCode == 0 { - healthCheckLog.Debug("Service is ready") + serviceLog.Debug("Service is ready") return } - healthCheckLog.Debug("Service health check failed, retrying...", "exitcode", inspect.ExitCode) + serviceLog.Debug("Service health 
check unsuccessful, retrying...", "exitcode", inspect.ExitCode) } else { - healthCheckLog.Debug("Service health check timed out, retrying...") + serviceLog.Debug("Service health check timed out, retrying...") } } diff --git a/provisioner/openstack/node.go b/provisioner/openstack/node.go index 1fb3e24..a77d5b7 100644 --- a/provisioner/openstack/node.go +++ b/provisioner/openstack/node.go @@ -2,14 +2,12 @@ package openstack import ( "context" - "errors" "fmt" "log/slog" "net" "os" "os/exec" "sync" - "syscall" "time" "github.com/docker/docker/client" @@ -77,17 +75,23 @@ func (n *Node) connect(server *servers.Server) (err error) { return fmt.Errorf("failed to find IPv4 address for server '%s'", n.name) } + initialWait, retryInterval, timeout := 10*time.Second, 5*time.Second, 3*time.Minute + // Initialize SSH connection - ctx, cancel := context.WithTimeout(context.TODO(), 1*time.Minute) + ctx, cancel := context.WithTimeout(context.TODO(), timeout) defer cancel() + // Allow some time for the node to boot + n.log.Debug("Waiting for node to boot", "wait", initialWait) + time.Sleep(initialWait) + + connectionAttempts := 1 for n.ssh == nil { select { case <-ctx.Done(): - return fmt.Errorf("failed to connect to server '%s' after 1 minute", n.name) + return fmt.Errorf("failed to connect to server '%s' after %s and %d attempts: %w", n.name, timeout, connectionAttempts, err) default: - time.Sleep(5 * time.Second) n.ssh, err = ssh.Dial("tcp", fmt.Sprintf("%s:22", nodeAddress), &ssh.ClientConfig{ User: n.provisioner.config.SshUsername, Timeout: 10 * time.Second, @@ -97,15 +101,9 @@ func (n *Node) connect(server *servers.Server) (err error) { }, }) if err != nil { - switch { - case errors.Is(err, syscall.ECONNREFUSED), - errors.Is(err, syscall.ETIMEDOUT), - errors.Is(err, os.ErrDeadlineExceeded): - n.log.Debug("SSH connection to server refused, retrying in 5 seconds") - - default: - return fmt.Errorf("failed to connect to server '%s': %w", n.name, err) - } + 
n.log.Debug(fmt.Errorf("Connection to node refused (attempt %d), retrying in %s: %w", connectionAttempts, retryInterval, err).Error()) + time.Sleep(retryInterval) + connectionAttempts += 1 } } } diff --git a/provisioner/openstack/provisioner.go b/provisioner/openstack/provisioner.go index ea3e50e..2107005 100644 --- a/provisioner/openstack/provisioner.go +++ b/provisioner/openstack/provisioner.go @@ -127,7 +127,7 @@ func (p *Provisioner) Provision(nodeName string) (scheduler.Node, error) { log: p.log.With(slog.Group("node", "name", name)), } - node.log.Info("Created server, waiting for it to become ready") + node.log.Info("Node created") return node, node.connect(server) }