From 08a5edcb02533d56eb192f5d0a0b2708f20cd00d Mon Sep 17 00:00:00 2001 From: Dorian Villet Date: Wed, 15 Nov 2023 21:47:26 +0100 Subject: [PATCH] Fix node provisioning ssh.dial timeout handling. --- provisioner/internal/docker.go | 12 ++++---- provisioner/openstack/node.go | 42 +++++++++++++++------------- provisioner/openstack/provisioner.go | 2 +- 3 files changed, 30 insertions(+), 26 deletions(-) diff --git a/provisioner/internal/docker.go b/provisioner/internal/docker.go index a074cf8..c082acf 100644 --- a/provisioner/internal/docker.go +++ b/provisioner/internal/docker.go @@ -169,10 +169,10 @@ func RunContainer( // Always wait 1 second before running the health check, and potentially more between retries time.Sleep(lo.Ternary(i > 0, interval, 1*time.Second)) - healthCheckLog := serviceLog.With(slog.Group("retry", "attempt", i+1, "interval", interval)) + healthCheckCmd := append([]string{service.Health.Cmd}, service.Health.Args...) exec, err := docker.ContainerExecCreate(ctx, containerId, types.ExecConfig{ - Cmd: append([]string{service.Health.Cmd}, service.Health.Args...), + Cmd: healthCheckCmd, Env: serviceEnv[service.Name], AttachStdout: true, // We are piping stdout to io.Discard to "wait" for completion }) @@ -183,7 +183,7 @@ func RunContainer( execCtx, cancel := context.WithTimeout(ctx, timeout) defer cancel() - healthCheckLog.Debug("Running health check") + serviceLog.Debug("Running health check", "cmd", healthCheckCmd, "attempt", i+1, "interval", interval) attach, err := docker.ContainerExecAttach(execCtx, exec.ID, types.ExecStartCheck{}) if err != nil { serviceErrors <- fmt.Errorf("failed to attach docker exec for service '%s': %w", service.Name, err) @@ -209,13 +209,13 @@ func RunContainer( return } if inspect.ExitCode == 0 { - healthCheckLog.Debug("Service is ready") + serviceLog.Debug("Service is ready") return } - healthCheckLog.Debug("Service health check failed, retrying...", "exitcode", inspect.ExitCode) + serviceLog.Debug("Service health check unsuccessful, retrying...", "exitcode", inspect.ExitCode) } else { - healthCheckLog.Debug("Service health check timed out, retrying...") + serviceLog.Debug("Service health check timed out, retrying...") } } diff --git a/provisioner/openstack/node.go b/provisioner/openstack/node.go index 1fb3e24..b2cf144 100644 --- a/provisioner/openstack/node.go +++ b/provisioner/openstack/node.go @@ -2,14 +2,12 @@ package openstack import ( "context" - "errors" "fmt" "log/slog" "net" "os" "os/exec" "sync" - "syscall" "time" "github.com/docker/docker/client" @@ -77,56 +75,62 @@ func (n *Node) connect(server *servers.Server) (err error) { return fmt.Errorf("failed to find IPv4 address for server '%s'", n.name) } + initialWait, retryInterval, cmdTimeout, timeout := 5*time.Second, 2*time.Second, 5*time.Second, 1*time.Minute + // Initialize SSH connection - ctx, cancel := context.WithTimeout(context.TODO(), 1*time.Minute) + ctx, cancel := context.WithTimeout(context.TODO(), timeout) defer cancel() + // Let some time for the node to boot + n.log.Debug("Wait for node to boot", "wait", initialWait) + time.Sleep(initialWait) + + connectionAttempts := 1 for n.ssh == nil { select { case <-ctx.Done(): - return fmt.Errorf("failed to connect to server '%s' after 1 minute", n.name) + return fmt.Errorf("failed to connect to server '%s' after %s and %d attempts: %w", n.name, timeout, connectionAttempts, err) default: - time.Sleep(5 * time.Second) n.ssh, err = ssh.Dial("tcp", fmt.Sprintf("%s:22", nodeAddress), &ssh.ClientConfig{ User: n.provisioner.config.SshUsername, - Timeout: 10 * time.Second, + Timeout: cmdTimeout, HostKeyCallback: ssh.InsecureIgnoreHostKey(), Auth: []ssh.AuthMethod{ ssh.PublicKeys(n.provisioner.privateKey), }, }) if err != nil { - switch { - case errors.Is(err, syscall.ECONNREFUSED), - errors.Is(err, syscall.ETIMEDOUT), - errors.Is(err, os.ErrDeadlineExceeded): - n.log.Debug("SSH connection to server refused, retrying in 5 seconds") - - default: - return fmt.Errorf("failed to connect to server '%s': %w", n.name, err) - } + n.log.Debug(fmt.Errorf("Connection to node refused (attempt %d), retrying in %s: %w", connectionAttempts, retryInterval, err).Error()) + time.Sleep(retryInterval) + connectionAttempts += 1 } } } + // Let some time for Docker daemon to start + n.log.Debug("Wait for Docker daemon to start", "wait", initialWait) + time.Sleep(initialWait) + // Initialize Docker client + connectionAttempts = 1 n.docker, err = client.NewClientWithOpts( client.WithHost(n.provisioner.config.DockerHost), client.WithDialContext(func(ctx context.Context, network, addr string) (conn net.Conn, err error) { - ctx, cancel := context.WithDeadline(ctx, time.Now().Add(2*time.Minute)) + ctx, cancel := context.WithTimeout(ctx, timeout) defer cancel() for { select { case <-ctx.Done(): - return nil, fmt.Errorf("failed to dial docker: %w", ctx.Err()) + return nil, fmt.Errorf("failed to connect to Docker daemon after %s and %d attempts: %w", timeout, connectionAttempts, ctx.Err()) default: if conn, err = n.ssh.Dial(network, addr); err == nil { return } else { - n.log.Debug("Connection to Docker daemon refused, retrying in 5 seconds") - time.Sleep(5 * time.Second) + n.log.Debug(fmt.Errorf("Connection to Docker daemon refused (attempt %d), retrying in %s: %w", connectionAttempts, retryInterval, err).Error()) + time.Sleep(retryInterval) + connectionAttempts += 1 } } } diff --git a/provisioner/openstack/provisioner.go b/provisioner/openstack/provisioner.go index ea3e50e..2107005 100644 --- a/provisioner/openstack/provisioner.go +++ b/provisioner/openstack/provisioner.go @@ -127,7 +127,7 @@ func (p *Provisioner) Provision(nodeName string) (scheduler.Node, error) { log: p.log.With(slog.Group("node", "name", name)), } - node.log.Info("Created server, waiting for it to become ready") + node.log.Info("Node created") return node, node.connect(server) }