Skip to content

Commit

Permalink
Fix node provisioning ssh.dial timeout handling.
Browse files Browse the repository at this point in the history
  • Loading branch information
gnutix committed Nov 15, 2023
1 parent a6daf6c commit 08a5edc
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 26 deletions.
12 changes: 6 additions & 6 deletions provisioner/internal/docker.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,10 +169,10 @@ func RunContainer(
// Always wait 1 second before running the health check, and potentially more between retries
time.Sleep(lo.Ternary(i > 0, interval, 1*time.Second))

healthCheckLog := serviceLog.With(slog.Group("retry", "attempt", i+1, "interval", interval))
healthCheckCmd := append([]string{service.Health.Cmd}, service.Health.Args...)

exec, err := docker.ContainerExecCreate(ctx, containerId, types.ExecConfig{
Cmd: append([]string{service.Health.Cmd}, service.Health.Args...),
Cmd: healthCheckCmd,
Env: serviceEnv[service.Name],
AttachStdout: true, // We are piping stdout to io.Discard to "wait" for completion
})
Expand All @@ -183,7 +183,7 @@ func RunContainer(

execCtx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
healthCheckLog.Debug("Running health check")
serviceLog.Debug("Running health check", "cmd", healthCheckCmd, "attempt", i+1, "interval", interval)
attach, err := docker.ContainerExecAttach(execCtx, exec.ID, types.ExecStartCheck{})
if err != nil {
serviceErrors <- fmt.Errorf("failed to attach docker exec for service '%s': %w", service.Name, err)
Expand All @@ -209,13 +209,13 @@ func RunContainer(
return
}
if inspect.ExitCode == 0 {
healthCheckLog.Debug("Service is ready")
serviceLog.Debug("Service is ready")
return
}

healthCheckLog.Debug("Service health check failed, retrying...", "exitcode", inspect.ExitCode)
serviceLog.Debug("Service health check unsuccessful, retrying...", "exitcode", inspect.ExitCode)
} else {
healthCheckLog.Debug("Service health check timed out, retrying...")
serviceLog.Debug("Service health check timed out, retrying...")
}
}

Expand Down
42 changes: 23 additions & 19 deletions provisioner/openstack/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,12 @@ package openstack

import (
"context"
"errors"
"fmt"
"log/slog"
"net"
"os"
"os/exec"
"sync"
"syscall"
"time"

"github.com/docker/docker/client"
Expand Down Expand Up @@ -77,56 +75,62 @@ func (n *Node) connect(server *servers.Server) (err error) {
return fmt.Errorf("failed to find IPv4 address for server '%s'", n.name)
}

initialWait, retryInterval, cmdTimeout, timeout := 5*time.Second, 2*time.Second, 5*time.Second, 1*time.Minute

// Initialize SSH connection
ctx, cancel := context.WithTimeout(context.TODO(), 1*time.Minute)
ctx, cancel := context.WithTimeout(context.TODO(), timeout)
defer cancel()

// Let some time for the node to boot
n.log.Debug("Wait for node to boot", "wait", initialWait)
time.Sleep(initialWait)

connectionAttempts := 1
for n.ssh == nil {
select {
case <-ctx.Done():
return fmt.Errorf("failed to connect to server '%s' after 1 minute", n.name)
return fmt.Errorf("failed to connect to server '%s' after %s and %d attempts: %w", n.name, timeout, connectionAttempts, err)

default:
time.Sleep(5 * time.Second)
n.ssh, err = ssh.Dial("tcp", fmt.Sprintf("%s:22", nodeAddress), &ssh.ClientConfig{
User: n.provisioner.config.SshUsername,
Timeout: 10 * time.Second,
Timeout: cmdTimeout,
HostKeyCallback: ssh.InsecureIgnoreHostKey(),
Auth: []ssh.AuthMethod{
ssh.PublicKeys(n.provisioner.privateKey),
},
})
if err != nil {
switch {
case errors.Is(err, syscall.ECONNREFUSED),
errors.Is(err, syscall.ETIMEDOUT),
errors.Is(err, os.ErrDeadlineExceeded):
n.log.Debug("SSH connection to server refused, retrying in 5 seconds")

default:
return fmt.Errorf("failed to connect to server '%s': %w", n.name, err)
}
n.log.Debug(fmt.Errorf("Connection to node refused (attempt %d), retrying in %s: %w", connectionAttempts, retryInterval, err).Error())
time.Sleep(retryInterval)
connectionAttempts += 1
}
}
}

// Let some time for Docker daemon to start
n.log.Debug("Wait for Docker daemon to start", "wait", initialWait)
time.Sleep(initialWait)

// Initialize Docker client
connectionAttempts = 1
n.docker, err = client.NewClientWithOpts(
client.WithHost(n.provisioner.config.DockerHost),
client.WithDialContext(func(ctx context.Context, network, addr string) (conn net.Conn, err error) {
ctx, cancel := context.WithDeadline(ctx, time.Now().Add(2*time.Minute))
ctx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
for {
select {
case <-ctx.Done():
return nil, fmt.Errorf("failed to dial docker: %w", ctx.Err())
return nil, fmt.Errorf("failed to connect to Docker daemon after %s and %d attempts: %w", timeout, connectionAttempts, ctx.Err())

default:
if conn, err = n.ssh.Dial(network, addr); err == nil {
return
} else {
n.log.Debug("Connection to Docker daemon refused, retrying in 5 seconds")
time.Sleep(5 * time.Second)
n.log.Debug(fmt.Errorf("Connection to Docker daemon refused (attempt %d), retrying in %s: %w", connectionAttempts, retryInterval, err).Error())
time.Sleep(retryInterval)
connectionAttempts += 1
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion provisioner/openstack/provisioner.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ func (p *Provisioner) Provision(nodeName string) (scheduler.Node, error) {
log: p.log.With(slog.Group("node", "name", name)),
}

node.log.Info("Created server, waiting for it to become ready")
node.log.Info("Node created")
return node, node.connect(server)
}

Expand Down

0 comments on commit 08a5edc

Please sign in to comment.