Skip to content

Commit

Permalink
PRODENG-2686 MKE Health now pings all managers
Browse files Browse the repository at this point in the history
Signed-off-by: Dimitar <[email protected]>
  • Loading branch information
cranzy authored and james-nesbitt committed Jul 18, 2024
1 parent 6822787 commit 82c9320
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 38 deletions.
76 changes: 49 additions & 27 deletions pkg/product/mke/api/cluster_spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"net/url"
"strconv"
"strings"
"sync"

"github.com/Mirantis/mcc/pkg/constant"
common "github.com/Mirantis/mcc/pkg/product/common/api"
Expand Down Expand Up @@ -266,50 +267,71 @@ func IsCustomImageRepo(imageRepo string) bool {
return imageRepo != constant.ImageRepo && imageRepo != constant.ImageRepoLegacy
}

// CheckMKEHealthRemote will check mke cluster health from a host and return an error if it failed.
func (c *ClusterSpec) CheckMKEHealthRemote(h *Host) error {
u, err := c.MKEURL()
if err != nil {
return err
}
u.Path = "/_ping"
// pingHost checks MKE health for a single host: it requests
// https://<address>/_ping through h.CheckHTTPStatus expecting http.StatusOK,
// retrying via retry.Do with retry.Attempts(12). The outcome is sent on errCh
// and waitgroup.Done is called as the last statement.
//
// NOTE(review): this span appears to come from a rendered diff, so removed and
// added lines sit side by side (e.g. both `err = retry.Do(` and
// `err := retry.Do(`, and the `u.Host` / `url` variants of the log call). The
// notes here describe the added lines.
func pingHost(h *Host, address string, waitgroup *sync.WaitGroup, errCh chan<- error) {
url := fmt.Sprintf("https://%s/_ping", address)

err = retry.Do(
err := retry.Do(
func() error {
log.Infof("%s: waiting for MKE at %s to become healthy", h, u.Host)
if err := h.CheckHTTPStatus(u.String(), http.StatusOK); err != nil {
log.Infof("%s: waiting for MKE at %s to become healthy", h, url)
if err := h.CheckHTTPStatus(url, http.StatusOK); err != nil {
return fmt.Errorf("check http status: %w", err)
}
return nil
},
retry.Attempts(12), // last attempt should wait ~7min
)
if err != nil {
return fmt.Errorf("MKE health check failed: %w", err)
errCh <- fmt.Errorf("MKE health check failed: %w", err)
}
// NOTE(review): this send is unconditional, so a failed check puts two values
// on errCh (the wrapped error above, then nil). A caller that gives errCh one
// slot per host and only starts receiving after waiting on the WaitGroup would
// leave this send blocked, and waitgroup.Done below would never run.
// TODO: send exactly one value per host and call Done via defer.
errCh <- nil
waitgroup.Done()
}

// CheckMKEHealthRemote checks MKE health on every host in hosts concurrently,
// pinging each host at its own Address(), and returns an error if any of the
// checks failed.
//
// One pingHost goroutine is started per host. Results are consumed while the
// workers are still running: errCh is closed by a helper goroutine once all
// workers have called Done, and this function drains the channel until that
// close. Waiting on the WaitGroup before receiving would hang as soon as a
// worker sends more values than the buffer has free slots, because that
// worker would block on the send and never reach Done.
//
// The first non-nil error received is returned, wrapped; the remaining results
// are still drained so no worker is left blocked. An empty hosts slice yields
// nil.
func (c *ClusterSpec) CheckMKEHealthRemote(hosts []*Host) error {
	errCh := make(chan error, len(hosts))
	var wg sync.WaitGroup

	for _, h := range hosts {
		wg.Add(1)
		go pingHost(h, h.Address(), &wg, errCh)
	}

	// Close errCh only after every worker has finished, without blocking the
	// receive loop below.
	go func() {
		wg.Wait()
		close(errCh)
	}()

	var firstErr error
	for err := range errCh {
		// Keep draining after the first failure so senders can always finish.
		if err != nil && firstErr == nil {
			firstErr = err
		}
	}

	if firstErr != nil {
		return fmt.Errorf("MKE health check failed: %w", firstErr)
	}

	return nil
}

// CheckMKEHealthLocal checks local MKE health and returns an error if a check failed.
//
// In the hosts []*Host form below, one pingHost goroutine is started per host.
// The address pinged is h.Metadata.InternalAddress, with ":<port>" appended
// when the "--controller-port" install flag has a non-empty value. After all
// goroutines are done the collected results are scanned and the first non-nil
// one is returned, wrapped.
//
// NOTE(review): this span appears to come from a rendered diff, so the earlier
// single-host form (h *Host, inline retry.Do) and the newer multi-host form are
// shown interleaved, including both function headers.
func (c *ClusterSpec) CheckMKEHealthLocal(h *Host) error {
host := h.Metadata.InternalAddress
if port := c.MKE.InstallFlags.GetValue("--controller-port"); port != "" {
host = host + ":" + port
func (c *ClusterSpec) CheckMKEHealthLocal(hosts []*Host) error {
// One buffer slot per host.
errCh := make(chan error, len(hosts))
var wg sync.WaitGroup

for _, h := range hosts {
wg.Add(1)
address := h.Metadata.InternalAddress
if port := c.MKE.InstallFlags.GetValue("--controller-port"); port != "" {
address = address + ":" + port
}
go pingHost(h, address, &wg, errCh)
}

err := retry.Do(
func() error {
log.Infof("%s: waiting for MKE to become healthy", h)
if err := h.CheckHTTPStatus(fmt.Sprintf("https://%s/_ping", host), http.StatusOK); err != nil {
return fmt.Errorf("check http status: %w", err)
}
return nil
},
retry.Attempts(12), // last attempt should wait ~7min
)
if err != nil {
return fmt.Errorf("MKE health check failed: %w", err)
// NOTE(review): nothing is received from errCh until wg.Wait() returns, so this
// relies on every pingHost goroutine sending at most len(hosts) values in total;
// an extra send would block its goroutine before Done and stall the Wait.
// TODO: confirm against pingHost, or drain errCh while waiting.
wg.Wait()
close(errCh)

for err := range errCh {
if err != nil {
return fmt.Errorf("MKE health check failed: %w", err)
}
}

return nil
}

Expand Down
4 changes: 3 additions & 1 deletion pkg/product/mke/phase/install_msr.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ func (p *InstallMSR) Run() error {
h.MSRMetadata = &api.MSRMetadata{}
}

err := p.Config.Spec.CheckMKEHealthRemote(h)
managers := p.Config.Spec.Managers()

err := p.Config.Spec.CheckMKEHealthRemote(managers)
if err != nil {
return fmt.Errorf("%s: failed to health check mke, try to set `--ucp-url` installFlag and check connectivity: %w", h, err)
}
Expand Down
12 changes: 6 additions & 6 deletions pkg/product/mke/phase/upgrade_mcr.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,13 +91,13 @@ func (p *UpgradeMCR) upgradeMCRs() error {
for _, h := range managers {
err := p.upgradeMCR(h)
if err != nil {
return err
return fmt.Errorf("upgrade MCR failed. %w", err)
}
if p.Config.Spec.MKE.Metadata.Installed {
err := p.Config.Spec.CheckMKEHealthLocal(h)
if err != nil {
return fmt.Errorf("%s: %w", h, err)
}
}
if p.Config.Spec.MKE.Metadata.Installed {
err := p.Config.Spec.CheckMKEHealthLocal(managers)
if err != nil {
return fmt.Errorf("checkMKEHealthLocal failed. %w", err)
}
}

Expand Down
4 changes: 3 additions & 1 deletion pkg/product/mke/phase/upgrade_msr.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ func (p *UpgradeMSR) ShouldRun() bool {
func (p *UpgradeMSR) Run() error {
h := p.Config.Spec.MSRLeader()

err := p.Config.Spec.CheckMKEHealthRemote(h)
managers := p.Config.Spec.Managers()

err := p.Config.Spec.CheckMKEHealthRemote(managers)
if err != nil {
return fmt.Errorf("%s: failed to health check mke, try to set `--ucp-url` installFlag and check connectivity: %w", h, err)
}
Expand Down
6 changes: 3 additions & 3 deletions pkg/product/mke/phase/validate_mke_health.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,10 @@ func (p *ValidateMKEHealth) Title() string {
// launchpad phases, should be used when installing products that depend
// on MKE, such as MSR.
func (p *ValidateMKEHealth) Run() error {
// Issue a health check to the MKE san host until we receive an 'ok' status
swarmLeader := p.Config.Spec.SwarmLeader()
// Issue a health check to the MKE local address managers until we receive an 'ok' status
managers := p.Config.Spec.Managers()

if err := p.Config.Spec.CheckMKEHealthLocal(swarmLeader); err != nil {
if err := p.Config.Spec.CheckMKEHealthLocal(managers); err != nil {
return fmt.Errorf("%w: failed to validate MKE health: %w", errValidationFailed, err)
}

Expand Down

0 comments on commit 82c9320

Please sign in to comment.