From 82c9320fdceca68626e5e23378a73882b116cc53 Mon Sep 17 00:00:00 2001
From: Dimitar
Date: Wed, 3 Jul 2024 16:33:48 +0300
Subject: [PATCH] PRODENG-2686 MKE Health now pings all managers

Signed-off-by: Dimitar
---
Review note: pingHost now defers waitgroup.Done() and sends at most one
value on errCh. The previous revision sent the error and then nil, so any
failing host overfilled the len(hosts)-sized buffer and left wg.Wait()
blocked forever instead of returning the health-check error.

 pkg/product/mke/api/cluster_spec.go          | 80 +++++++++++++-------
 pkg/product/mke/phase/install_msr.go         |  4 +-
 pkg/product/mke/phase/upgrade_mcr.go         | 12 ++--
 pkg/product/mke/phase/upgrade_msr.go         |  4 +-
 pkg/product/mke/phase/validate_mke_health.go |  6 +-
 5 files changed, 68 insertions(+), 38 deletions(-)

diff --git a/pkg/product/mke/api/cluster_spec.go b/pkg/product/mke/api/cluster_spec.go
index 59e03f84..6017089b 100644
--- a/pkg/product/mke/api/cluster_spec.go
+++ b/pkg/product/mke/api/cluster_spec.go
@@ -7,6 +7,7 @@ import (
 	"net/url"
 	"strconv"
 	"strings"
+	"sync"
 
 	"github.com/Mirantis/mcc/pkg/constant"
 	common "github.com/Mirantis/mcc/pkg/product/common/api"
@@ -266,18 +267,19 @@ func IsCustomImageRepo(imageRepo string) bool {
 	return imageRepo != constant.ImageRepo && imageRepo != constant.ImageRepoLegacy
 }
 
-// CheckMKEHealthRemote will check mke cluster health from a host and return an error if it failed.
-func (c *ClusterSpec) CheckMKEHealthRemote(h *Host) error {
-	u, err := c.MKEURL()
-	if err != nil {
-		return err
-	}
-	u.Path = "/_ping"
+// pingHost polls https://<address>/_ping from host h until it answers 200 OK
+// or the retries run out. It sends at most one value (the failure) on errCh
+// and always marks waitgroup done, so a channel buffered to one slot per
+// host can never block a sender.
+func pingHost(h *Host, address string, waitgroup *sync.WaitGroup, errCh chan<- error) {
+	defer waitgroup.Done()
+
+	url := fmt.Sprintf("https://%s/_ping", address)
 
-	err = retry.Do(
+	err := retry.Do(
 		func() error {
-			log.Infof("%s: waiting for MKE at %s to become healthy", h, u.Host)
-			if err := h.CheckHTTPStatus(u.String(), http.StatusOK); err != nil {
+			log.Infof("%s: waiting for MKE at %s to become healthy", h, url)
+			if err := h.CheckHTTPStatus(url, http.StatusOK); err != nil {
 				return fmt.Errorf("check http status: %w", err)
 			}
 			return nil
@@ -285,31 +287,55 @@ func (c *ClusterSpec) CheckMKEHealthRemote(h *Host) error {
 		retry.Attempts(12), // last attempt should wait ~7min
 	)
 	if err != nil {
-		return fmt.Errorf("MKE health check failed: %w", err)
+		errCh <- fmt.Errorf("MKE health check failed: %w", err)
+	}
+}
+
+// CheckMKEHealthRemote will check mke cluster health from a list of hosts and return an error if it failed.
+func (c *ClusterSpec) CheckMKEHealthRemote(hosts []*Host) error {
+	errCh := make(chan error, len(hosts))
+	var wg sync.WaitGroup
+
+	for _, h := range hosts {
+		wg.Add(1)
+		go pingHost(h, h.Address(), &wg, errCh)
 	}
+
+	wg.Wait()
+	close(errCh)
+
+	for err := range errCh {
+		if err != nil {
+			return fmt.Errorf("MKE health check failed: %w", err)
+		}
+	}
+
 	return nil
 }
 
 // CheckMKEHealthLocal will check the local mke health on a host and return an error if it failed.
-func (c *ClusterSpec) CheckMKEHealthLocal(h *Host) error {
-	host := h.Metadata.InternalAddress
-	if port := c.MKE.InstallFlags.GetValue("--controller-port"); port != "" {
-		host = host + ":" + port
+func (c *ClusterSpec) CheckMKEHealthLocal(hosts []*Host) error {
+	errCh := make(chan error, len(hosts))
+	var wg sync.WaitGroup
+
+	for _, h := range hosts {
+		wg.Add(1)
+		address := h.Metadata.InternalAddress
+		if port := c.MKE.InstallFlags.GetValue("--controller-port"); port != "" {
+			address = address + ":" + port
+		}
+		go pingHost(h, address, &wg, errCh)
 	}
 
-	err := retry.Do(
-		func() error {
-			log.Infof("%s: waiting for MKE to become healthy", h)
-			if err := h.CheckHTTPStatus(fmt.Sprintf("https://%s/_ping", host), http.StatusOK); err != nil {
-				return fmt.Errorf("check http status: %w", err)
-			}
-			return nil
-		},
-		retry.Attempts(12), // last attempt should wait ~7min
-	)
-	if err != nil {
-		return fmt.Errorf("MKE health check failed: %w", err)
+	wg.Wait()
+	close(errCh)
+
+	for err := range errCh {
+		if err != nil {
+			return fmt.Errorf("MKE health check failed: %w", err)
+		}
 	}
+
 	return nil
 }
 
diff --git a/pkg/product/mke/phase/install_msr.go b/pkg/product/mke/phase/install_msr.go
index 768557bd..f2350ab4 100644
--- a/pkg/product/mke/phase/install_msr.go
+++ b/pkg/product/mke/phase/install_msr.go
@@ -42,7 +42,9 @@ func (p *InstallMSR) Run() error {
 		h.MSRMetadata = &api.MSRMetadata{}
 	}
 
-	err := p.Config.Spec.CheckMKEHealthRemote(h)
+	managers := p.Config.Spec.Managers()
+
+	err := p.Config.Spec.CheckMKEHealthRemote(managers)
 	if err != nil {
 		return fmt.Errorf("%s: failed to health check mke, try to set `--ucp-url` installFlag and check connectivity: %w", h, err)
 	}
diff --git a/pkg/product/mke/phase/upgrade_mcr.go b/pkg/product/mke/phase/upgrade_mcr.go
index 7c31c16d..c8875399 100644
--- a/pkg/product/mke/phase/upgrade_mcr.go
+++ b/pkg/product/mke/phase/upgrade_mcr.go
@@ -91,13 +91,13 @@ func (p *UpgradeMCR) upgradeMCRs() error {
 	for _, h := range managers {
 		err := p.upgradeMCR(h)
 		if err != nil {
-			return err
+			return fmt.Errorf("upgrade MCR failed. %w", err)
 		}
-		if p.Config.Spec.MKE.Metadata.Installed {
-			err := p.Config.Spec.CheckMKEHealthLocal(h)
-			if err != nil {
-				return fmt.Errorf("%s: %w", h, err)
-			}
+	}
+	if p.Config.Spec.MKE.Metadata.Installed {
+		err := p.Config.Spec.CheckMKEHealthLocal(managers)
+		if err != nil {
+			return fmt.Errorf("checkMKEHealthLocal failed. %w", err)
 		}
 	}
 
diff --git a/pkg/product/mke/phase/upgrade_msr.go b/pkg/product/mke/phase/upgrade_msr.go
index 884d71ae..f744eee3 100644
--- a/pkg/product/mke/phase/upgrade_msr.go
+++ b/pkg/product/mke/phase/upgrade_msr.go
@@ -43,7 +43,9 @@ func (p *UpgradeMSR) ShouldRun() bool {
 func (p *UpgradeMSR) Run() error {
 	h := p.Config.Spec.MSRLeader()
 
-	err := p.Config.Spec.CheckMKEHealthRemote(h)
+	managers := p.Config.Spec.Managers()
+
+	err := p.Config.Spec.CheckMKEHealthRemote(managers)
 	if err != nil {
 		return fmt.Errorf("%s: failed to health check mke, try to set `--ucp-url` installFlag and check connectivity: %w", h, err)
 	}
diff --git a/pkg/product/mke/phase/validate_mke_health.go b/pkg/product/mke/phase/validate_mke_health.go
index 87a73401..19bbcc5c 100644
--- a/pkg/product/mke/phase/validate_mke_health.go
+++ b/pkg/product/mke/phase/validate_mke_health.go
@@ -32,10 +32,10 @@ func (p *ValidateMKEHealth) Title() string {
 // launchpad phases, should be used when installing products that depend
 // on MKE, such as MSR.
 func (p *ValidateMKEHealth) Run() error {
-	// Issue a health check to the MKE san host until we receive an 'ok' status
-	swarmLeader := p.Config.Spec.SwarmLeader()
+	// Issue a health check to the MKE local address managers until we receive an 'ok' status
+	managers := p.Config.Spec.Managers()
 
-	if err := p.Config.Spec.CheckMKEHealthLocal(swarmLeader); err != nil {
+	if err := p.Config.Spec.CheckMKEHealthLocal(managers); err != nil {
 		return fmt.Errorf("%w: failed to validate MKE health: %w", errValidationFailed, err)
 	}
 