Skip to content

Commit

Permalink
OCPBUGS-42810: actively move bootstrap member lead
Browse files Browse the repository at this point in the history
This PR will actively try to move the leadership away from the bootstrap member to another healthy member.
  • Loading branch information
tjungblu committed Nov 20, 2024
1 parent 9a2aa20 commit a747960
Show file tree
Hide file tree
Showing 5 changed files with 217 additions and 29 deletions.
14 changes: 14 additions & 0 deletions pkg/etcdcli/etcdcli.go
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,20 @@ func (g *etcdClientGetter) MemberRemove(ctx context.Context, memberID uint64) er
return err
}

func (g *etcdClientGetter) MoveLeader(ctx context.Context, toMember uint64) error {
cli, err := g.clientPool.Get()
if err != nil {
return err
}

defer g.clientPool.Return(cli)

ctx, cancel := context.WithTimeout(ctx, DefaultClientTimeout)
defer cancel()
_, err = cli.MoveLeader(ctx, toMember)
return err
}

func (g *etcdClientGetter) MemberList(ctx context.Context) ([]*etcdserverpb.Member, error) {
cli, err := g.clientPool.Get()
if err != nil {
Expand Down
16 changes: 16 additions & 0 deletions pkg/etcdcli/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,14 @@ func (f *fakeEtcdClient) MemberRemove(ctx context.Context, memberID uint64) erro
return nil
}

func (f *fakeEtcdClient) MoveLeader(ctx context.Context, toMember uint64) error {
for _, status := range f.opts.status {
status.Leader = toMember
}

return nil
}

func (f *fakeEtcdClient) MemberHealth(ctx context.Context) (memberHealth, error) {
var healthy, unhealthy int
var memberHealth memberHealth
Expand Down Expand Up @@ -251,6 +259,14 @@ func WithFakeStatus(status []*clientv3.StatusResponse) FakeClientOption {
}
}

func WithLeader(leader uint64) FakeClientOption {
return func(fo *FakeClientOptions) {
for _, status := range fo.status {
status.Leader = leader
}
}
}

// WithFakeDefragErrors configures each call to Defrag to consume one error from the given slice
func WithFakeDefragErrors(errors []error) FakeClientOption {
return func(fo *FakeClientOptions) {
Expand Down
5 changes: 5 additions & 0 deletions pkg/etcdcli/interfaces.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ type EtcdClient interface {
HealthyMemberLister
UnhealthyMemberLister
MemberStatusChecker
LeaderMover
Status

GetMember(ctx context.Context, name string) (*etcdserverpb.Member, error)
Expand Down Expand Up @@ -64,6 +65,10 @@ type MemberRemover interface {
MemberRemove(ctx context.Context, memberID uint64) error
}

type LeaderMover interface {
MoveLeader(ctx context.Context, toMember uint64) error
}

type MemberLister interface {
// MemberList lists all members in a cluster
MemberList(ctx context.Context) ([]*etcdserverpb.Member, error)
Expand Down
116 changes: 93 additions & 23 deletions pkg/operator/bootstrapteardown/bootstrap_teardown_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package bootstrapteardown
import (
"context"
"fmt"
"go.etcd.io/etcd/api/v3/etcdserverpb"
"time"

operatorv1 "github.com/openshift/api/operator/v1"
Expand Down Expand Up @@ -62,12 +63,29 @@ func (c *BootstrapTeardownController) sync(ctx context.Context, _ factory.SyncCo
return fmt.Errorf("failed to get bootstrap scaling strategy: %w", err)
}
// checks the actual etcd cluster membership API if etcd-bootstrap exists
safeToRemoveBootstrap, hasBootstrap, bootstrapID, err := c.canRemoveEtcdBootstrap(ctx, scalingStrategy)
safeToRemoveBootstrap, hasBootstrap, bootstrapMember, err := c.canRemoveEtcdBootstrap(ctx, scalingStrategy)
if err != nil {
return fmt.Errorf("error while canRemoveEtcdBootstrap: %w", err)
}

err = c.removeBootstrap(timeoutCtx, safeToRemoveBootstrap, hasBootstrap, bootstrapID)
if hasBootstrap {
if err := c.ensureBootstrapIsNotLeader(ctx, bootstrapMember); err != nil {
klog.Errorf("error while ensuring bootstrap is not leader: %v", err)
}
}

// TODO(thomas): it seems on SNO, this is not enough, we might have a non-working apiserver at this point in time
revisionStable, err := ceohelpers.IsRevisionStable(c.operatorClient)
if err != nil {
return fmt.Errorf("BootstrapTeardownController failed to determine stability of revisions: %w", err)
}

if !revisionStable {
klog.Infof("BootstrapTeardownController is waiting for stable etcd revision before removing the bootstrap member")
return nil
}

err = c.removeBootstrap(timeoutCtx, safeToRemoveBootstrap, hasBootstrap, bootstrapMember)
if err != nil {
_, _, updateErr := v1helpers.UpdateStatus(ctx, c.operatorClient, v1helpers.UpdateConditionFn(operatorv1.OperatorCondition{
Type: "BootstrapTeardownDegraded",
Expand All @@ -90,13 +108,20 @@ func (c *BootstrapTeardownController) sync(ctx context.Context, _ factory.SyncCo
return updateErr
}

func (c *BootstrapTeardownController) removeBootstrap(ctx context.Context, safeToRemoveBootstrap bool, hasBootstrap bool, bootstrapID uint64) error {
func (c *BootstrapTeardownController) removeBootstrap(ctx context.Context, safeToRemoveBootstrap bool, hasBootstrap bool, bootstrapMember *etcdserverpb.Member) error {
bootstrapID := uint64(0)
bootstrapUrl := "unknown"
if bootstrapMember != nil {
bootstrapID = bootstrapMember.ID
bootstrapUrl = bootstrapMember.GetClientURLs()[0]
}

if !hasBootstrap {
klog.V(4).Infof("no bootstrap anymore setting removal status")
// this is to ensure the status is always set correctly, even if the status update below failed
updateErr := setSuccessfulBoostrapRemovalStatus(ctx, c.operatorClient)
updateErr := setSuccessfulBootstrapRemovalStatus(ctx, c.operatorClient)
if updateErr != nil {
return fmt.Errorf("error while setSuccessfulBoostrapRemovalStatus: %w", updateErr)
return fmt.Errorf("error while setSuccessfulBootstrapRemovalStatus: %w", updateErr)
}

// if the bootstrap isn't present, then clearly we're available enough to terminate. This avoids any risk of flapping.
Expand Down Expand Up @@ -141,20 +166,21 @@ func (c *BootstrapTeardownController) removeBootstrap(ctx context.Context, safeT
if isBootstrapComplete, err := bootstrap.IsBootstrapComplete(c.configmapLister); !isBootstrapComplete || err != nil {
return err
}
klog.Warningf("Removing bootstrap member [%x]", bootstrapID)

klog.Warningf("Removing bootstrap member [%x] (%s)", bootstrapID, bootstrapUrl)

// this is ugly until bootkube is updated, but we want to be sure that bootkube has time to be waiting to watch the condition coming back.
if err := c.etcdClient.MemberRemove(ctx, bootstrapID); err != nil {
return fmt.Errorf("error while removing bootstrap member [%x]: %w", bootstrapID, err)
return fmt.Errorf("error while removing bootstrap member [%x] (%s): %w", bootstrapID, bootstrapUrl, err)
}

klog.Infof("Successfully removed bootstrap member [%x]", bootstrapID)
klog.Infof("Successfully removed bootstrap member [%x] (%s)", bootstrapID, bootstrapUrl)
// below might fail, since the member removal can cause some downtime for raft to settle on a quorum
// it's important that everything below is properly retried above during normal controller reconciliation
return setSuccessfulBoostrapRemovalStatus(ctx, c.operatorClient)
return setSuccessfulBootstrapRemovalStatus(ctx, c.operatorClient)
}

func setSuccessfulBoostrapRemovalStatus(ctx context.Context, client v1helpers.StaticPodOperatorClient) error {
func setSuccessfulBootstrapRemovalStatus(ctx context.Context, client v1helpers.StaticPodOperatorClient) error {
_, _, updateErr := v1helpers.UpdateStatus(ctx, client, v1helpers.UpdateConditionFn(operatorv1.OperatorCondition{
Type: "EtcdBootstrapMemberRemoved",
Status: operatorv1.ConditionTrue,
Expand All @@ -165,57 +191,101 @@ func setSuccessfulBoostrapRemovalStatus(ctx context.Context, client v1helpers.St
}

// canRemoveEtcdBootstrap returns whether it is safe to remove bootstrap, whether bootstrap is in the list, and an error
func (c *BootstrapTeardownController) canRemoveEtcdBootstrap(ctx context.Context, scalingStrategy ceohelpers.BootstrapScalingStrategy) (bool, bool, uint64, error) {
func (c *BootstrapTeardownController) canRemoveEtcdBootstrap(ctx context.Context, scalingStrategy ceohelpers.BootstrapScalingStrategy) (bool, bool, *etcdserverpb.Member, error) {
members, err := c.etcdClient.MemberList(ctx)
if err != nil {
return false, false, 0, err
return false, false, nil, err
}

var hasBootstrap bool
var bootstrapMemberID uint64
var bootstrapMember *etcdserverpb.Member
for _, member := range members {
if member.Name == "etcd-bootstrap" {
hasBootstrap = true
bootstrapMemberID = member.ID
bootstrapMember = member
break
}
}
if !hasBootstrap {
return false, hasBootstrap, bootstrapMemberID, nil
return false, hasBootstrap, bootstrapMember, nil
}

// First, enforce the main HA invariants in terms of member counts.
switch scalingStrategy {
case ceohelpers.HAScalingStrategy:
if len(members) < 4 {
return false, hasBootstrap, bootstrapMemberID, nil
return false, hasBootstrap, bootstrapMember, nil
}
case ceohelpers.DelayedHAScalingStrategy:
if len(members) < 3 {
return false, hasBootstrap, bootstrapMemberID, nil
return false, hasBootstrap, bootstrapMember, nil
}
case ceohelpers.UnsafeScalingStrategy:
if len(members) < 2 {
return false, hasBootstrap, bootstrapMemberID, nil
return false, hasBootstrap, bootstrapMember, nil
}
}

// Next, given member counts are satisfied, check member health.
unhealthyMembers, err := c.etcdClient.UnhealthyMembers(ctx)
if err != nil {
return false, hasBootstrap, bootstrapMemberID, nil
return false, hasBootstrap, bootstrapMember, nil
}

// the etcd-bootstrap member is allowed to be unhealthy and can still be removed
switch {
case len(unhealthyMembers) == 0:
return true, hasBootstrap, bootstrapMemberID, nil
return true, hasBootstrap, bootstrapMember, nil
case len(unhealthyMembers) > 1:
return false, hasBootstrap, bootstrapMemberID, nil
return false, hasBootstrap, bootstrapMember, nil
default:
if unhealthyMembers[0].Name == "etcd-bootstrap" {
return true, true, unhealthyMembers[0].ID, nil
return true, true, bootstrapMember, nil
}
return false, hasBootstrap, bootstrapMember, nil
}
}

func (c *BootstrapTeardownController) ensureBootstrapIsNotLeader(ctx context.Context, bootstrapMember *etcdserverpb.Member) error {
if bootstrapMember == nil {
return fmt.Errorf("bootstrap member was not provided")
}
status, err := c.etcdClient.Status(ctx, bootstrapMember.ClientURLs[0])
if err != nil {
return fmt.Errorf("could not find bootstrap member status: %w", err)
}

if bootstrapMember.ID != status.Leader {
return nil
}

klog.Warningf("Bootstrap member [%x] (%s) detected as leader, trying to move elsewhere...", bootstrapMember.ID, bootstrapMember.GetClientURLs()[0])

memberHealth, err := c.etcdClient.MemberHealth(ctx)
if err != nil {
return fmt.Errorf("could not find member health: %w", err)
}

var otherMember *etcdserverpb.Member
// we can pick any other healthy voting member as the target to move to
for _, m := range memberHealth.GetHealthyMembers() {
if m.ID != bootstrapMember.ID && !m.IsLearner {
otherMember = m
break
}
return false, hasBootstrap, bootstrapMemberID, nil
}

if otherMember == nil {
return fmt.Errorf("could not find other healthy member to move leader")
}

klog.Warningf("Moving lead from bootstrap member [%x] (%s) detected as leader to [%x] (%s)", bootstrapMember.ID, bootstrapMember.GetClientURLs()[0], otherMember.ID, otherMember.GetClientURLs()[0])
err = c.etcdClient.MoveLeader(ctx, otherMember.ID)
if err != nil {
return err
}

klog.Warningf("Moving lead from bootstrap member [%x] (%s) to [%x] (%s) succesfully!", bootstrapMember.ID, bootstrapMember.GetClientURLs()[0], otherMember.ID, otherMember.GetClientURLs()[0])

return nil
}
Loading

0 comments on commit a747960

Please sign in to comment.