Skip to content

Commit

Permalink
Detect IP conflicts before gateway detection to fix gateway detection…
Browse files Browse the repository at this point in the history
… could potentially update the arp table causing communication fail

Signed-off-by: Cyclinder Kuo <[email protected]>
  • Loading branch information
cyclinder committed Dec 31, 2024
1 parent 415359e commit 082a68e
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 42 deletions.
1 change: 1 addition & 0 deletions .github/workflows/trivy-scan-image.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ jobs:
uses: actions/[email protected]
with:
pattern: image-tar-spiderpool-*-${{ inputs.ref }}
merge-multiple: true
path: test/.download

- name: List downloaded files
Expand Down
110 changes: 68 additions & 42 deletions cmd/coordinator/cmd/command_add.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,65 @@ func CmdAdd(args *skel.CmdArgs) (err error) {

logger.Sugar().Infof("Get coordinator config: %+v", c)

errg := errgroup.Group{}
// we do detect gateway connection firstly
errgConflict := errgroup.Group{}
// IP conflict detection must precede gateway detection, which avoids the
// possibility that gateway detection may update arp table entries first and cause
// communication problems when IP conflict detection fails
var ipc *ipchecking.IPChecker
if conf.IPConflict != nil && *conf.IPConflict {
logger.Debug("Try to detect ip conflict")
ipc, err = ipchecking.NewIPChecker(conf.DetectOptions.Retry, conf.DetectOptions.Interval, conf.DetectOptions.TimeOut, c.hostNs, c.netns, logger)
if err != nil {
return fmt.Errorf("failed to run NewIPChecker: %w", err)
}
ipc.DoIPConflictChecking(prevResult.IPs, c.currentInterface, &errgConflict)
} else {
logger.Debug("disable detect ip conflict")
}

cleanIPsFn := func() error {
_, innerErr := client.Daemonset.DeleteIpamIps(daemonset.NewDeleteIpamIpsParams().WithContext(context.TODO()).WithIpamBatchDelArgs(
&models.IpamBatchDelArgs{
ContainerID: &args.ContainerID,
NetNamespace: args.Netns,
PodName: (*string)(&k8sArgs.K8S_POD_NAME),
PodNamespace: (*string)(&k8sArgs.K8S_POD_NAMESPACE),
PodUID: (*string)(&k8sArgs.K8S_POD_UID),
},
))
if innerErr != nil {
logger.Sugar().Errorf("failed to clean up conflict IPs, error: %v", innerErr)
return multierr.Append(err, innerErr)
}
return nil
}

if err = errgConflict.Wait(); err != nil {
logger.Error("failed to detect ip conflict", zap.Error(err))
if errors.Is(err, constant.ErrIPConflict) {
logger.Info("ip conflict detected, clean up conflict IPs")
if cleanerr := cleanIPsFn(); err != nil {
logger.Error("failed to clean up conflict IPs", zap.Error(cleanerr))
return cleanerr
}
}
return err
}

// Fixed Mac addresses must come after IP conflict detection, otherwise the switch learns to communicate
// with the wrong Mac address when IP conflict detection fails
if len(conf.MacPrefix) != 0 {
hwAddr, err := networking.OverwriteHwAddress(logger, c.netns, conf.MacPrefix, args.IfName)
if err != nil {
return fmt.Errorf("failed to update hardware address for interface %s, maybe hardware_prefix(%s) is invalid: %v", args.IfName, conf.MacPrefix, err)
}
logger.Info("Fix mac address successfully", zap.String("interface", args.IfName), zap.String("macAddress", hwAddr))
}

// we do detect gateway connection lastly
// Finally, there is gateway detection, which updates the correct arp table entries
// once there are no IP address conflicts and fixed Mac addresses
errgGateway := errgroup.Group{}
if conf.DetectGateway != nil && *conf.DetectGateway {
logger.Debug("Try to detect gateway")

Expand All @@ -202,7 +259,6 @@ func CmdAdd(args *skel.CmdArgs) (err error) {
return fmt.Errorf("failed to GetDefaultGatewayByName: %v", err)
}
logger.Debug("Get GetDefaultGatewayByName", zap.Any("Gws", gws))

p, err := gwconnection.New(conf.DetectOptions.Retry, conf.DetectOptions.Interval, conf.DetectOptions.TimeOut, c.currentInterface, logger)
if err != nil {
return fmt.Errorf("failed to init the gateway client: %v", err)
Expand All @@ -211,10 +267,10 @@ func CmdAdd(args *skel.CmdArgs) (err error) {
for _, gw := range gws {
if gw.To4() != nil {
p.V4Gw = gw
errg.Go(c.hostNs, c.netns, p.ArpingOverIface)
errgGateway.Go(c.hostNs, c.netns, p.ArpingOverIface)
} else {
p.V6Gw = gw
errg.Go(c.hostNs, c.netns, p.NDPingOverIface)
errgGateway.Go(c.hostNs, c.netns, p.NDPingOverIface)
}
}
return nil
Expand All @@ -226,48 +282,18 @@ func CmdAdd(args *skel.CmdArgs) (err error) {
logger.Debug("disable detect gateway")
}

var ipc *ipchecking.IPChecker
if conf.IPConflict != nil && *conf.IPConflict {
logger.Debug("Try to detect ip conflict")
ipc, err = ipchecking.NewIPChecker(conf.DetectOptions.Retry, conf.DetectOptions.Interval, conf.DetectOptions.TimeOut, c.hostNs, c.netns, logger)
if err != nil {
return fmt.Errorf("failed to run NewIPChecker: %w", err)
}
ipc.DoIPConflictChecking(prevResult.IPs, c.currentInterface, &errg)
} else {
logger.Debug("disable detect ip conflict")
}

if err = errg.Wait(); err != nil {
logger.Error("failed to detect gateway and ip checking", zap.Error(err))
if errors.Is(err, constant.ErrIPConflict) || errors.Is(err, constant.ErrGatewayUnreachable) {
_, innerErr := client.Daemonset.DeleteIpamIps(daemonset.NewDeleteIpamIpsParams().WithContext(context.TODO()).WithIpamBatchDelArgs(
&models.IpamBatchDelArgs{
ContainerID: &args.ContainerID,
NetNamespace: args.Netns,
PodName: (*string)(&k8sArgs.K8S_POD_NAME),
PodNamespace: (*string)(&k8sArgs.K8S_POD_NAMESPACE),
PodUID: (*string)(&k8sArgs.K8S_POD_UID),
},
))
if innerErr != nil {
logger.Sugar().Errorf("failed to clean up conflict IPs, error: %v", innerErr)
return multierr.Append(err, innerErr)
if err = errgGateway.Wait(); err != nil {
logger.Error("failed to detect gateway reachable", zap.Error(err))
if errors.Is(err, constant.ErrGatewayUnreachable) {
logger.Info("gateway unreachable detected, clean up conflict IPs")
if cleanErr := cleanIPsFn(); err != nil {
logger.Error("failed to clean up conflict IPs", zap.Error(cleanErr))
return cleanErr
}
}

return err
}

// overwrite mac address
if len(conf.MacPrefix) != 0 {
hwAddr, err := networking.OverwriteHwAddress(logger, c.netns, conf.MacPrefix, args.IfName)
if err != nil {
return fmt.Errorf("failed to update hardware address for interface %s, maybe hardware_prefix(%s) is invalid: %v", args.IfName, conf.MacPrefix, err)
}
logger.Info("Override hardware address successfully", zap.String("interface", args.IfName), zap.String("hardware address", hwAddr))
}

// set txqueuelen
if conf.TxQueueLen != nil && *conf.TxQueueLen > 0 {
if err = networking.LinkSetTxqueueLen(args.IfName, int(*conf.TxQueueLen)); err != nil {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,10 @@ var _ = Describe("MacvlanOverlayOne", Label("overlay", "one-nic", "coordinator")
Expect(frame.CreateSpiderMultusInstance(nad)).NotTo(HaveOccurred())

DeferCleanup(func() {
if CurrentSpecReport().Failed() {
GinkgoWriter.Println("If the use case fails, the cleanup step will be skipped")
return
}
GinkgoWriter.Printf("delete spiderMultusConfig %v/%v. \n", namespace, multusNadName)
Expect(frame.DeleteSpiderMultusInstance(namespace, multusNadName)).NotTo(HaveOccurred())

Expand Down Expand Up @@ -698,6 +702,10 @@ var _ = Describe("MacvlanOverlayOne", Label("overlay", "one-nic", "coordinator")
Expect(frame.CreateSpiderMultusInstance(nad)).NotTo(HaveOccurred())

DeferCleanup(func() {
if CurrentSpecReport().Failed() {
GinkgoWriter.Println("If the use case fails, the cleanup step will be skipped")
return
}
GinkgoWriter.Printf("delete spiderMultusConfig %v/%v. \n", namespace, multusNadName)
Expect(frame.DeleteSpiderMultusInstance(namespace, multusNadName)).NotTo(HaveOccurred())

Expand Down

0 comments on commit 082a68e

Please sign in to comment.