diff --git a/.github/workflows/trivy-scan-image.yaml b/.github/workflows/trivy-scan-image.yaml index ec79053bb0..f60b114029 100644 --- a/.github/workflows/trivy-scan-image.yaml +++ b/.github/workflows/trivy-scan-image.yaml @@ -27,6 +27,7 @@ jobs: uses: actions/download-artifact@v4.1.8 with: pattern: image-tar-spiderpool-*-${{ inputs.ref }} + merge-multiple: true path: test/.download - name: List downloaded files diff --git a/cmd/coordinator/cmd/command_add.go b/cmd/coordinator/cmd/command_add.go index 94718b64bd..1d0057c75c 100644 --- a/cmd/coordinator/cmd/command_add.go +++ b/cmd/coordinator/cmd/command_add.go @@ -189,8 +189,58 @@ func CmdAdd(args *skel.CmdArgs) (err error) { logger.Sugar().Infof("Get coordinator config: %+v", c) - errg := errgroup.Group{} - // we do detect gateway connection firstly + errgConflict := errgroup.Group{} + // IP conflict detection must precede gateway detection, which avoids the + // possibility that gateway detection may update arp table entries first and cause + // communication problems when IP conflict detection fails + // see https://github.com/spidernet-io/spiderpool/issues/4475 + var ipc *ipchecking.IPChecker + if conf.IPConflict != nil && *conf.IPConflict { + logger.Debug("Try to detect ip conflict") + ipc, err = ipchecking.NewIPChecker(conf.DetectOptions.Retry, conf.DetectOptions.Interval, conf.DetectOptions.TimeOut, c.hostNs, c.netns, logger) + if err != nil { + return fmt.Errorf("failed to run NewIPChecker: %w", err) + } + ipc.DoIPConflictChecking(prevResult.IPs, c.currentInterface, &errgConflict) + } else { + logger.Debug("disable detect ip conflict") + } + + if err = errgConflict.Wait(); err != nil { + logger.Error("failed to detect ip conflict", zap.Error(err)) + if errors.Is(err, constant.ErrIPConflict) { + logger.Info("ip conflict detected, clean up conflict IPs") + _, innerErr := client.Daemonset.DeleteIpamIps(daemonset.NewDeleteIpamIpsParams().WithContext(context.TODO()).WithIpamBatchDelArgs( + &models.IpamBatchDelArgs{ + ContainerID: &args.ContainerID, + NetNamespace: args.Netns, + PodName: (*string)(&k8sArgs.K8S_POD_NAME), + PodNamespace: (*string)(&k8sArgs.K8S_POD_NAMESPACE), + PodUID: (*string)(&k8sArgs.K8S_POD_UID), + }, + )) + if innerErr != nil { + logger.Sugar().Errorf("failed to clean up conflict IPs, error: %v", innerErr) + return multierr.Append(err, innerErr) + } + } + return err + } + + // Fixed Mac addresses must come after IP conflict detection, otherwise the switch learns to communicate + // with the wrong Mac address when IP conflict detection fails + if len(conf.MacPrefix) != 0 { + hwAddr, err := networking.OverwriteHwAddress(logger, c.netns, conf.MacPrefix, args.IfName) + if err != nil { + return fmt.Errorf("failed to update hardware address for interface %s, maybe hardware_prefix(%s) is invalid: %v", args.IfName, conf.MacPrefix, err) + } + logger.Info("Fix mac address successfully", zap.String("interface", args.IfName), zap.String("macAddress", hwAddr)) + } + + // we do detect gateway connection lastly + // Finally, there is gateway detection, which updates the correct arp table entries + // once there are no IP address conflicts and fixed Mac addresses + errgGateway := errgroup.Group{} if conf.DetectGateway != nil && *conf.DetectGateway { logger.Debug("Try to detect gateway") @@ -202,7 +252,6 @@ func CmdAdd(args *skel.CmdArgs) (err error) { return fmt.Errorf("failed to GetDefaultGatewayByName: %v", err) } logger.Debug("Get GetDefaultGatewayByName", zap.Any("Gws", gws)) - p, err := gwconnection.New(conf.DetectOptions.Retry, conf.DetectOptions.Interval, conf.DetectOptions.TimeOut, c.currentInterface, logger) if err != nil { return fmt.Errorf("failed to init the gateway client: %v", err) @@ -211,10 +260,10 @@ func CmdAdd(args *skel.CmdArgs) (err error) { for _, gw := range gws { if gw.To4() != nil { p.V4Gw = gw - errg.Go(c.hostNs, c.netns, p.ArpingOverIface) + errgGateway.Go(c.hostNs, c.netns, p.ArpingOverIface) } else { p.V6Gw = gw - errg.Go(c.hostNs, c.netns, p.NDPingOverIface) + errgGateway.Go(c.hostNs, c.netns, p.NDPingOverIface) } } return nil @@ -226,21 +275,10 @@ func CmdAdd(args *skel.CmdArgs) (err error) { logger.Debug("disable detect gateway") } - var ipc *ipchecking.IPChecker - if conf.IPConflict != nil && *conf.IPConflict { - logger.Debug("Try to detect ip conflict") - ipc, err = ipchecking.NewIPChecker(conf.DetectOptions.Retry, conf.DetectOptions.Interval, conf.DetectOptions.TimeOut, c.hostNs, c.netns, logger) - if err != nil { - return fmt.Errorf("failed to run NewIPChecker: %w", err) - } - ipc.DoIPConflictChecking(prevResult.IPs, c.currentInterface, &errg) - } else { - logger.Debug("disable detect ip conflict") - } - - if err = errg.Wait(); err != nil { - logger.Error("failed to detect gateway and ip checking", zap.Error(err)) - if errors.Is(err, constant.ErrIPConflict) || errors.Is(err, constant.ErrGatewayUnreachable) { + if err = errgGateway.Wait(); err != nil { + logger.Error("failed to detect gateway reachable", zap.Error(err)) + if errors.Is(err, constant.ErrGatewayUnreachable) { + logger.Info("gateway unreachable detected, clean up conflict IPs") _, innerErr := client.Daemonset.DeleteIpamIps(daemonset.NewDeleteIpamIpsParams().WithContext(context.TODO()).WithIpamBatchDelArgs( &models.IpamBatchDelArgs{ ContainerID: &args.ContainerID, @@ -255,19 +293,9 @@ func CmdAdd(args *skel.CmdArgs) (err error) { return multierr.Append(err, innerErr) } } - return err } - // overwrite mac address - if len(conf.MacPrefix) != 0 { - hwAddr, err := networking.OverwriteHwAddress(logger, c.netns, conf.MacPrefix, args.IfName) - if err != nil { - return fmt.Errorf("failed to update hardware address for interface %s, maybe hardware_prefix(%s) is invalid: %v", args.IfName, conf.MacPrefix, err) - } - logger.Info("Override hardware address successfully", zap.String("interface", args.IfName), zap.String("hardware address", hwAddr)) - } - // set txqueuelen if conf.TxQueueLen != nil && *conf.TxQueueLen > 0 { if err = networking.LinkSetTxqueueLen(args.IfName, int(*conf.TxQueueLen)); err != nil { diff --git a/test/e2e/coordinator/macvlan-overlay-one/macvlan_overlay_one_test.go b/test/e2e/coordinator/macvlan-overlay-one/macvlan_overlay_one_test.go index f72be43b4e..1b4833fbdd 100644 --- a/test/e2e/coordinator/macvlan-overlay-one/macvlan_overlay_one_test.go +++ b/test/e2e/coordinator/macvlan-overlay-one/macvlan_overlay_one_test.go @@ -419,6 +419,10 @@ var _ = Describe("MacvlanOverlayOne", Label("overlay", "one-nic", "coordinator") Expect(frame.CreateSpiderMultusInstance(nad)).NotTo(HaveOccurred()) DeferCleanup(func() { + if CurrentSpecReport().Failed() { + GinkgoWriter.Println("If the use case fails, the cleanup step will be skipped") + return + } GinkgoWriter.Printf("delete spiderMultusConfig %v/%v. \n", namespace, multusNadName) Expect(frame.DeleteSpiderMultusInstance(namespace, multusNadName)).NotTo(HaveOccurred()) @@ -698,6 +702,10 @@ var _ = Describe("MacvlanOverlayOne", Label("overlay", "one-nic", "coordinator") Expect(frame.CreateSpiderMultusInstance(nad)).NotTo(HaveOccurred()) DeferCleanup(func() { + if CurrentSpecReport().Failed() { + GinkgoWriter.Println("If the use case fails, the cleanup step will be skipped") + return + } GinkgoWriter.Printf("delete spiderMultusConfig %v/%v. \n", namespace, multusNadName) Expect(frame.DeleteSpiderMultusInstance(namespace, multusNadName)).NotTo(HaveOccurred())