diff --git a/cmd/coordinator/cmd/cni_types.go b/cmd/coordinator/cmd/cni_types.go index 1f4630ba65..81f0b31d26 100644 --- a/cmd/coordinator/cmd/cni_types.go +++ b/cmd/coordinator/cmd/cni_types.go @@ -285,11 +285,11 @@ func ValidateDelectOptions(config *DetectOptions) (*DetectOptions, error) { } if config.Interval == "" { - config.Interval = "1s" + config.Interval = "10ms" } if config.TimeOut == "" { - config.TimeOut = "3s" + config.TimeOut = "100ms" } _, err := time.ParseDuration(config.Interval) diff --git a/cmd/coordinator/cmd/command_add.go b/cmd/coordinator/cmd/command_add.go index 94718b64bd..1d0057c75c 100644 --- a/cmd/coordinator/cmd/command_add.go +++ b/cmd/coordinator/cmd/command_add.go @@ -189,8 +189,58 @@ func CmdAdd(args *skel.CmdArgs) (err error) { logger.Sugar().Infof("Get coordinator config: %+v", c) - errg := errgroup.Group{} - // we do detect gateway connection firstly + errgConflict := errgroup.Group{} + // IP conflict detection must precede gateway detection, which avoids the + // possibility that gateway detection may update arp table entries first and cause + // communication problems when IP conflict detection fails + // see https://github.com/spidernet-io/spiderpool/issues/4475 + var ipc *ipchecking.IPChecker + if conf.IPConflict != nil && *conf.IPConflict { + logger.Debug("Try to detect ip conflict") + ipc, err = ipchecking.NewIPChecker(conf.DetectOptions.Retry, conf.DetectOptions.Interval, conf.DetectOptions.TimeOut, c.hostNs, c.netns, logger) + if err != nil { + return fmt.Errorf("failed to run NewIPChecker: %w", err) + } + ipc.DoIPConflictChecking(prevResult.IPs, c.currentInterface, &errgConflict) + } else { + logger.Debug("disable detect ip conflict") + } + + if err = errgConflict.Wait(); err != nil { + logger.Error("failed to detect ip conflict", zap.Error(err)) + if errors.Is(err, constant.ErrIPConflict) { + logger.Info("ip conflict detected, clean up conflict IPs") + _, innerErr := client.Daemonset.DeleteIpamIps(daemonset.NewDeleteIpamIpsParams().WithContext(context.TODO()).WithIpamBatchDelArgs( + &models.IpamBatchDelArgs{ + ContainerID: &args.ContainerID, + NetNamespace: args.Netns, + PodName: (*string)(&k8sArgs.K8S_POD_NAME), + PodNamespace: (*string)(&k8sArgs.K8S_POD_NAMESPACE), + PodUID: (*string)(&k8sArgs.K8S_POD_UID), + }, + )) + if innerErr != nil { + logger.Sugar().Errorf("failed to clean up conflict IPs, error: %v", innerErr) + return multierr.Append(err, innerErr) + } + } + return err + } + + // Fixed Mac addresses must come after IP conflict detection, otherwise the switch learns to communicate + // with the wrong Mac address when IP conflict detection fails + if len(conf.MacPrefix) != 0 { + hwAddr, err := networking.OverwriteHwAddress(logger, c.netns, conf.MacPrefix, args.IfName) + if err != nil { + return fmt.Errorf("failed to update hardware address for interface %s, maybe hardware_prefix(%s) is invalid: %v", args.IfName, conf.MacPrefix, err) + } + logger.Info("Fix mac address successfully", zap.String("interface", args.IfName), zap.String("macAddress", hwAddr)) + } + + // we do detect gateway connection lastly + // Finally, there is gateway detection, which updates the correct arp table entries + // once there are no IP address conflicts and fixed Mac addresses + errgGateway := errgroup.Group{} if conf.DetectGateway != nil && *conf.DetectGateway { logger.Debug("Try to detect gateway") @@ -202,7 +252,6 @@ func CmdAdd(args *skel.CmdArgs) (err error) { return fmt.Errorf("failed to GetDefaultGatewayByName: %v", err) } logger.Debug("Get GetDefaultGatewayByName", zap.Any("Gws", gws)) - p, err := gwconnection.New(conf.DetectOptions.Retry, conf.DetectOptions.Interval, conf.DetectOptions.TimeOut, c.currentInterface, logger) if err != nil { return fmt.Errorf("failed to init the gateway client: %v", err) @@ -211,10 +260,10 @@ func CmdAdd(args *skel.CmdArgs) (err error) { for _, gw := range gws { if gw.To4() != nil { p.V4Gw = gw - errg.Go(c.hostNs, c.netns, p.ArpingOverIface) + errgGateway.Go(c.hostNs, c.netns, p.ArpingOverIface) } else { p.V6Gw = gw - errg.Go(c.hostNs, c.netns, p.NDPingOverIface) + errgGateway.Go(c.hostNs, c.netns, p.NDPingOverIface) } } return nil @@ -226,21 +275,10 @@ func CmdAdd(args *skel.CmdArgs) (err error) { logger.Debug("disable detect gateway") } - var ipc *ipchecking.IPChecker - if conf.IPConflict != nil && *conf.IPConflict { - logger.Debug("Try to detect ip conflict") - ipc, err = ipchecking.NewIPChecker(conf.DetectOptions.Retry, conf.DetectOptions.Interval, conf.DetectOptions.TimeOut, c.hostNs, c.netns, logger) - if err != nil { - return fmt.Errorf("failed to run NewIPChecker: %w", err) - } - ipc.DoIPConflictChecking(prevResult.IPs, c.currentInterface, &errg) - } else { - logger.Debug("disable detect ip conflict") - } - - if err = errg.Wait(); err != nil { - logger.Error("failed to detect gateway and ip checking", zap.Error(err)) - if errors.Is(err, constant.ErrIPConflict) || errors.Is(err, constant.ErrGatewayUnreachable) { + if err = errgGateway.Wait(); err != nil { + logger.Error("failed to detect gateway reachable", zap.Error(err)) + if errors.Is(err, constant.ErrGatewayUnreachable) { + logger.Info("gateway unreachable detected, clean up conflict IPs") _, innerErr := client.Daemonset.DeleteIpamIps(daemonset.NewDeleteIpamIpsParams().WithContext(context.TODO()).WithIpamBatchDelArgs( &models.IpamBatchDelArgs{ ContainerID: &args.ContainerID, @@ -255,19 +293,9 @@ func CmdAdd(args *skel.CmdArgs) (err error) { return multierr.Append(err, innerErr) } } - return err } - // overwrite mac address - if len(conf.MacPrefix) != 0 { - hwAddr, err := networking.OverwriteHwAddress(logger, c.netns, conf.MacPrefix, args.IfName) - if err != nil { - return fmt.Errorf("failed to update hardware address for interface %s, maybe hardware_prefix(%s) is invalid: %v", args.IfName, conf.MacPrefix, err) - } - logger.Info("Override hardware address successfully", zap.String("interface", args.IfName), zap.String("hardware address", hwAddr)) - } - // set txqueuelen if conf.TxQueueLen != nil && *conf.TxQueueLen > 0 { if err = networking.LinkSetTxqueueLen(args.IfName, int(*conf.TxQueueLen)); err != nil { diff --git a/docs/concepts/coordinator-zh_CN.md b/docs/concepts/coordinator-zh_CN.md index 4a958314de..4e36be74e1 100644 --- a/docs/concepts/coordinator-zh_CN.md +++ b/docs/concepts/coordinator-zh_CN.md @@ -41,7 +41,7 @@ EOF | podRPFilter | 设置 Pod 的 sysctl 参数 rp_filter | 整数型 | optional | 0 | | hostRPFilter | (遗弃)设置节点 的 sysctl 参数 rp_filter | 整数型 | optional | 0 | | txQueueLen | 设置 Pod 的网卡传输队列 | 整数型 | optional | 0 | -| detectOptions | 检测地址冲突和网关可达性的高级配置项: 包括重试次数(默认为 3 次), 探测间隔(默认为 1s) 和 超时时间(默认为 1s) | 对象类型 | optional | 空 | +| detectOptions | 检测地址冲突和网关可达性的高级配置项: 包括重试次数(默认为 3 次), 探测间隔(默认为 10ms) 和 超时时间(默认为 100ms) | 对象类型 | optional | 空 | | logOptions | 日志配置,包括 logLevel(默认为 debug) 和 logFile(默认为 /var/log/spidernet/coordinator.log) | 对象类型 | optional | - | > 如果您通过 `SpinderMultusConfig CR` 帮助创建 NetworkAttachmentDefinition CR,您可以在 `SpinderMultusConfig` 中配置 `coordinator` (所有字段)。参考: [SpinderMultusConfig](../reference/crd-spidermultusconfig.md)。 @@ -171,7 +171,7 @@ spec: vethLinkAddress: "169.254.200.1" ``` -> `vethLinkAddress` 默认为空,表示不配置。不为空则必须是一个合法的本地链路地址。 +> `vethLinkAddress` 默认为空,表示不配置。不为空则必须是一个合法的本地链路地址。 ## 自动获取集群 Service 的 CIDR diff --git a/docs/concepts/coordinator.md b/docs/concepts/coordinator.md index ffd053f5c8..d6e84a4056 100644 --- a/docs/concepts/coordinator.md +++ b/docs/concepts/coordinator.md @@ -42,7 +42,7 @@ Let's delve into how coordinator implements these features. | podRPFilter | Set the rp_filter sysctl parameter on the pod, which is recommended to be set to 0 | int | optional | 0 | | hostRPFilter | (deprecated)Set the rp_filter sysctl parameter on the node, which is recommended to be set to 0 | int | optional | 0 | | txQueueLen | set txqueuelen(Transmit Queue Length) of the pod's interface | int | optional | 0 | -| detectOptions | The advanced configuration of detectGateway and detectIPConflict, including retry numbers(default is 3), interval(default is 1s) and timeout(default is 1s) | obejct | optional | nil | +| detectOptions | The advanced configuration of detectGateway and detectIPConflict, including retry numbers(default is 3), interval(default is 10ms) and timeout(default is 100ms) | obejct | optional | nil | | logOptions | The configuration of logging, including logLevel(default is debug) and logFile(default is /var/log/spidernet/coordinator.log) | obejct | optional | nil | > You can configure `coordinator` by specifying all the relevant fields in `SpinderMultusConfig` if a NetworkAttachmentDefinition CR is created via `SpinderMultusConfig CR`. For more information, please refer to [SpinderMultusConfig](../reference/crd-spidermultusconfig.md). @@ -101,17 +101,16 @@ spec: > Note: There are some switches that are not allowed to be probed by arp, otherwise an alarm will be issued, in this case, we need to set detectGateway to false - ## Fix MAC address prefix for Pods(alpha) -Some traditional applications may require a fixed MAC address or IP address to couple the behavior of the application. For example, the License Server may need to apply a fixed Mac address -or IP address to issue a license for the app. If the MAC address of a pod changes, the issued license may be invalid. Therefore, you need to fix the MAC address of the pod. Spiderpool can fix +Some traditional applications may require a fixed MAC address or IP address to couple the behavior of the application. For example, the License Server may need to apply a fixed Mac address +or IP address to issue a license for the app. If the MAC address of a pod changes, the issued license may be invalid. Therefore, you need to fix the MAC address of the pod. Spiderpool can fix the MAC address of the application through `coordinator`, and the fixed rule is to configure the MAC address prefix (2 bytes) + convert the IP of the pod (4 bytes). Note: > currently supports updating Macvlan and SR-IOV as pods for CNI. In IPVlan L2 mode, the MAC addresses of the primary interface and the sub-interface are the same and cannot be modified. -> +> > The fixed rule is to configure the MAC address prefix (2 bytes) + the IP of the converted pod (4 bytes). An IPv4 address is 4 bytes long and can be fully converted to 2 hexadecimal numbers. For IPv6 addresses, only the last 4 bytes are taken. We can configure it via Spidermultusconfig: diff --git a/test/e2e/coordinator/macvlan-overlay-one/macvlan_overlay_one_test.go b/test/e2e/coordinator/macvlan-overlay-one/macvlan_overlay_one_test.go index f72be43b4e..1b4833fbdd 100644 --- a/test/e2e/coordinator/macvlan-overlay-one/macvlan_overlay_one_test.go +++ b/test/e2e/coordinator/macvlan-overlay-one/macvlan_overlay_one_test.go @@ -419,6 +419,10 @@ var _ = Describe("MacvlanOverlayOne", Label("overlay", "one-nic", "coordinator") Expect(frame.CreateSpiderMultusInstance(nad)).NotTo(HaveOccurred()) DeferCleanup(func() { + if CurrentSpecReport().Failed() { + GinkgoWriter.Println("If the use case fails, the cleanup step will be skipped") + return + } GinkgoWriter.Printf("delete spiderMultusConfig %v/%v. \n", namespace, multusNadName) Expect(frame.DeleteSpiderMultusInstance(namespace, multusNadName)).NotTo(HaveOccurred()) @@ -698,6 +702,10 @@ var _ = Describe("MacvlanOverlayOne", Label("overlay", "one-nic", "coordinator") Expect(frame.CreateSpiderMultusInstance(nad)).NotTo(HaveOccurred()) DeferCleanup(func() { + if CurrentSpecReport().Failed() { + GinkgoWriter.Println("If the use case fails, the cleanup step will be skipped") + return + } GinkgoWriter.Printf("delete spiderMultusConfig %v/%v. \n", namespace, multusNadName) Expect(frame.DeleteSpiderMultusInstance(namespace, multusNadName)).NotTo(HaveOccurred())