-
Notifications
You must be signed in to change notification settings - Fork 81
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Detect IP conflicts before gateway detection to fix communication fail #4474
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,6 +27,7 @@ jobs: | |
uses: actions/[email protected] | ||
with: | ||
pattern: image-tar-spiderpool-*-${{ inputs.ref }} | ||
merge-multiple: true | ||
path: test/.download | ||
|
||
- name: List downloaded files | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -189,8 +189,58 @@ func CmdAdd(args *skel.CmdArgs) (err error) { | |
|
||
logger.Sugar().Infof("Get coordinator config: %+v", c) | ||
|
||
errg := errgroup.Group{} | ||
// we do detect gateway connection firstly | ||
errgConflict := errgroup.Group{} | ||
// IP conflict detection must precede gateway detection, which avoids the | ||
// possibility that gateway detection may update arp table entries first and cause | ||
// communication problems when IP conflict detection fails | ||
// see https://github.com/spidernet-io/spiderpool/issues/4475 | ||
var ipc *ipchecking.IPChecker | ||
if conf.IPConflict != nil && *conf.IPConflict { | ||
logger.Debug("Try to detect ip conflict") | ||
ipc, err = ipchecking.NewIPChecker(conf.DetectOptions.Retry, conf.DetectOptions.Interval, conf.DetectOptions.TimeOut, c.hostNs, c.netns, logger) | ||
if err != nil { | ||
return fmt.Errorf("failed to run NewIPChecker: %w", err) | ||
} | ||
ipc.DoIPConflictChecking(prevResult.IPs, c.currentInterface, &errgConflict) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 这个超时控制 在哪体现的 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ipc, err = ipchecking.NewIPChecker(conf.DetectOptions.Retry, conf.DetectOptions.Interval, conf.DetectOptions.TimeOut, c.hostNs, c.netns, logger) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (1) 这默认超时时间多少,在代码上 comment 下,这个变量 不容易 查源头的值 (3) 缺省值 是 1s ?太长了
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 这些信息会在 final configuration 中打印,类似:
多少合适?100ms? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 100ms 足够够了,并且最好要 实测下 |
||
} else { | ||
logger.Debug("disable detect ip conflict") | ||
} | ||
|
||
if err = errgConflict.Wait(); err != nil { | ||
logger.Error("failed to detect ip conflict", zap.Error(err)) | ||
if errors.Is(err, constant.ErrIPConflict) { | ||
logger.Info("ip conflict detected, clean up conflict IPs") | ||
_, innerErr := client.Daemonset.DeleteIpamIps(daemonset.NewDeleteIpamIpsParams().WithContext(context.TODO()).WithIpamBatchDelArgs( | ||
&models.IpamBatchDelArgs{ | ||
ContainerID: &args.ContainerID, | ||
NetNamespace: args.Netns, | ||
PodName: (*string)(&k8sArgs.K8S_POD_NAME), | ||
PodNamespace: (*string)(&k8sArgs.K8S_POD_NAMESPACE), | ||
PodUID: (*string)(&k8sArgs.K8S_POD_UID), | ||
}, | ||
)) | ||
if innerErr != nil { | ||
logger.Sugar().Errorf("failed to clean up conflict IPs, error: %v", innerErr) | ||
return multierr.Append(err, innerErr) | ||
} | ||
} | ||
return err | ||
} | ||
|
||
// Fixed Mac addresses must come after IP conflict detection, otherwise the switch learns to communicate | ||
// with the wrong Mac address when IP conflict detection fails | ||
if len(conf.MacPrefix) != 0 { | ||
hwAddr, err := networking.OverwriteHwAddress(logger, c.netns, conf.MacPrefix, args.IfName) | ||
if err != nil { | ||
return fmt.Errorf("failed to update hardware address for interface %s, maybe hardware_prefix(%s) is invalid: %v", args.IfName, conf.MacPrefix, err) | ||
} | ||
logger.Info("Fix mac address successfully", zap.String("interface", args.IfName), zap.String("macAddress", hwAddr)) | ||
} | ||
|
||
// we do detect gateway connection lastly | ||
// Finally, there is gateway detection, which updates the correct arp table entries | ||
// once there are no IP address conflicts and fixed Mac addresses | ||
errgGateway := errgroup.Group{} | ||
if conf.DetectGateway != nil && *conf.DetectGateway { | ||
logger.Debug("Try to detect gateway") | ||
|
||
|
@@ -202,7 +252,6 @@ func CmdAdd(args *skel.CmdArgs) (err error) { | |
return fmt.Errorf("failed to GetDefaultGatewayByName: %v", err) | ||
} | ||
logger.Debug("Get GetDefaultGatewayByName", zap.Any("Gws", gws)) | ||
|
||
p, err := gwconnection.New(conf.DetectOptions.Retry, conf.DetectOptions.Interval, conf.DetectOptions.TimeOut, c.currentInterface, logger) | ||
if err != nil { | ||
return fmt.Errorf("failed to init the gateway client: %v", err) | ||
|
@@ -211,10 +260,10 @@ func CmdAdd(args *skel.CmdArgs) (err error) { | |
for _, gw := range gws { | ||
if gw.To4() != nil { | ||
p.V4Gw = gw | ||
errg.Go(c.hostNs, c.netns, p.ArpingOverIface) | ||
errgGateway.Go(c.hostNs, c.netns, p.ArpingOverIface) | ||
} else { | ||
p.V6Gw = gw | ||
errg.Go(c.hostNs, c.netns, p.NDPingOverIface) | ||
errgGateway.Go(c.hostNs, c.netns, p.NDPingOverIface) | ||
} | ||
} | ||
return nil | ||
|
@@ -226,21 +275,10 @@ func CmdAdd(args *skel.CmdArgs) (err error) { | |
logger.Debug("disable detect gateway") | ||
} | ||
|
||
var ipc *ipchecking.IPChecker | ||
if conf.IPConflict != nil && *conf.IPConflict { | ||
logger.Debug("Try to detect ip conflict") | ||
ipc, err = ipchecking.NewIPChecker(conf.DetectOptions.Retry, conf.DetectOptions.Interval, conf.DetectOptions.TimeOut, c.hostNs, c.netns, logger) | ||
if err != nil { | ||
return fmt.Errorf("failed to run NewIPChecker: %w", err) | ||
} | ||
ipc.DoIPConflictChecking(prevResult.IPs, c.currentInterface, &errg) | ||
} else { | ||
logger.Debug("disable detect ip conflict") | ||
} | ||
|
||
if err = errg.Wait(); err != nil { | ||
logger.Error("failed to detect gateway and ip checking", zap.Error(err)) | ||
if errors.Is(err, constant.ErrIPConflict) || errors.Is(err, constant.ErrGatewayUnreachable) { | ||
if err = errgGateway.Wait(); err != nil { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 同理 检测 网关的 超时控制,在哪 |
||
logger.Error("failed to detect gateway reachable", zap.Error(err)) | ||
if errors.Is(err, constant.ErrGatewayUnreachable) { | ||
logger.Info("gateway unreachable detected, clean up conflict IPs") | ||
_, innerErr := client.Daemonset.DeleteIpamIps(daemonset.NewDeleteIpamIpsParams().WithContext(context.TODO()).WithIpamBatchDelArgs( | ||
&models.IpamBatchDelArgs{ | ||
ContainerID: &args.ContainerID, | ||
|
@@ -255,19 +293,9 @@ func CmdAdd(args *skel.CmdArgs) (err error) { | |
return multierr.Append(err, innerErr) | ||
} | ||
} | ||
|
||
return err | ||
} | ||
|
||
// overwrite mac address | ||
if len(conf.MacPrefix) != 0 { | ||
hwAddr, err := networking.OverwriteHwAddress(logger, c.netns, conf.MacPrefix, args.IfName) | ||
if err != nil { | ||
return fmt.Errorf("failed to update hardware address for interface %s, maybe hardware_prefix(%s) is invalid: %v", args.IfName, conf.MacPrefix, err) | ||
} | ||
logger.Info("Override hardware address successfully", zap.String("interface", args.IfName), zap.String("hardware address", hwAddr)) | ||
} | ||
|
||
// set txqueuelen | ||
if conf.TxQueueLen != nil && *conf.TxQueueLen > 0 { | ||
if err = networking.LinkSetTxqueueLen(args.IfName, int(*conf.TxQueueLen)); err != nil { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
补充 code comment,关于 每一步先后循序 原因说明
arp 问题,引用 issue来详细说明
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done