Skip to content

Commit

Permalink
Merge pull request #2655 from cyclinder/coordinator/opt_policy_routing
Browse files Browse the repository at this point in the history
coordinator: improve tuning policy of routes to support kubevirt
  • Loading branch information
weizhoublue authored Nov 26, 2023
2 parents 00fe216 + eafb00b commit 5de6fcb
Show file tree
Hide file tree
Showing 7 changed files with 59 additions and 182 deletions.
182 changes: 32 additions & 150 deletions cmd/coordinator/cmd/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -413,33 +413,41 @@ func (c *coordinator) setupHostRoutes(logger *zap.Logger) error {
// tunePodRoutes make sure that move all routes of podDefaultRouteNIC interface to main table, and move original routes
// in main table to new table
func (c *coordinator) tunePodRoutes(logger *zap.Logger, configDefaultRouteNIC string) error {
if configDefaultRouteNIC == "" {
// by default, We always think currentInterface as pod default router interface
configDefaultRouteNIC = c.currentInterface
}

exist, err := networking.CheckInterfaceExist(c.netns, configDefaultRouteNIC)
if err != nil {
logger.Error("failed to CheckInterfaceExist", zap.String("interface", configDefaultRouteNIC), zap.Error(err))
return fmt.Errorf("failed to CheckInterfaceExist: %v", err)
}

if !exist {
return fmt.Errorf("podDefaultRouteNIC: %s don't exist in pod", configDefaultRouteNIC)
}

podDefaultRouteNIC, err := networking.GetDefaultRouteInterface(c.ipFamily, c.currentInterface, c.netns)
var err error
var podDefaultRouteNIC, moveRouteInterface string
podDefaultRouteNIC, err = networking.GetDefaultRouteInterface(c.ipFamily, c.currentInterface, c.netns)
if err != nil {
logger.Error("failed to GetDefaultRouteInterface", zap.Error(err))
return fmt.Errorf("failed to GetDefaultRouteInterface: %v", err)
}

if podDefaultRouteNIC == "" {
// TODO(cyclinder): should we be return?
// the current interface's default route no found, we can keep all routes of
// this nic in main table, and don't tune the routes
logger.Warn("podDefaultRouteNIC no found in pod, ignore tuneRoutes")
return nil
}
logger.Sugar().Infof("podDefaultRouteNIC: %v", podDefaultRouteNIC)

if configDefaultRouteNIC == "" || configDefaultRouteNIC == podDefaultRouteNIC {
// configDefaultRouteNIC is empty by default. In that case we always keep all routes of the
// first NIC in the main table and move all routes of the non-first NICs to a policy routing table.
// See https://github.com/spidernet-io/spiderpool/issues/2176.
configDefaultRouteNIC = podDefaultRouteNIC
moveRouteInterface = c.currentInterface
} else {
exist, err := networking.CheckInterfaceExist(c.netns, configDefaultRouteNIC)
if err != nil {
logger.Error("failed to CheckInterfaceExist", zap.String("interface", configDefaultRouteNIC), zap.Error(err))
return fmt.Errorf("failed to CheckInterfaceExist: %v", err)
}

if !exist {
return fmt.Errorf("podDefaultRouteNIC: %s don't exist in pod", configDefaultRouteNIC)
}
moveRouteInterface = podDefaultRouteNIC
}

logger.Debug("Start Move Pod's routes", zap.String("configDefaultRouteNIC", configDefaultRouteNIC), zap.String("moveRouteInterface", moveRouteInterface))

// make sure that traffic sent from current interface to lookup table <ruleTable>
// eq: ip rule add from <currentInterfaceIPAddress> lookup <ruleTable>
Expand All @@ -452,38 +460,7 @@ func (c *coordinator) tunePodRoutes(logger *zap.Logger, configDefaultRouteNIC st

logger.Sugar().Infof("defaultInterfaceAddress: %v", defaultInterfaceAddress)

// get all routes of current interface
currentInterfaceRoutes, err := networking.GetRoutesByName(c.currentInterface, c.ipFamily)
if err != nil {
logger.Error("failed to GetRoutesByName", zap.Error(err))
return fmt.Errorf("failed to GetRoutesByName: %v", err)
}

logger.Sugar().Infof("currentInterfaceRoutes: %v", currentInterfaceRoutes)

// get all routes of default route interface
defaultInterfaceRoutes, err := networking.GetRoutesByName(podDefaultRouteNIC, c.ipFamily)
if err != nil {
logger.Error("failed to GetRoutesByName", zap.Error(err))
return fmt.Errorf("failed to GetRoutesByName: %v", err)
}

logger.Sugar().Infof("defaultInterfaceRoutes: %v", defaultInterfaceRoutes)

if configDefaultRouteNIC == c.currentInterface {
for idx, route := range defaultInterfaceRoutes {
zeroIPAddress := net.IPv4zero
if defaultInterfaceRoutes[idx].Family == netlink.FAMILY_V6 {
zeroIPAddress = net.IPv6zero
}
if !route.Dst.IP.Equal(zeroIPAddress) {
if err := networking.AddToRuleTable(defaultInterfaceRoutes[idx].Dst, c.currentRuleTable); err != nil {
logger.Error("failed to AddToRuleTable", zap.Error(err))
return fmt.Errorf("failed to AddToRuleTable: %v", err)
}
}
}

for idx := range defaultInterfaceAddress {
ipNet := networking.ConvertMaxMaskIPNet(defaultInterfaceAddress[idx].IP)
err = networking.AddFromRuleTable(ipNet, c.currentRuleTable)
Expand All @@ -492,55 +469,7 @@ func (c *coordinator) tunePodRoutes(logger *zap.Logger, configDefaultRouteNIC st
return err
}
}

// move all routes of the specified interface to a new route table
if err = networking.MoveRouteTable(logger, podDefaultRouteNIC, unix.RT_TABLE_MAIN, c.currentRuleTable, c.ipFamily); err != nil {
return err
}

} else if configDefaultRouteNIC == podDefaultRouteNIC {
for idx, route := range currentInterfaceRoutes {
zeroIPAddress := net.IPv4zero
if defaultInterfaceRoutes[idx].Family == netlink.FAMILY_V6 {
zeroIPAddress = net.IPv6zero
}
if !route.Dst.IP.Equal(zeroIPAddress) {
if err := networking.AddToRuleTable(currentInterfaceRoutes[idx].Dst, c.currentRuleTable); err != nil {
logger.Error("failed to AddToRuleTable", zap.Error(err))
return fmt.Errorf("failed to AddToRuleTable: %v", err)
}
}
}

for idx := range c.currentAddress {
ipNet := networking.ConvertMaxMaskIPNet(c.currentAddress[idx].IP)
err = networking.AddFromRuleTable(ipNet, c.currentRuleTable)
if err != nil {
logger.Error("failed to AddFromRuleTable", zap.Error(err))
return err
}
}

// move all routes of the specified interface from src rule table to dst route table
if err = networking.MoveRouteTable(logger, c.currentInterface, unix.RT_TABLE_MAIN, c.currentRuleTable, c.ipFamily); err != nil {
return err
}
} else {
// that's mean there are more than 2 interfaces in pod, and
// configDefaultRouteNIC's routes in a new rule table
// we should move configDefaultRouteNIC's routes to main and
// move currentInterface's routes to new rule table

// move current interface's routes to new rule table
for idx, route := range currentInterfaceRoutes {
if route.Dst != nil {
if err := networking.AddToRuleTable(currentInterfaceRoutes[idx].Dst, c.currentRuleTable); err != nil {
logger.Error("failed to AddToRuleTable", zap.Error(err))
return fmt.Errorf("failed to AddToRuleTable: %v", err)
}
}
}

for idx := range c.currentAddress {
ipNet := networking.ConvertMaxMaskIPNet(c.currentAddress[idx].IP)
err = networking.AddFromRuleTable(ipNet, c.currentRuleTable)
Expand All @@ -549,65 +478,18 @@ func (c *coordinator) tunePodRoutes(logger *zap.Logger, configDefaultRouteNIC st
return err
}
}

// move current interface's routes to new rule table
if err = networking.MoveRouteTable(logger, c.currentInterface, unix.RT_TABLE_MAIN, c.currentRuleTable, c.ipFamily); err != nil {
return err
}

routes, err := networking.GetRoutesByName(configDefaultRouteNIC, c.ipFamily)
if err != nil {
return fmt.Errorf("failed to GetRoutesByName for configDefaultRouteNIC: %v", err)
}

address, err := networking.GetAddersByName(configDefaultRouteNIC, c.ipFamily)
if err != nil {
return fmt.Errorf("failed to GetAddrs for configDefaultRouteNIC: %v", err)
}

ruleTable := c.mustGetRuleNumber(c.podNics)
if ruleTable < 0 {
return fmt.Errorf("coordinator must be working with spiderpool: no spiderendpoint records found")
}

// 1. cleanup ip rule to cidr for configDefaultRouteNIC interface
for idx := range routes {
if routes[idx].Dst != nil {
if err = networking.DelToRuleTable(routes[idx].Dst, ruleTable); err != nil {
return fmt.Errorf("failed to DelToRuleTable: %v", err)
}
}
}

// 2. cleanup ip rule from cidr for configDefaultRouteNIC interface
for idx := range address {
if routes[idx].Dst != nil {
if err = networking.DelFromRuleTable(address[idx].IPNet, ruleTable); err != nil {
return fmt.Errorf("failed to DelToRuleTable: %v", err)
}
}
}

// 3. move configDefaultRouteNIC interface's routes to main table
if err = networking.MoveRouteTable(logger, configDefaultRouteNIC, ruleTable, unix.RT_TABLE_MAIN, c.ipFamily); err != nil {
return err
}
}
// move all routes of the specified interface to a new route table
if err = networking.MoveRouteTable(logger, moveRouteInterface, unix.RT_TABLE_MAIN, c.currentRuleTable, c.ipFamily); err != nil {
return err
}

// for idx, _ := range c.hostIPRouteForPod {
// ipNet := networking.ConvertMaxMaskIPNet(c.hostIPRouteForPod[idx])
// if err = networking.DelToRuleTable(ipNet, c.hostRuleTable); err != nil {
// logger.Error("failed to AddToRuleTable", zap.String("Dst", ipNet.String()), zap.Error(err))
// // return fmt.Errorf("failed to AddToRuleTable: %v", err)
// }
// }

logger.Info("tunePodRoutes successfully", zap.String("configDefaultRouteInterface", configDefaultRouteNIC), zap.String("currentDefaultRouteInterface", podDefaultRouteNIC))
logger.Info("tunePodRoutes successfully")
return nil
})

if err != nil {
logger.Error("failed to moveRouteTable for routeMoveInterface", zap.String("routeMoveInterface", configDefaultRouteNIC), zap.Error(err))
logger.Error("failed to moveRouteTable for routeMoveInterface", zap.Error(err))
return err
}

Expand Down
2 changes: 1 addition & 1 deletion docs/concepts/coordinator-zh_CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ Spiderpool 内置一个叫 `coordinator` 的 CNI meta-plugin, 它在 Main CNI
| type | CNI 的类型 | 字符串 | required |coordinator |
| mode | coordinator 运行的模式. "auto": coordinator 自动判断运行在 Underlay 或者 Overlay; "underlay": 为 Pod 创建一对 Veth 设备,用于转发集群东西向流量。由 Pod 的 Underlay 网卡转发南北向流量; "overlay": 不额外创建 veth 设备,运行在多网卡模式。由 overlay 类型的 CNI(calico,cilium) 转发集群东西向流量,由 underlay 网卡转发南北向流量; "disable": 禁用 coordinator | 字符串 | optional | auto |
| tunePodRoutes | Pod 多网卡模式下,是否调协 Pod 的路由,解决访问来回路径不一致的问题 | 布尔型 | optional | true |
| podDefaultRouteNic | 配置 Pod 的默认路由网卡 | 字符串 | optional | "" |
| podDefaultRouteNic | Pod 多网卡时,配置 Pod 的默认路由网卡。默认为 "", 其 value 实际为 Pod 第一张拥有默认路由的网卡| 字符串 | optional | "" |
| podDefaultCniNic | K8s 中 Pod 默认的第一张网卡 | 字符串 | optional | eth0 |
| detectGateway | 创建 Pod 时是否检查网关是否可达 | 布尔型 | optional | false |
| detectIPConflict | 创建 Pod 时是否检查 Pod 的 IP 是否可达 | 布尔型 | optional | false |
Expand Down
2 changes: 1 addition & 1 deletion docs/concepts/coordinator.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ Let's delve into how coordinator implements these features.
| type | The name of this Spidercoordinators resource | string | required |coordinator |
| mode | the mode in which the coordinator run. "auto": Automatically determine if it's overlay or underlay; "underlay": All NICs for pods are underlay NICs, and in this case the coordinator will create veth-pairs device to solve the problem of underlay pods accessing services; "overlay": The coordinator does not create veth-pair devices, but the first NIC of the pod cannot be an underlay NIC, which is created by overlay CNI (e.g. calico, cilium). Solve the problem of pod access to service through the first NIC; "disable": The coordinator does nothing and exits directly | string | optional | auto |
| tunePodRoutes | Tune the pod's routing tables while a pod is in multi-NIC mode | bool | optional | true |
| podDefaultRouteNic | Configure the default routed NIC for the pod while a pod is in multi-NIC mode | string | optional | "" |
| podDefaultRouteNic | Configure the default routed NIC for the pod while a pod is in multi-NIC mode. The default value is "", which means the first NIC of the pod that has a default route is used. | string | optional | "" |
| podDefaultCniNic | The name of the pod's first NIC defaults to eth0 in kubernetes | string | optional | eth0 |
| detectGateway | Enable gateway detection while creating pods, which prevent pod creation if the gateway is unreachable | bool | optional | false |
| detectIPConflict | Enable IP conflicting checking for pods, which prevent pod creation if the pod's ip is conflicting | bool | optional | false |
Expand Down
15 changes: 5 additions & 10 deletions docs/usage/underlay_cni_service-zh_CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,26 +76,21 @@ kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future versi
# ip rule
0: from all lookup local
32759: from 10.233.105.154 lookup 100
32761: from all to 169.254.1.1 lookup 100
32762: from all to 10.233.64.0/18 lookup 100
32763: from all to 10.233.0.0/18 lookup 100
32765: from all to 10.6.212.102 lookup 100
32766: from all lookup main
32767: from all lookup default
# ip r
default via 10.6.0.1 dev net1
10.6.0.0/16 dev net1 proto kernel scope link src 10.6.212.227
# ip r show table 100
default via 169.254.1.1 dev eth0
10.6.212.102 dev eth0 scope link
10.233.0.0/18 via 10.6.212.102 dev eth0
10.233.64.0/18 via 10.6.212.102 dev eth0
169.254.1.1 dev eth0 scope link
# ip r show table 100
default via 10.6.0.1 dev net1
10.6.0.0/16 dev net1 proto kernel scope link src 10.6.212.227
```

- **32759: from 10.233.105.154 lookup 100**: 确保从 `eth0` (calico 网卡)发出的数据包走 table 100
- **32762: from all to 10.233.64.0/18 lookup 100**: 确保 Pod 访问 ClusterIP 时走 table 100,从 `eth0` 转发出去。
- 默认情况下,net1 的所有子网路由保留在 Main 表; `eth0` 的子网路由保留在 Table 100。
- **32759: from 10.233.105.154 lookup 100**: 确保从 `eth0` (calico 网卡)发出的数据包走 table 100。
- 默认情况下: 除了默认路由,所有路由都保留在 Main 表,但会把 net1 的默认路由移动到 table 100。

这些策略路由确保多网卡场景下,Underlay Pod 也能够正常访问 Service。

Expand Down
17 changes: 6 additions & 11 deletions docs/usage/underlay_cni_service.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,30 +85,25 @@ When creating a Pod in Overlay mode and entering the Pod network command space,
```shell
root@controller:~# kubectl exec -it macvlan-overlay-97bf89fdd-kdgrb sh
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
#

# ip rule
0: from all lookup local
32759: from 10.233.105.154 lookup 100
32761: from all to 169.254.1.1 lookup 100
32762: from all to 10.233.64.0/18 lookup 100
32763: from all to 10.233.0.0/18 lookup 100
32765: from all to 10.6.212.102 lookup 100
32765: from 10.6.212.227 lookup 100
32766: from all lookup main
32767: from all lookup default
# ip r
default via 10.6.0.1 dev net1
10.6.0.0/16 dev net1 proto kernel scope link src 10.6.212.227
# ip r show table 100
default via 169.254.1.1 dev eth0
10.6.212.102 dev eth0 scope link
10.233.0.0/18 via 10.6.212.102 dev eth0
10.233.64.0/18 via 10.6.212.102 dev eth0
169.254.1.1 dev eth0 scope link
# ip r show table 100
default via 10.6.0.1 dev net1
10.6.0.0/16 dev net1 proto kernel scope link src 10.6.212.227
```

- **32759: from 10.233.105.154 lookup 100**: Ensure that packets sent from `eth0` (calico network card) go through table 100
- **32762: from all to 10.233.64.0/18 lookup 100**: Ensure that when Pods access ClusterIP, they go through table 100 and are forwarded out from `eth0`.
- By default, all subnet routes of net1 are reserved in the Main table; subnet routes of `eth0` are reserved in Table 100.
- In the default configuration: Except for the default route, all routes are retained in the Main table, but the default route for 'net1' is moved to table 100.

These policy routes ensure that Underlay Pods can also normally access Service in multi-network card scenarios.

Expand Down
15 changes: 10 additions & 5 deletions pkg/networking/networking/route.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,12 +178,17 @@ func MoveRouteTable(logger *zap.Logger, iface string, srcRuleTable, dstRuleTable
}

if route.LinkIndex == link.Attrs().Index {
if err = netlink.RouteDel(&route); err != nil {
logger.Error("failed to RouteDel in main", zap.String("route", route.String()), zap.Error(err))
return fmt.Errorf("failed to RouteDel %s in main table: %+v", route.String(), err)
// only delete default route
if route.Dst == nil || route.Dst.IP.Equal(net.IPv4zero) {
if err = netlink.RouteDel(&route); err != nil {
logger.Error("failed to RouteDel in main", zap.String("route", route.String()), zap.Error(err))
return fmt.Errorf("failed to RouteDel %s in main table: %+v", route.String(), err)
}
logger.Debug("Del the default route from main successfully", zap.String("Route", route.String()))
}
logger.Debug("Del the route from main successfully", zap.String("Route", route.String()))

// We need to copy all routes of the podDefaultRouteNic from the main table to dstRuleTable.
// Otherwise, reply packets that are looked up in dstRuleTable would have no route to follow.
route.Table = dstRuleTable
if err = netlink.RouteAdd(&route); err != nil && !os.IsExist(err) {
logger.Error("failed to RouteAdd in new table ", zap.String("route", route.String()), zap.Error(err))
Expand Down Expand Up @@ -220,7 +225,7 @@ func MoveRouteTable(logger *zap.Logger, iface string, srcRuleTable, dstRuleTable
continue
}

logger.Debug("deletedRoute", zap.String("deletedRoute", deletedRoute.String()))
logger.Debug("Deleting IPv6 DefaultRoute", zap.String("deletedRoute", deletedRoute.String()))
if err := netlink.RouteDel(deletedRoute); err != nil {
logger.Error("failed to RouteDel for IPv6", zap.String("Route", route.String()), zap.Error(err))
return fmt.Errorf("failed to RouteDel %v for IPv6: %+v", route.String(), err)
Expand Down
Loading

0 comments on commit 5de6fcb

Please sign in to comment.