diff --git a/cmd/antrea-agent/agent.go b/cmd/antrea-agent/agent.go index 28f8d607528..e141af1a862 100644 --- a/cmd/antrea-agent/agent.go +++ b/cmd/antrea-agent/agent.go @@ -140,6 +140,7 @@ func run(o *Options) error { enableNodePortLocal := features.DefaultFeatureGate.Enabled(features.NodePortLocal) && o.config.NodePortLocal.Enable l7NetworkPolicyEnabled := features.DefaultFeatureGate.Enabled(features.L7NetworkPolicy) enableMulticlusterGW := features.DefaultFeatureGate.Enabled(features.Multicluster) && o.config.Multicluster.EnableGateway + _, multiclusterEncryptionMode := config.GetTrafficEncryptionModeFromStr(o.config.Multicluster.TrafficEncryptionMode) enableMulticlusterNP := features.DefaultFeatureGate.Enabled(features.Multicluster) && o.config.Multicluster.EnableStretchedNetworkPolicy enableFLowExporter := features.DefaultFeatureGate.Enabled(features.FlowExporter) && o.config.FlowExporter.Enable @@ -199,7 +200,8 @@ func run(o *Options) error { IPsecConfig: config.IPsecConfig{ AuthenticationMode: ipsecAuthenticationMode, }, - EnableMulticlusterGW: enableMulticlusterGW, + EnableMulticlusterGW: enableMulticlusterGW, + MulticlusterEncryptionMode: multiclusterEncryptionMode, } wireguardConfig := &config.WireGuardConfig{ diff --git a/pkg/agent/agent.go b/pkg/agent/agent.go index 8a368bcfc32..c258f7c7dbd 100644 --- a/pkg/agent/agent.go +++ b/pkg/agent/agent.go @@ -69,6 +69,14 @@ const ( roundNumKey = "roundNum" // round number key in externalIDs. initialRoundNum = 1 maxRetryForRoundNumSave = 5 + // On Linux, OVS configures the MTU for tunnel interfaces to 65000. + // See https://github.com/openvswitch/ovs/blame/3e666ba000b5eff58da8abb4e8c694ac3f7b08d6/lib/dpif-netlink-rtnl.c#L348-L360 + // There are some edge cases (e.g., Kind clusters) where the transport Node's MTU may be + // larger than that (e.g., 65535), and packets may be dropped. To account for this, we use + // 65000 as an upper bound for the MTU calculated in getInterfaceMTU, when encap is + // supported. For simplicity's sake, we also use this upper bound for Windows, even if it + // does not apply. + ovsTunnelMaxMTU = 65000 ) var ( @@ -1092,7 +1100,7 @@ func (i *Initializer) waitForIPsecMonitorDaemon() error { // initializeWireguard checks if preconditions are met for using WireGuard and initializes WireGuard client or cleans up. func (i *Initializer) initializeWireGuard() error { - i.wireGuardConfig.MTU = i.nodeConfig.NodeTransportInterfaceMTU - config.WireGuardOverhead + i.wireGuardConfig.MTU = i.nodeConfig.NodeTransportInterfaceMTU - i.networkConfig.WireGuardMTUDeduction wgClient, err := wireguard.New(i.nodeConfig, i.wireGuardConfig) if err != nil { return err @@ -1195,10 +1203,13 @@ func (i *Initializer) getInterfaceMTU(transportInterface *net.Interface) (int, e isIPv6 := i.nodeConfig.NodeIPv6Addr != nil mtu -= i.networkConfig.CalculateMTUDeduction(isIPv6) - - if i.networkConfig.TrafficEncryptionMode == config.TrafficEncryptionModeIPSec { - mtu -= config.IPSecESPOverhead + if i.networkConfig.TrafficEncapMode.SupportsEncap() { + // See comment for ovsTunnelMaxMTU constant above. + if mtu > ovsTunnelMaxMTU { + mtu = ovsTunnelMaxMTU + } } + return mtu, nil } diff --git a/pkg/agent/config/node_config.go b/pkg/agent/config/node_config.go index ebba7f3da9e..59de8bdb8c6 100644 --- a/pkg/agent/config/node_config.go +++ b/pkg/agent/config/node_config.go @@ -34,12 +34,15 @@ const ( ) const ( - vxlanOverhead = 50 - geneveOverhead = 50 - greOverhead = 38 + vxlanOverhead = 50 + geneveOverhead = 50 + // GRE overhead: 14-byte outer MAC, 20-byte outer IPv4, 8-byte GRE header (4-byte standard header + 4-byte key field) + greOverhead = 42 + ipv6ExtraOverhead = 20 - WireGuardOverhead = 80 + // WireGuard overhead: 20-byte outer IPv4, 8-byte UDP header, 4-byte type, 4-byte key index, 8-byte nonce, 16-byte authentication tag + WireGuardOverhead = 60 // IPsec ESP can add a maximum of 38 bytes to the packet including the ESP // header and trailer. IPSecESPOverhead = 38 @@ -201,14 +204,19 @@ type NetworkConfig struct { TransportIfaceCIDRs []string IPv4Enabled bool IPv6Enabled bool - // MTUDeduction only counts IPv4 tunnel overhead, no IPsec and WireGuard overhead. + // MTUDeduction is the MTU deduction for encapsulation and encryption in cluster. MTUDeduction int + // WireGuardMTUDeduction is the MTU deduction for WireGuard encryption. + // It is calculated based on whether IPv6 is used. + WireGuardMTUDeduction int // Set by the defaultMTU config option or auto discovered. // Auto discovery will use MTU value of the Node's transport interface. // For Encap and Hybrid mode, InterfaceMTU will be adjusted to account for // encap header. - InterfaceMTU int - EnableMulticlusterGW bool + InterfaceMTU int + + EnableMulticlusterGW bool + MulticlusterEncryptionMode TrafficEncryptionModeType } // IsIPv4Enabled returns true if the cluster network supports IPv4. Legal cases are: @@ -264,24 +272,50 @@ func (nc *NetworkConfig) NeedsDirectRoutingToPeer(peerIP net.IP, localIP *net.IP return (nc.TrafficEncapMode == TrafficEncapModeNoEncap || nc.TrafficEncapMode == TrafficEncapModeHybrid) && localIP.Contains(peerIP) } +func (nc *NetworkConfig) getEncapMTUDeduction(isIPv6 bool) int { + var deduction int + if nc.TunnelType == ovsconfig.VXLANTunnel { + deduction = vxlanOverhead + } else if nc.TunnelType == ovsconfig.GeneveTunnel { + deduction = geneveOverhead + } else if nc.TunnelType == ovsconfig.GRETunnel { + deduction = greOverhead + } else { + return 0 + } + if isIPv6 { + deduction += ipv6ExtraOverhead + } + return deduction +} + func (nc *NetworkConfig) CalculateMTUDeduction(isIPv6 bool) int { - var mtuDeduction int - // When Multi-cluster Gateway is enabled, we need to reduce MTU for potential cross-cluster traffic. - if nc.TrafficEncapMode.SupportsEncap() || nc.EnableMulticlusterGW { - if nc.TunnelType == ovsconfig.VXLANTunnel { - mtuDeduction = vxlanOverhead - } else if nc.TunnelType == ovsconfig.GeneveTunnel { - mtuDeduction = geneveOverhead - } else if nc.TunnelType == ovsconfig.GRETunnel { - mtuDeduction = greOverhead - } + nc.WireGuardMTUDeduction = WireGuardOverhead + if isIPv6 { + nc.WireGuardMTUDeduction += ipv6ExtraOverhead } - if nc.TrafficEncapMode.SupportsEncap() && isIPv6 { - mtuDeduction += ipv6ExtraOverhead + if nc.EnableMulticlusterGW { + nc.MTUDeduction = nc.getEncapMTUDeduction(isIPv6) + // When multi-cluster WireGuard is enabled, cross-cluster traffic will be encapsulated and encrypted, we need to + // reduce MTU for both encapsulation and encryption. + if nc.MulticlusterEncryptionMode == TrafficEncryptionModeWireGuard { + nc.MTUDeduction += nc.WireGuardMTUDeduction + } + return nc.MTUDeduction + } + if nc.TrafficEncapMode.SupportsEncap() { + nc.MTUDeduction = nc.getEncapMTUDeduction(isIPv6) + } + if nc.TrafficEncryptionMode == TrafficEncryptionModeWireGuard { + // When WireGuard is enabled, cross-node traffic will only be encrypted, just reduce MTU for encryption. + nc.MTUDeduction = nc.WireGuardMTUDeduction + } else if nc.TrafficEncryptionMode == TrafficEncryptionModeIPSec { + // When IPsec is enabled, cross-node traffic will be encapsulated and encrypted, we need to reduce MTU for both + // encapsulation and encryption. + nc.MTUDeduction += IPSecESPOverhead } - nc.MTUDeduction = mtuDeduction - return mtuDeduction + return nc.MTUDeduction } // ServiceConfig includes K8s Service CIDR and available IP addresses for NodePort. diff --git a/pkg/agent/config/node_config_test.go b/pkg/agent/config/node_config_test.go index 081ce053c7f..bc9791a7468 100644 --- a/pkg/agent/config/node_config_test.go +++ b/pkg/agent/config/node_config_test.go @@ -298,7 +298,7 @@ func TestCalculateMTUDeduction(t *testing.T) { { name: "GRE encap without IPv6", nc: &NetworkConfig{TunnelType: ovsconfig.GRETunnel}, - expectedMTUDeduction: 38, + expectedMTUDeduction: 42, }, { name: "Default encap with IPv6", @@ -306,6 +306,52 @@ func TestCalculateMTUDeduction(t *testing.T) { isIPv6: true, expectedMTUDeduction: 70, }, + { + name: "WireGuard enabled", + nc: &NetworkConfig{TrafficEncryptionMode: TrafficEncryptionModeWireGuard}, + expectedMTUDeduction: 60, + }, + { + name: "IPv6 with WireGuard enabled", + nc: &NetworkConfig{TrafficEncryptionMode: TrafficEncryptionModeWireGuard}, + isIPv6: true, + expectedMTUDeduction: 80, + }, + { + name: "Multicluster enabled with Geneve encap", + nc: &NetworkConfig{TunnelType: ovsconfig.GeneveTunnel, EnableMulticlusterGW: true}, + expectedMTUDeduction: 50, + }, + { + name: "Geneve encap with Multicluster WireGuard enabled", + nc: &NetworkConfig{ + TunnelType: ovsconfig.GeneveTunnel, + EnableMulticlusterGW: true, + MulticlusterEncryptionMode: TrafficEncryptionModeWireGuard, + }, + expectedMTUDeduction: 110, + }, + { + name: "Geneve encap with IPSec enabled", + nc: &NetworkConfig{TunnelType: ovsconfig.GeneveTunnel, TrafficEncryptionMode: TrafficEncryptionModeIPSec}, + expectedMTUDeduction: 88, + }, + { + name: "Geneve encap with IPSec enabled and IPv6", + nc: &NetworkConfig{TunnelType: ovsconfig.GeneveTunnel, TrafficEncryptionMode: TrafficEncryptionModeIPSec}, + isIPv6: true, + expectedMTUDeduction: 108, + }, + { + name: "VXLan encap with IPSec enabled", + nc: &NetworkConfig{TunnelType: ovsconfig.VXLANTunnel, TrafficEncryptionMode: TrafficEncryptionModeIPSec}, + expectedMTUDeduction: 88, + }, + { + name: "GRE encap with IPSec enabled", + nc: &NetworkConfig{TunnelType: ovsconfig.GRETunnel, TrafficEncryptionMode: TrafficEncryptionModeIPSec}, + expectedMTUDeduction: 80, + }, } for _, tt := range tests { diff --git a/pkg/agent/multicluster/mc_route_controller.go b/pkg/agent/multicluster/mc_route_controller.go index 568371329d5..d13f1b52a10 100644 --- a/pkg/agent/multicluster/mc_route_controller.go +++ b/pkg/agent/multicluster/mc_route_controller.go @@ -129,7 +129,9 @@ func NewMCDefaultRouteController( controller.wireGuardConfig = &config.WireGuardConfig{ Port: multiclusterConfig.WireGuard.Port, Name: multiclusterWireGuardInterface, - MTU: controller.nodeConfig.NodeTransportInterfaceMTU - controller.networkConfig.MTUDeduction - config.WireGuardOverhead, + // Regardless of the tunnel type, the WireGuard device must only reduce MTU for encryption because the + // packets it transmits have been encapsulated. + MTU: nodeConfig.NodeTransportInterfaceMTU - networkConfig.WireGuardMTUDeduction, } } controller.gwInformer.Informer().AddEventHandlerWithResyncPeriod( diff --git a/pkg/agent/wireguard/client_linux.go b/pkg/agent/wireguard/client_linux.go index b34c4631952..4fa56348a16 100644 --- a/pkg/agent/wireguard/client_linux.go +++ b/pkg/agent/wireguard/client_linux.go @@ -52,6 +52,7 @@ var _ Interface = (*client)(nil) var ( linkAdd = netlink.LinkAdd linkSetUp = netlink.LinkSetUp + linkSetMTU = netlink.LinkSetMTU utilConfigureLinkAddresses = util.ConfigureLinkAddresses ) @@ -85,12 +86,18 @@ func New(nodeConfig *config.NodeConfig, wireGuardConfig *config.WireGuardConfig) func (client *client) Init(ipv4 net.IP, ipv6 net.IP) (string, error) { link := &netlink.Wireguard{LinkAttrs: netlink.LinkAttrs{Name: client.wireGuardConfig.Name, MTU: client.wireGuardConfig.MTU}} err := linkAdd(link) - // Ignore existing link as it may have already been created or managed by userspace process. - if err != nil && !errors.Is(err, unix.EEXIST) { - if errors.Is(err, unix.EOPNOTSUPP) { + if err != nil { + // Ignore existing link as it may have already been created or managed by userspace process, just ensure the MTU + // is set correctly. + if errors.Is(err, unix.EEXIST) { + if err := linkSetMTU(link, client.wireGuardConfig.MTU); err != nil { + return "", fmt.Errorf("failed to change WireGuard link MTU to %d: %w", client.wireGuardConfig.MTU, err) + } + } else if errors.Is(err, unix.EOPNOTSUPP) { return "", fmt.Errorf("WireGuard not supported by the Linux kernel (netlink: %w), make sure the WireGuard kernel module is loaded", err) + } else { + return "", err } - return "", err } if err := linkSetUp(link); err != nil { return "", err diff --git a/pkg/agent/wireguard/client_test.go b/pkg/agent/wireguard/client_test.go index cb10e6cda82..1062eb1c911 100644 --- a/pkg/agent/wireguard/client_test.go +++ b/pkg/agent/wireguard/client_test.go @@ -390,7 +390,8 @@ func Test_Init(t *testing.T) { tests := []struct { name string linkAddErr error - lindSetupErr error + linkSetUpErr error + linkSetMTUErr error utilConfigErr error expectedErr string extraIPv4 net.IP @@ -404,6 +405,16 @@ func Test_Init(t *testing.T) { linkAddErr: unix.EOPNOTSUPP, expectedErr: "WireGuard not supported by the Linux kernel (netlink: operation not supported), make sure the WireGuard kernel module is loaded", }, + { + name: "init successfully with unix.EEXIST error", + linkAddErr: unix.EEXIST, + }, + { + name: "failed to init due to linkSetMTU error", + linkAddErr: unix.EEXIST, + linkSetMTUErr: errors.New("link set mtu failed"), + expectedErr: "failed to change WireGuard link MTU to 1420: link set mtu failed", + }, { name: "failed to init due to link add error", linkAddErr: errors.New("link add failed"), @@ -411,7 +422,7 @@ func Test_Init(t *testing.T) { }, { name: "failed to init due to link setup error", - lindSetupErr: errors.New("link setup failed"), + linkSetUpErr: errors.New("link setup failed"), expectedErr: "link setup failed", }, { @@ -441,7 +452,10 @@ func Test_Init(t *testing.T) { return tt.linkAddErr } linkSetUp = func(link netlink.Link) error { - return tt.lindSetupErr + return tt.linkSetUpErr + } + linkSetMTU = func(link netlink.Link, mtu int) error { + return tt.linkSetMTUErr } utilConfigureLinkAddresses = func(idx int, ipNets []*net.IPNet) error { return tt.utilConfigErr diff --git a/test/e2e/antreaipam_test.go b/test/e2e/antreaipam_test.go index 5b3ab9acf2c..f337d8d0fed 100644 --- a/test/e2e/antreaipam_test.go +++ b/test/e2e/antreaipam_test.go @@ -267,16 +267,16 @@ func testAntreaIPAMPodConnectivitySameNode(t *testing.T, data *TestData) { }) workerNode := workerNodeName(1) - t.Logf("Creating %d agnhost Pods on '%s'", numPods+1, workerNode) + t.Logf("Creating %d toolbox Pods on '%s'", numPods+1, workerNode) for i := range podInfos { podInfos[i].os = clusterInfo.nodesOS[workerNode] - if err := data.createAgnhostPodOnNodeWithAnnotations(podInfos[i].name, podInfos[i].namespace, workerNode, nil); err != nil { - t.Fatalf("Error when creating agnhost test Pod '%s': %v", podInfos[i], err) + if err := data.createToolboxPodOnNode(podInfos[i].name, podInfos[i].namespace, workerNode, false); err != nil { + t.Fatalf("Error when creating toolbox test Pod '%s': %v", podInfos[i], err) } defer deletePodWrapper(t, data, podInfos[i].namespace, podInfos[i].name) } - data.runPingMesh(t, podInfos, agnhostContainerName) + data.runPingMesh(t, podInfos, toolboxContainerName, true) } func testAntreaIPAMPodConnectivityDifferentNodes(t *testing.T, data *TestData) { @@ -290,7 +290,7 @@ func testAntreaIPAMPodConnectivityDifferentNodes(t *testing.T, data *TestData) { } podInfos = append(podInfos, createdPodInfos...) } - data.runPingMesh(t, podInfos, agnhostContainerName) + data.runPingMesh(t, podInfos, toolboxContainerName, true) } func testAntreaIPAMStatefulSet(t *testing.T, data *TestData, dedicatedIPPoolKey *string) { diff --git a/test/e2e/connectivity_test.go b/test/e2e/connectivity_test.go index f6bbf5e199e..fb94e0337fc 100644 --- a/test/e2e/connectivity_test.go +++ b/test/e2e/connectivity_test.go @@ -93,7 +93,9 @@ func waitForPodIPs(t *testing.T, data *TestData, podInfos []podInfo) map[string] // runPingMesh runs a ping mesh between all the provided Pods after first retrieving their IP // addresses. -func (data *TestData) runPingMesh(t *testing.T, podInfos []podInfo, ctrname string) { +// When dontFragment is true, it will specify the packet size to the maximum value the MTU allows and set DF flag to +// validate the MTU is correct. +func (data *TestData) runPingMesh(t *testing.T, podInfos []podInfo, ctrname string, dontFragment bool) { podIPs := waitForPodIPs(t, data, podInfos) t.Logf("Ping mesh test between all Pods") @@ -110,7 +112,7 @@ func (data *TestData) runPingMesh(t *testing.T, podInfos []podInfo, ctrname stri if pi2.namespace != "" { pod2Namespace = pi2.namespace } - if err := data.runPingCommandFromTestPod(pi1, podNamespace, podIPs[pi2.name], ctrname, pingCount, 0); err != nil { + if err := data.runPingCommandFromTestPod(pi1, podNamespace, podIPs[pi2.name], ctrname, pingCount, 0, dontFragment); err != nil { t.Errorf("Ping '%s' -> '%s': ERROR (%v)", k8s.NamespacedName(podNamespace, pi1.name), k8s.NamespacedName(pod2Namespace, pi2.name), err) } else { t.Logf("Ping '%s' -> '%s': OK", k8s.NamespacedName(podNamespace, pi1.name), k8s.NamespacedName(pod2Namespace, pi2.name)) @@ -131,16 +133,16 @@ func (data *TestData) testPodConnectivitySameNode(t *testing.T) { workerNode = workerNodeName(clusterInfo.windowsNodes[0]) } - t.Logf("Creating %d agnhost Pods on '%s'", numPods, workerNode) + t.Logf("Creating %d toolbox Pods on '%s'", numPods, workerNode) for i := range podInfos { podInfos[i].os = clusterInfo.nodesOS[workerNode] - if err := data.createAgnhostPodOnNode(podInfos[i].name, data.testNamespace, workerNode, false); err != nil { - t.Fatalf("Error when creating agnhost test Pod '%s': %v", podInfos[i], err) + if err := data.createToolboxPodOnNode(podInfos[i].name, data.testNamespace, workerNode, false); err != nil { + t.Fatalf("Error when creating toolbox test Pod '%s': %v", podInfos[i], err) } defer deletePodWrapper(t, data, data.testNamespace, podInfos[i].name) } - data.runPingMesh(t, podInfos, agnhostContainerName) + data.runPingMesh(t, podInfos, toolboxContainerName, true) } // testPodConnectivityOnSameNode checks that Pods running on the same Node can reach each other, by @@ -185,13 +187,13 @@ func testHostPortPodConnectivity(t *testing.T, data *TestData) { data.testHostPortPodConnectivity(t, data.testNamespace, data.testNamespace) } -// createPodsOnDifferentNodes creates agnhost Pods through a DaemonSet. This function returns information of the created +// createPodsOnDifferentNodes creates toolbox Pods through a DaemonSet. This function returns information of the created // Pods as well as a function which will delete the Pods when called. Since Pods can be on Nodes of different OSes, podInfo // slice instead of PodName slice is used to inform caller of correct commands and options. Linux and Windows Pods are // alternating in this podInfo slice so that the test can cover different connectivity cases between different OSes. func createPodsOnDifferentNodes(t *testing.T, data *TestData, namespace, tag string) (podInfos []podInfo, cleanup func() error) { dsName := "connectivity-test" + tag - _, deleteDaemonSet, err := data.createDaemonSet(dsName, namespace, agnhostContainerName, agnhostImage, []string{"sleep", "3600"}, nil) + _, deleteDaemonSet, err := data.createDaemonSet(dsName, namespace, toolboxContainerName, toolboxImage, []string{"sleep", "3600"}, nil) if err != nil { t.Fatalf("Error when creating DaemonSet '%s': %v", dsName, err) } @@ -264,7 +266,7 @@ func (data *TestData) testPodConnectivityDifferentNodes(t *testing.T) { if len(podInfos) > maxPods { podInfos = podInfos[:maxPods] } - data.runPingMesh(t, podInfos[:numPods], agnhostContainerName) + data.runPingMesh(t, podInfos[:numPods], toolboxContainerName, true) } // testPodConnectivityDifferentNodes checks that Pods running on different Nodes can reach each @@ -315,11 +317,11 @@ func testPodConnectivityAfterAntreaRestart(t *testing.T, data *TestData, namespa podInfos, deletePods := createPodsOnDifferentNodes(t, data, namespace, "antrearestart") defer deletePods() - data.runPingMesh(t, podInfos[:numPods], agnhostContainerName) + data.runPingMesh(t, podInfos[:numPods], toolboxContainerName, true) data.redeployAntrea(t, deployAntreaDefault) - data.runPingMesh(t, podInfos[:numPods], agnhostContainerName) + data.runPingMesh(t, podInfos[:numPods], toolboxContainerName, true) } // testOVSRestartSameNode verifies that datapath flows are not removed when the Antrea Agent Pod is @@ -396,16 +398,16 @@ func testOVSFlowReplay(t *testing.T, data *TestData, namespace string) { } workerNode := workerNodeName(1) - t.Logf("Creating %d busybox test Pods on '%s'", numPods, workerNode) + t.Logf("Creating %d toolbox test Pods on '%s'", numPods, workerNode) for i := range podInfos { podInfos[i].os = clusterInfo.nodesOS[workerNode] - if err := data.createBusyboxPodOnNode(podInfos[i].name, namespace, workerNode, false); err != nil { - t.Fatalf("Error when creating busybox test Pod '%s': %v", podInfos[i].name, err) + if err := data.createToolboxPodOnNode(podInfos[i].name, namespace, workerNode, false); err != nil { + t.Fatalf("Error when creating toolbox test Pod '%s': %v", podInfos[i].name, err) } defer deletePodWrapper(t, data, namespace, podInfos[i].name) } - data.runPingMesh(t, podInfos, busyboxContainerName) + data.runPingMesh(t, podInfos, toolboxContainerName, true) var antreaPodName string var err error @@ -487,7 +489,7 @@ func testOVSFlowReplay(t *testing.T, data *TestData, namespace string) { // This should give Antrea ~10s to restore flows, since we generate 10 "pings" with a 1s // interval. t.Logf("Running second ping mesh to check that flows have been restored") - data.runPingMesh(t, podInfos, busyboxContainerName) + data.runPingMesh(t, podInfos, toolboxContainerName, true) flows2, groups2 := dumpFlows(), dumpGroups() numFlows2, numGroups2 := len(flows2), len(groups2) @@ -515,7 +517,7 @@ func testPingLargeMTU(t *testing.T, data *TestData) { pingSize := 2000 t.Logf("Running ping with size %d between Pods %s and %s", pingSize, podInfos[0].name, podInfos[1].name) - if err := data.runPingCommandFromTestPod(podInfos[0], data.testNamespace, podIPs[podInfos[1].name], agnhostContainerName, pingCount, pingSize); err != nil { + if err := data.runPingCommandFromTestPod(podInfos[0], data.testNamespace, podIPs[podInfos[1].name], toolboxContainerName, pingCount, pingSize, false); err != nil { t.Error(err) } } diff --git a/test/e2e/framework.go b/test/e2e/framework.go index 206ff776643..1d05cfafaf4 100644 --- a/test/e2e/framework.go +++ b/test/e2e/framework.go @@ -2149,18 +2149,35 @@ func parseArpingStdout(out string) (sent uint32, received uint32, loss float32, return sent, received, loss, nil } -func (data *TestData) runPingCommandFromTestPod(podInfo podInfo, ns string, targetPodIPs *PodIPs, ctrName string, count int, size int) error { +// RunPingCommandFromTestPod uses ping to check connectivity between the Pod and the given target Pod IPs. +// If dontFragment is true and size is 0, it will set the size to the maximum value allowed by the Pod's MTU. +func (data *TestData) runPingCommandFromTestPod(podInfo podInfo, ns string, targetPodIPs *PodIPs, ctrName string, count int, size int, dontFragment bool) error { if podInfo.os != "windows" && podInfo.os != "linux" { return fmt.Errorf("OS of Pod '%s' is not clear", podInfo.name) } + var sizeIPv4, sizeIPv6 int + // TODO: GetPodInterfaceMTU should work for Windows. + if dontFragment && size == 0 && podInfo.os == "linux" { + mtu, err := data.GetPodInterfaceMTU(ns, podInfo.name, ctrName) + if err != nil { + return fmt.Errorf("error when retrieving MTU of Pod '%s': %w", podInfo.name, err) + } + // 8 ICMP header, 20 IPv4 header, 40 IPv6 header + sizeIPv4 = mtu - 28 + sizeIPv6 = mtu - 48 + } else { + sizeIPv4 = size + sizeIPv6 = size + } + if targetPodIPs.ipv4 != nil { - cmdV4 := getPingCommand(count, size, podInfo.os, targetPodIPs.ipv4) + cmdV4 := getPingCommand(count, sizeIPv4, podInfo.os, targetPodIPs.ipv4, dontFragment) if stdout, stderr, err := data.RunCommandFromPod(ns, podInfo.name, ctrName, cmdV4); err != nil { return fmt.Errorf("error when running ping command '%s': %v - stdout: %s - stderr: %s", strings.Join(cmdV4, " "), err, stdout, stderr) } } if targetPodIPs.ipv6 != nil { - cmdV6 := getPingCommand(count, size, podInfo.os, targetPodIPs.ipv6) + cmdV6 := getPingCommand(count, sizeIPv6, podInfo.os, targetPodIPs.ipv6, dontFragment) if stdout, stderr, err := data.RunCommandFromPod(ns, podInfo.name, ctrName, cmdV6); err != nil { return fmt.Errorf("error when running ping command '%s': %v - stdout: %s - stderr: %s", strings.Join(cmdV6, " "), err, stdout, stderr) } @@ -2398,6 +2415,20 @@ func (data *TestData) GetTransportInterface() (string, error) { return "", fmt.Errorf("no interface was assigned with Node IP %s", nodeIP) } +func (data *TestData) GetPodInterfaceMTU(namespace string, podName string, containerName string) (int, error) { + cmd := []string{"cat", "/sys/class/net/eth0/mtu"} + stdout, stderr, err := data.RunCommandFromPod(namespace, podName, containerName, cmd) + if stdout == "" || stderr != "" || err != nil { + return 0, fmt.Errorf("failed to get interface MTU, stdout: %s, stderr: %s, err: %v", stdout, stderr, err) + } + + mtu, err := strconv.Atoi(strings.TrimSpace(stdout)) + if err != nil { + return 0, fmt.Errorf("failed to convert MTU to int: %v", err) + } + return mtu, nil +} + func (data *TestData) GetNodeMACAddress(node, device string) (string, error) { antreaPod, err := data.getAntreaPodOnNode(node) if err != nil { @@ -2756,10 +2787,6 @@ func (data *TestData) createAgnhostPodWithSAOnNode(name string, ns string, nodeN return NewPodBuilder(name, ns, agnhostImage).OnNode(nodeName).WithCommand([]string{"sleep", "3600"}).WithHostNetwork(hostNetwork).WithServiceAccountName(serviceAccountName).Create(data) } -func (data *TestData) createAgnhostPodOnNodeWithAnnotations(name string, ns string, nodeName string, annotations map[string]string) error { - return NewPodBuilder(name, ns, agnhostImage).OnNode(nodeName).WithCommand([]string{"sleep", "3600"}).WithAnnotations(annotations).Create(data) -} - func (data *TestData) createDaemonSet(name string, ns string, ctrName string, image string, cmd []string, args []string) (*appsv1.DaemonSet, func() error, error) { podSpec := corev1.PodSpec{ Tolerations: controlPlaneNoScheduleTolerations(), @@ -2976,7 +3003,7 @@ func (data *TestData) checkAntreaAgentInfo(interval time.Duration, timeout time. return err } -func getPingCommand(count int, size int, os string, ip *net.IP) []string { +func getPingCommand(count int, size int, os string, ip *net.IP, dontFragment bool) []string { countOption, sizeOption := "-c", "-s" if os == "windows" { countOption = "-n" @@ -2986,6 +3013,10 @@ func getPingCommand(count int, size int, os string, ip *net.IP) []string { if size != 0 { cmd = append(cmd, sizeOption, strconv.Itoa(size)) } + if dontFragment { + cmd = append(cmd, "-M", "do") + } + if ip.To4() != nil { cmd = append(cmd, "-4", ip.String()) } else { diff --git a/test/e2e/ipsec_test.go b/test/e2e/ipsec_test.go index 268a7976d01..24c148ffb6f 100644 --- a/test/e2e/ipsec_test.go +++ b/test/e2e/ipsec_test.go @@ -137,7 +137,9 @@ func testIPSecTunnelConnectivity(t *testing.T, data *TestData, certAuth bool) { podInfos, deletePods := createPodsOnDifferentNodes(t, data, data.testNamespace, tag) defer deletePods() t.Logf("Executing ping tests across Nodes: '%s' <-> '%s'", podInfos[0].nodeName, podInfos[1].nodeName) - data.runPingMesh(t, podInfos[:2], agnhostContainerName) + // PMTU is wrong when using GRE+IPsec with some Linux kernel versions, do not set DF to work around. + // See https://github.com/antrea-io/antrea/issues/5922 for more details. + data.runPingMesh(t, podInfos[:2], toolboxContainerName, false) // Check that there is at least one 'up' Security Association on the Node nodeName := podInfos[0].nodeName diff --git a/test/e2e/traceflow_test.go b/test/e2e/traceflow_test.go index db675ba94e6..687c0b53f4c 100644 --- a/test/e2e/traceflow_test.go +++ b/test/e2e/traceflow_test.go @@ -1187,7 +1187,7 @@ func testTraceflowInterNode(t *testing.T, data *TestData) { podInfos[1].name = node2Pods[2] podInfos[1].namespace = data.testNamespace podInfos[1].os = "windows" - data.runPingMesh(t, podInfos, agnhostContainerName) + data.runPingMesh(t, podInfos, agnhostContainerName, false) } // Setup 2 NetworkPolicies: @@ -2429,7 +2429,7 @@ func runTestTraceflow(t *testing.T, data *TestData, tc testcase) { // Give a little time for Nodes to install OVS flows. time.Sleep(time.Second * 2) // Send an ICMP echo packet from the source Pod to the destination. - if err := data.runPingCommandFromTestPod(podInfo{srcPod, osString, "", ""}, data.testNamespace, dstPodIPs, agnhostContainerName, 2, 0); err != nil { + if err := data.runPingCommandFromTestPod(podInfo{srcPod, osString, "", ""}, data.testNamespace, dstPodIPs, agnhostContainerName, 2, 0, false); err != nil { t.Logf("Ping '%s' -> '%v' failed: ERROR (%v)", srcPod, *dstPodIPs, err) } } diff --git a/test/e2e/upgrade_test.go b/test/e2e/upgrade_test.go index 255e8153af5..df98aed402c 100644 --- a/test/e2e/upgrade_test.go +++ b/test/e2e/upgrade_test.go @@ -54,9 +54,9 @@ func TestUpgrade(t *testing.T) { nodeName := nodeName(0) podName := randName("test-pod-") - t.Logf("Creating a busybox test Pod on '%s'", nodeName) - if err := data.createBusyboxPodOnNode(podName, data.testNamespace, nodeName, false); err != nil { - t.Fatalf("Error when creating busybox test Pod: %v", err) + t.Logf("Creating a toolbox test Pod on '%s'", nodeName) + if err := data.createToolboxPodOnNode(podName, data.testNamespace, nodeName, false); err != nil { + t.Fatalf("Error when creating toolbox test Pod: %v", err) } if err := data.podWaitForRunning(defaultTimeout, podName, data.testNamespace); err != nil { t.Fatalf("Error when waiting for Pod '%s' to be in the Running state", podName) diff --git a/test/e2e/vmagent_test.go b/test/e2e/vmagent_test.go index 884c81d7c8f..048806e5a75 100644 --- a/test/e2e/vmagent_test.go +++ b/test/e2e/vmagent_test.go @@ -652,7 +652,7 @@ func createANPWithFQDN(t *testing.T, data *TestData, name string, namespace stri func runPingCommandOnVM(data *TestData, dstVM vmInfo, connected bool) error { dstIP := net.ParseIP(dstVM.ip) - cmd := getPingCommand(pingCount, 0, strings.ToLower(linuxOS), &dstIP) + cmd := getPingCommand(pingCount, 0, strings.ToLower(linuxOS), &dstIP, false) cmdStr := strings.Join(cmd, " ") expCount := pingCount if !connected { diff --git a/test/e2e/wireguard_test.go b/test/e2e/wireguard_test.go index 7e4d3fc10e7..bb12fa26095 100644 --- a/test/e2e/wireguard_test.go +++ b/test/e2e/wireguard_test.go @@ -69,7 +69,7 @@ func testPodConnectivity(t *testing.T, data *TestData) { podInfos, deletePods := createPodsOnDifferentNodes(t, data, data.testNamespace, "differentnodes") defer deletePods() numPods := 2 - data.runPingMesh(t, podInfos[:numPods], agnhostContainerName) + data.runPingMesh(t, podInfos[:numPods], toolboxContainerName, true) // Make sure that route to Pod on peer Node and route to peer gateway is targeting the WireGuard device. srcPod, err := data.getAntreaPodOnNode(nodeName(0))