diff --git a/cmd/sriov/main.go b/cmd/sriov/main.go index c31ee5b5..1281fde3 100644 --- a/cmd/sriov/main.go +++ b/cmd/sriov/main.go @@ -5,6 +5,7 @@ import ( "fmt" "runtime" "strings" + "time" "github.com/containernetworking/cni/pkg/skel" "github.com/containernetworking/cni/pkg/types" @@ -126,6 +127,8 @@ func cmdAdd(args *skel.CmdArgs) error { result.Interfaces[0].Mtu = *netConf.MTU } + doAnnounce := false + // run the IPAM plugin if netConf.IPAM.Type != "" { var r types.Result @@ -161,31 +164,12 @@ func cmdAdd(args *skel.CmdArgs) error { if !netConf.DPDKMode { err = netns.Do(func(_ ns.NetNS) error { - err := ipam.ConfigureIface(args.IfName, newResult) - if err != nil { - return err - } - - /* After IPAM configuration is done, the following needs to handle the case of an IP address being reused by a different pods. - * This is achieved by sending Gratuitous ARPs and/or Unsolicited Neighbor Advertisements unconditionally. - * Although we set arp_notify and ndisc_notify unconditionally on the interface (please see EnableArpAndNdiscNotify()), the kernel - * only sends GARPs/Unsolicited NA when the interface goes from down to up, or when the link-layer address changes on the interfaces. - * These scenarios are perfectly valid and recommended to be enabled for optimal network performance. - * However for our specific case, which the kernel is unaware of, is the reuse of IP addresses across pods where each pod has a different - * link-layer address for it's SRIOV interface. The ARP/Neighbor cache residing in neighbors would be invalid if an IP address is reused. - * In order to update the cache, the GARP/Unsolicited NA packets should be sent for performance reasons. Otherwise, the neighbors - * may be sending packets with the incorrect link-layer address. Eventually, most network stacks would send ARPs and/or Neighbor - * Solicitation packets when the connection is unreachable. This would correct the invalid cache; however this may take a significant - * amount of time to complete. - * - * The error is ignored here because enabling this feature is only a performance enhancement. - */ - _ = utils.AnnounceIPs(args.IfName, newResult.IPs) - return nil + return ipam.ConfigureIface(args.IfName, newResult) }) if err != nil { return err } + doAnnounce = true } result = newResult } @@ -209,6 +193,32 @@ func cmdAdd(args *skel.CmdArgs) error { return fmt.Errorf("error saving the pci allocation for vf pci address %s: %v", netConf.DeviceID, err) } + if doAnnounce { + _ = netns.Do(func(_ ns.NetNS) error { + /* After IPAM configuration is done, the following needs to handle the case of an IP address being reused by a different pods. + * This is achieved by sending Gratuitous ARPs and/or Unsolicited Neighbor Advertisements unconditionally. + * Although we set arp_notify and ndisc_notify unconditionally on the interface (please see EnableArpAndNdiscNotify()), the kernel + * only sends GARPs/Unsolicited NA when the interface goes from down to up, or when the link-layer address changes on the interfaces. + * These scenarios are perfectly valid and recommended to be enabled for optimal network performance. + * However for our specific case, which the kernel is unaware of, is the reuse of IP addresses across pods where each pod has a different + * link-layer address for it's SRIOV interface. The ARP/Neighbor cache residing in neighbors would be invalid if an IP address is reused. + * In order to update the cache, the GARP/Unsolicited NA packets should be sent for performance reasons. Otherwise, the neighbors + * may be sending packets with the incorrect link-layer address. Eventually, most network stacks would send ARPs and/or Neighbor + * Solicitation packets when the connection is unreachable. This would correct the invalid cache; however this may take a significant + * amount of time to complete. + */ + + /* The interface might not yet have carrier. Wait for it for a short time. */ + hasCarrier := utils.WaitForCarrier(args.IfName, 200*time.Millisecond) + + /* The error is ignored here because enabling this feature is only a performance enhancement. */ + err := utils.AnnounceIPs(args.IfName, result.IPs) + + logging.Debug("announcing IPs", "hasCarrier", hasCarrier, "IPs", result.IPs, "announceError", err) + return nil + }) + } + return types.PrintResult(result, netConf.CNIVersion) } diff --git a/pkg/utils/netlink_manager.go b/pkg/utils/netlink_manager.go index b4481acb..165f0e7d 100644 --- a/pkg/utils/netlink_manager.go +++ b/pkg/utils/netlink_manager.go @@ -30,6 +30,8 @@ type MyNetlink struct { NetlinkManager } +var netLinkLib NetlinkManager = &MyNetlink{} + // LinkByName implements NetlinkManager func (n *MyNetlink) LinkByName(name string) (netlink.Link, error) { return netlink.LinkByName(name) diff --git a/pkg/utils/packet.go b/pkg/utils/packet.go index 7628dc82..6f0a5a1e 100644 --- a/pkg/utils/packet.go +++ b/pkg/utils/packet.go @@ -6,11 +6,13 @@ import ( "fmt" "net" "syscall" + "time" current "github.com/containernetworking/cni/pkg/types/100" "github.com/vishvananda/netlink" "golang.org/x/net/icmp" "golang.org/x/net/ipv6" + "golang.org/x/sys/unix" ) var ( @@ -164,10 +166,8 @@ func SendUnsolicitedNeighborAdvertisement(srcIP net.IP, linkObj netlink.Link) er // AnnounceIPs sends either a GARP or Unsolicited NA depending on the IP address type (IPv4 vs. IPv6 respectively) configured on the interface. func AnnounceIPs(ifName string, ipConfigs []*current.IPConfig) error { - myNetLink := MyNetlink{} - // Retrieve the interface name in the container. - linkObj, err := myNetLink.LinkByName(ifName) + linkObj, err := netLinkLib.LinkByName(ifName) if err != nil { return fmt.Errorf("failed to get netlink device with name %q: %v", ifName, err) } @@ -196,3 +196,36 @@ func AnnounceIPs(ifName string, ipConfigs []*current.IPConfig) error { } return nil } + +// Blocking wait for interface ifName to have carrier (!NO_CARRIER flag). +func WaitForCarrier(ifName string, waitTime time.Duration) bool { + var nextSleepDuration time.Duration + + start := time.Now() + + for nextSleepDuration == 0 || time.Since(start) < waitTime { + if nextSleepDuration == 0 { + nextSleepDuration = 2 * time.Millisecond + } else { + time.Sleep(nextSleepDuration) + /* Grow wait time exponentionally (factor 1.5). */ + nextSleepDuration += nextSleepDuration / 2 + } + + linkObj, err := netLinkLib.LinkByName(ifName) + if err != nil { + return false + } + + /* Wait for carrier, i.e. IFF_UP|IFF_RUNNING. Note that there is also + * IFF_LOWER_UP, but we follow iproute2 ([1]). + * + * [1] https://git.kernel.org/pub/scm/network/iproute2/iproute2.git/tree/ip/ipaddress.c?id=f9601b10c21145f76c3d46c163bac39515ed2061#n86 + */ + if linkObj.Attrs().RawFlags&(unix.IFF_UP|unix.IFF_RUNNING) == (unix.IFF_UP | unix.IFF_RUNNING) { + return true + } + } + + return false +} diff --git a/pkg/utils/packet_test.go b/pkg/utils/packet_test.go new file mode 100644 index 00000000..0e9227c5 --- /dev/null +++ b/pkg/utils/packet_test.go @@ -0,0 +1,52 @@ +package utils + +import ( + "sync/atomic" + "time" + + mocks_utils "github.com/k8snetworkplumbingwg/sriov-cni/pkg/utils/mocks" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/stretchr/testify/mock" + + "github.com/vishvananda/netlink" + "golang.org/x/sys/unix" +) + +var _ = Describe("Packets", func() { + + Context("WaitForCarrier", func() { + It("should wait until the link has IFF_UP flag", func() { + DeferCleanup(func(old NetlinkManager) { netLinkLib = old }, netLinkLib) + + mockedNetLink := &mocks_utils.NetlinkManager{} + netLinkLib = mockedNetLink + + rawFlagsAtomic := new(uint32) + *rawFlagsAtomic = unix.IFF_UP + + fakeLink := &FakeLink{LinkAttrs: netlink.LinkAttrs{ + Index: 1000, + Name: "dummylink", + RawFlags: atomic.LoadUint32(rawFlagsAtomic), + }} + + mockedNetLink.On("LinkByName", "dummylink").Return(fakeLink, nil).Run(func(args mock.Arguments) { + fakeLink.RawFlags = atomic.LoadUint32(rawFlagsAtomic) + }) + + hasCarrier := make(chan bool) + go func() { + hasCarrier <- WaitForCarrier("dummylink", 5*time.Second) + }() + + Consistently(hasCarrier, "100ms").ShouldNot(Receive()) + + go func() { + atomic.StoreUint32(rawFlagsAtomic, unix.IFF_UP|unix.IFF_RUNNING) + }() + + Eventually(hasCarrier, "300ms").Should(Receive()) + }) + }) +})