From 46d475fcb916d14a9615b64a0363df57ac4b6833 Mon Sep 17 00:00:00 2001
From: Hongliang Liu
Date: Fri, 4 Aug 2023 16:36:06 +0800
Subject: [PATCH] Update OVS pipeline document

Signed-off-by: Hongliang Liu
---
 docs/assets/ovs-pipeline-antrea-proxy.svg | 4835 -----------------
 docs/assets/ovs-pipeline.svg              | 5997 +++++++--------------
 docs/design/ovs-pipeline.md               | 2052 ++++---
 pkg/agent/openflow/fields.go              |    8 +-
 4 files changed, 3019 insertions(+), 9873 deletions(-)
 delete mode 100644 docs/assets/ovs-pipeline-antrea-proxy.svg

diff --git a/docs/assets/ovs-pipeline-antrea-proxy.svg b/docs/assets/ovs-pipeline-antrea-proxy.svg
deleted file mode 100644
index 7016a665305..00000000000
--- a/docs/assets/ovs-pipeline-antrea-proxy.svg
+++ /dev/null
@@ -1,4835 +0,0 @@
[SVG markup omitted: the old "ovs-pipeline-antrea-proxy" diagram is deleted in its entirety.]

diff --git a/docs/assets/ovs-pipeline.svg b/docs/assets/ovs-pipeline.svg
index c60576a18e1..28e7cdca320 100644
--- a/docs/assets/ovs-pipeline.svg
+++ b/docs/assets/ovs-pipeline.svg
style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> - - - + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> - - - - - - - - - + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> - - - - - - - - - - - - - - - - - - + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + style="overflow:visible" + inkscape:isstock="true" + viewBox="0 0 17.773854 10.156488" + markerWidth="17.773853" + markerHeight="10.156488" + preserveAspectRatio="xMidYMid"> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + style="overflow:visible" + inkscape:isstock="true" + viewBox="0 0 17.773854 10.156488" + markerWidth="17.773853" + markerHeight="10.156488" + preserveAspectRatio="xMidYMid"> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + style="overflow:visible" + inkscape:isstock="true" + viewBox="0 0 17.773854 10.156488" + markerWidth="17.773853" + markerHeight="10.156488" + preserveAspectRatio="xMidYMid"> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + style="overflow:visible" + inkscape:isstock="true" + viewBox="0 0 17.773854 10.156488" + markerWidth="17.773853" + markerHeight="10.156488" + preserveAspectRatio="xMidYMid"> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + style="overflow:visible" + inkscape:isstock="true" + viewBox="0 0 17.773854 10.156488" + markerWidth="17.773853" + markerHeight="10.156488" + preserveAspectRatio="xMidYMid"> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + 
style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + style="overflow:visible" + inkscape:isstock="true" + viewBox="0 0 17.773854 10.156488" + markerWidth="17.773853" + markerHeight="10.156488" + preserveAspectRatio="xMidYMid"> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + viewBox="0 0 17.773854 10.156488" + markerWidth="17.773853" + markerHeight="10.156488" + preserveAspectRatio="xMidYMid"> @@ -610,26 +490,25 @@ orient="auto" refY="0" refX="0" - id="marker1534-5" + id="marker1488-1" style="overflow:visible" inkscape:isstock="true"> @@ -639,3886 +518,1718 @@ orient="auto" refY="0" refX="0" - id="marker5914-9-9" + id="marker1644-8" style="overflow:visible" - inkscape:isstock="true" - inkscape:collect="always"> + inkscape:isstock="true"> - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + id="path1642-1" + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + + - - - - - + id="path1642-1-3-0-7-49" + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" 
[SVG markup changes omitted: docs/assets/ovs-pipeline.svg is redrawn for the updated pipeline; the only text labels in the diagram are "ARP packets" and "IP packets".]

diff --git a/docs/design/ovs-pipeline.md b/docs/design/ovs-pipeline.md
index 5188cfe43f5..ac5f04a69b9 100644
--- a/docs/design/ovs-pipeline.md
+++ b/docs/design/ovs-pipeline.md
@@ -2,8 +2,7 @@
 ## Terminology
 
-* *Node Route Controller*: the [K8s
-  controller](https://kubernetes.io/docs/concepts/architecture/controller/)
+* *Node Route Controller*: the [K8s controller](https://kubernetes.io/docs/concepts/architecture/controller/)
   which is part of the Antrea Agent and watches for updates to Nodes. When a
   Node is added, it updates the local networking configuration (e.g. configure
   the tunnel to the new Node). When a Node is deleted, it performs the necessary
@@ -54,75 +53,140 @@
 
 **This document currently makes the following assumptions:**
 
-* Antrea is used in encap mode (an overlay network is created between all Nodes)
-* All the Nodes are Linux Nodes
-* IPv6 is disabled
-* AntreaProxy is enabled
-* AntreaPolicy is enabled
+* Antrea is deployed with the default configuration and feature gates.
+* IPv4 only.
 
-## Dumping the Flows
+## Dumping the Flows / Groups
 
-This guide includes a representative flow dump for every table in the pipeline,
-in order to illustrate the function of each table. If you have a cluster running
-Antrea, you can dump the flows for a given Node as follows:
+This guide includes a representative flow dump for every table in the pipeline, in order to illustrate the function of
+each table. If you have a cluster running Antrea, you can dump the flows or groups on a given Node as follows:
 
 ```bash
-kubectl exec -n kube-system <antrea-agent-pod-name> -c antrea-ovs -- ovs-ofctl dump-flows <bridge-name> [--no-stats] [--names]
+# Dump all flows.
+kubectl exec -n kube-system <antrea-agent-pod-name> -c antrea-ovs -- ovs-ofctl dump-flows <bridge-name> -O Openflow15 [--no-stats] [--names]
+
+# Dump all groups.
+kubectl exec -n kube-system <antrea-agent-pod-name> -c antrea-ovs -- ovs-ofctl dump-groups <bridge-name> -O Openflow15 [--no-stats] [--names]
+```
+
+where `<antrea-agent-pod-name>` is the name of the Antrea Agent Pod running on that Node, and `<bridge-name>` is the name
+of the bridge created by Antrea (`br-int` by default).
+
+You can also dump the flows of a specific table, or dump a specific group, on a given Node as follows:
+
+```bash
+# Dump flows of a table.
+kubectl exec -n kube-system <antrea-agent-pod-name> -c antrea-ovs -- ovs-ofctl dump-flows <bridge-name> table=<table-name> -O Openflow15 [--no-stats] [--names]
+
+# Dump a group.
+kubectl exec -n kube-system <antrea-agent-pod-name> -c antrea-ovs -- ovs-ofctl dump-groups <bridge-name> <group-id> -O Openflow15 [--no-stats] [--names]
 ```
 
-where `<antrea-agent-pod-name>` is the name of the Antrea Agent Pod running on
-that Node and `<bridge-name>` is the name of the bridge created by Antrea
-(`br-int` by default).
+where `<table-name>` is the name of a table in the pipeline, and `<group-id>` is the ID of a group.
 
 ## Registers
 
-We use 2 32-bit OVS registers to carry information throughout the pipeline:
-
-* reg0 (NXM_NX_REG0):
-  - bits [0..3] are used to store the traffic source (from tunnel: 0, from
-    local gateway: 1, from local Pod: 2). It is set in [ClassifierTable].
-  - bit 16 is used to indicate whether the destination MAC address of a packet
-    is "known", i.e. corresponds to an entry in [L2ForwardingCalcTable], which
-    is essentially a "dmac" table.
-  - bit 18 is used to indicate whether the packet should be output to the port
-    on which it was received. It is consumed in [L2ForwardingOutTable]
-    to output the packet with action `IN_PORT`.
-  - bit 19 is used to indicate whether the destination and source MACs of the
-    packet should be rewritten in [l3ForwardingTable]. The bit is set for
-    packets received from the tunnel port in [ClassifierTable]. The
-    destination MAC of such packets is the Global Virtual MAC and should be
-    rewritten to the destination port's MAC before output to the port. When such
-    a packet is destined to a Pod, its source MAC should be rewritten to the
-    local gateway port's MAC too.
-* reg1 (NXM_NX_REG1): it is used to store the egress OF port for the packet. It
-  is set in [DNATTable] for traffic destined to Services and in
-  [L2ForwardingCalcTable] otherwise. It is consumed in [L2ForwardingOutTable] to
-  output each packet to the correct port.
-* reg3 (NXM_NX_REG3): it is used to store selected Service Endpoint IPv4 address
-  in OVS group entry. It is consumed in [EndpointDNATTable].
-* reg4 (NXM_NX_REG4):
-  * bits [0..16] are used to store selected Service Endpoint port number in OVS
-    group entry. They are consumed in [EndpointDNATTable].
-  * bits [17..18] are used to store the state of a Service request packet.
-    Marks in this field include,
-    * 0b001: packet needs to do Endpoint selection.
-    * 0b010: packet has done Endpoint selection.
-    * 0b011: packet has done Endpoint selection and the selection result needs to
-      be cached.
-
-## Network Policy Implementation
-
-Several tables of the pipeline are dedicated to [K8s Network
-Policy](https://kubernetes.io/docs/concepts/services-networking/network-policies/)
-implementation ([EgressRuleTable], [EgressDefaultTable], [IngressRuleTable] and
-[IngressDefaultTable]).
-
-The Antrea implementation of K8s Network Policy, including the communication
-channel between the Controller and Agents, and how a Network Policy is mapped to
-OVS flows at each Node, will be described in details in a separate document. For
-the present document, we will use the Network Policy example below, and explain
-how these simple ingress and egress rules map to individual flows as we describe
-the relevant tables of our pipeline.
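In addition to dumping flows and groups, it can help to trace how a specific packet would traverse the pipeline while
reading the sections below. The following is only a sketch using `ovs-appctl ofproto/trace`: the port name, Pod IP, and
Service IP shown are hypothetical and should be replaced with values from your own cluster.

```bash
# Trace a TCP packet from a local Pod (assumed OVS port name "nginx-d9-cd1533",
# assumed IP 10.10.0.7) to a ClusterIP Service (assumed IP 10.96.76.15, port 80).
# The output lists each table the packet visits, together with the register and
# conntrack actions applied along the way.
kubectl exec -n kube-system <antrea-agent-pod-name> -c antrea-ovs -- \
  ovs-appctl ofproto/trace br-int \
  'in_port=nginx-d9-cd1533,tcp,nw_src=10.10.0.7,nw_dst=10.96.76.15,tp_dst=80'
```

Because the trace is evaluated against the live flow tables, the exact output depends on the Services and policies
installed on that Node.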
+We use some OVS registers to carry information throughout the pipeline. + +| Register | Field Range | Field Name | RegMark Value | RegMark Name | Description | +|---------------|-------------|---------------------------|---------------|---------------------------------|------------------------------------------------------------------------------------------| +| NXM_NX_REG0 | bits 0-3 | PktSourceField | 0x1 | FromTunnelRegMark | Packet source is tunnel port. | +| | | | 0x2 | FromGatewayRegMark | Packet source is Antrea gateway port. | +| | | | 0x3 | FromLocalRegMark | Packet source is local Pod. | +| | | | 0x4 | FromUplinkRegMark | Packet source is uplink port. | +| | | | 0x5 | FromBridgeRegMark | Packet source is local bridge port. | +| | | | 0x6 | FromTCReturnRegMark | Packet source is TrafficControl return port. | +| | bits 4-7 | PktDestinationField | 0x1 | ToTunnelRegMark | Packet destination is tunnel port. | +| | | | 0x2 | ToGatewayRegMark | Packet destination is local Antrea gateway port. | +| | | | 0x3 | ToLocalRegMark | Packet destination is local Pod. | +| | | | 0x4 | ToUplinkRegMark | Packet destination is uplink port. | +| | | | 0x5 | ToBridgeRegMark | Packet destination is local bridge port. | +| | bit 9 | | 0b0 | NotRewriteMACRegMark | Packet's source / destination MAC address does not need to be rewritten. | +| | | | 0b1 | RewriteMACRegMark | Packet's source / destination MAC address needs to be rewritten. | +| | bit 10 | | 0b1 | APDenyRegMark | Packet denied (Drop / Reject) by Antrea NetworkPolicy. | +| | bits 11-12 | APDispositionField | 0b00 | DispositionAllowRegMark | Indicates Antrea NetworkPolicy disposition: allow. | +| | | | 0b01 | DispositionDropRegMark | Indicates Antrea NetworkPolicy disposition: drop. | +| | | | 0b11 | DispositionPassRegMark | Indicates Antrea NetworkPolicy disposition: pass. | +| | bit 13 | | 0b1 | GeneratedRejectPacketOutRegMark | Indicates packet is a generated reject response packet-out. | +| | bit 14 | | 0b1 | SvcNoEpRegMark | Indicates packet towards a Service without Endpoint (used by AntreaProxy). | +| | bit 19 | | 0b1 | RemoteSNATRegMark | Indicates packet needs SNAT on a remote Node (used by Egress). | +| | bit 22 | | 0b1 | L7NPRedirectRegMark | Indicates L7 Antrea NetworkPolicy disposition of redirect. | +| | bits 21-22 | OutputRegField | 0b01 | OutputToOFPortRegMark | Output packet to an OVS port. | +| | | | 0b10 | OutputToControllerRegMark | Send packet to Antrea Agent. | +| | bits 25-32 | PacketInOperationField | 0b00000001 | PacketInNPLoggingOperation | Indicates packet needs logging for NetworkPolicy packetIn operation. | +| | | | 0b00000010 | PacketInNPRejectOperation | Indicates packet should be rejected for NetworkPolicy packetIn operation. | +| | | | 0b00000100 | PacketInNPStoreDenyOperation | Indicates the corresponding connection has been dropped or rejected. | +| NXM_NX_REG1 | bits 0-31 | TargetOFPortField | | | Egress OVS port of packet. | +| NXM_NX_REG2 | bits 0-31 | SwapField | | | Swap values in flow fields in OpenFlow actions. | +| | | PacketInTableField | | | OVS table where it was decided to send packet to controller (Antrea Agent). | +| NXM_NX_REG3 | bits 0-31 | EndpointIPField | | | Field to store IPv4 address of selected Service Endpoint. | +| | bits 0-31 | APConjIDField | | | Field to store Conjunction ID for Antrea Policy. | +| NXM_NX_REG4 | bits 0-15 | EndpointPortField | | | Field store TCP/UDP/SCTP port of a Service's selected Endpoint. 
| +| | bits 16-18 | ServiceEPStateField | 0b001 | EpToSelectRegMark | Packet needs to do Service Endpoint selection. | +| | bits 16-18 | ServiceEPStateField | 0b010 | EpSelectedRegMark | Packet has done Service Endpoint selection. | +| | bits 16-18 | ServiceEPStateField | 0b011 | EpToLearnRegMark | Packet has done Service Endpoint selection and the selected Endpoint needs to be cached. | +| | bits 0-18 | EpUnionField | | | The union value of EndpointPortField and ServiceEPStateField. | +| | bit 19 | | 0b1 | ToNodePortAddressRegMark | Packet is destined to a Service of type NodePort. | +| | bit 20 | | 0b1 | AntreaFlexibleIPAMRegMark | Packet is from local Antrea IPAM Pod. | +| | bit 20 | | 0b0 | NotAntreaFlexibleIPAMRegMark | Packet is not from local Antrea IPAM Pod. | +| | bit 21 | | 0b1 | ToExternalAddressRegMark | Packet is destined to a Service's external IP. | +| | bits 22-23 | TrafficControlActionField | 0b01 | TrafficControlMirrorRegMark | Indicates packet needs to be mirrored (used by TrafficControl). | +| | | | 0b10 | TrafficControlRedirectRegMark | Indicates packet needs to be redirected (used by TrafficControl). | +| | bit 24 | | 0b1 | NestedServiceRegMark | Packet is destined to a Service which is using other other Service as Endpoints. | +| | bit 25 | | 0b1 | DSRServiceRegMark | Packet is destined to a Service working in DSR mode. | +| | | | 0b0 | NotDSRServiceRegMark | Packet is destined to a Service working not in DSR mode. | +| | bit 26 | | 0b1 | RemoteEndpointRegMark | Packet is destined to a Service selecting a remote non-hostNetwork Endpoint. | +| | bit 27 | | 0b1 | FromExternalRegMark | Packet is from Antrea gateway, but its source IP is not the gateway IP. | +| NXM_NX_REG5 | bits 0-31 | TFEgressConjIDField | | | Egress conjunction ID hit by TraceFlow packet. | +| NXM_NX_REG6 | bits 0-31 | TFIngressConjIDField | | | Ingress conjunction ID hit by TraceFlow packet. | +| NXM_NX_REG7 | bits 0-31 | ServiceGroupIDField | | | GroupID corresponding to the Service. | +| NXM_NX_REG8 | bits 0-11 | VLANIDField | | | VLAN ID. | +| | bits 12-15 | CtZoneTypeField | 0b0001 | IPCtZoneTypeRegMark | Ct zone type is IPv4. | +| | | | 0b0011 | IPv6CtZoneTypeRegMark | Ct zone type is IPv6. | +| | bits 0-15 | CtZoneField | | | Ct zone ID which is a combination of VLANIDField and CtZoneTypeField. | +| NXM_NX_XXREG3 | bits 0-127 | EndpointIP6Field | | | Field to store IPv6 address of selected Service Endpoint. | + +Note that, regmarks that have overlapped bits will not be used at the same time, like `SwapField` and `PacketInTableField`. + +## CT Marks + +| Field Range | Field Name | CT Mark Value | CT Mark Name | Description | +|-------------|-----------------------|---------------|--------------------|-----------------------------------------------------------------| +| bits 0-3 | ConnSourceCTMarkField | 0b0010 | FromGatewayCTMark | Connection source is Antrea gateway port. | +| | | 0b0101 | FromBridgeCTMark | Connection source is local bridge port. | +| bit 4 | | 0b1 | ServiceCTMark | Connection is for Service. | +| | | 0b0 | NotServiceCTMark | Connection is not for Service. | +| bit 5 | | 0b1 | ConnSNATCTMark | SNAT is performed on the connection for Service. | +| bit 6 | | 0b1 | HairpinCTMark | Hairpin connection. | +| bit 7 | | 0b1 | L7NPRedirectCTMark | Connection should be redirected to an application-aware engine. 
| + +## CT Labels + +| Field Range | Field Name | Description | +|-------------|-----------------------|------------------------------------| +| bits 0-31 | IngressRuleCTLabel | Ingress rule ID. | +| bits 32-63 | EgressRuleCTLabel | Egress rule ID. | +| bits 64-75 | L7NPRuleVlanIDCTLabel | VLAN ID for L7 NetworkPolicy rule. | + +## CT Zones + +| Zone ID | Zone Name | Description | +|---------|--------------|----------------------------------------------------| +| 65520 | CtZone | Tracking IPv4 connections that don't require SNAT. | +| 65510 | CtZoneV6 | Tracking IPv6 connections that don't require SNAT. | +| 65521 | SNATCtZone | Tracking IPv4 connections that require SNAT. | +| 65511 | SNATCtZoneV6 | Tracking IPv6 connections that require SNAT. | | + +## Kubernetes NetworkPolicy Implementation + +Several tables of the pipeline are dedicated to [Kubernetes +NetworkPolicy](https://kubernetes.io/docs/concepts/services-networking/network-policies/) implementation (table +[EgressRule], [EgressDefault], [IngressRule] and [IngressDefault]). + +The Antrea implementation of Kubernetes NetworkPolicy, including the communication channel between the AntreaController +and AntreaAgents, and how a Kubernetes NetworkPolicy is mapped to OVS flows at each Node, will be described in details +in a separate document. For the present document, we will use the Kubernetes NetworkPolicy example below, and explain +how these simple ingress and egress rules map to individual flows as we describe the relevant tables of our pipeline. ```yaml apiVersion: networking.k8s.io/v1 @@ -155,22 +219,19 @@ spec: port: 80 ``` -This Network Policy is applied to all Pods with the `nginx` app label in the -`default` Namespace. For these Pods, it only allows TCP traffic on port 80 from -and to Pods which also have the `nginx` app label. Because Antrea will only -install OVS flows for this Network Policy on Nodes for which some of the Pods -are the target of the policy, we have scheduled 2 `nginx` Pods on the same -Node. They received IP addresses 10.10.1.2 and 10.10.1.3 from the Antrea CNI, so -you will see these addresses show up in the OVS flows. +This Kubernetes NetworkPolicy is applied to all Pods with the `nginx` app label in the `default` Namespace. For these +Pods, it only allows TCP traffic on port 80 from and to Pods which also have the `nginx` app label. Because Antrea will +only install OVS flows for this Kubernetes NetworkPolicy on Nodes for which some of the Pods are the target of the +policy, we have scheduled 2 `app:nginx` Pods on the same Node. They received IP addresses `10.10.0.7` and `10.10.0.8` +from the Antrea CNI, so you will see these IP addresses show up in the OVS flows. -## Antrea-native Policies Implementation +## Antrea-native NetworkPolicy Implementation -In addition to the above tables created for K8s NetworkPolicy, Antrea creates -additional dedicated tables to support the [Antrea-native policies](../antrea-network-policy.md) -([AntreaPolicyEgressRuleTable] and [AntreaPolicyIngressRuleTable]). +In addition to the above tables created for Kubernetes NetworkPolicy, Antrea creates additional dedicated tables to +support the [Antrea-native NetworkPolicy](../antrea-network-policy.md) (table [AntreaPolicyEgressRule] and [AntreaPolicyIngressRule]). -Consider the following Antrea ClusterNetworkPolicy (ACNP) in the Application tier as an -example for the remainder of this document. 
+Consider the following Antrea ClusterNetworkPolicy (ACNP) in the Application tier as an example for the remainder of +this document. ```yaml apiVersion: crd.antrea.io/v1beta1 @@ -183,1015 +244,1224 @@ spec: appliedTo: - podSelector: matchLabels: - app: server + app: nginx ingress: - - action: Drop + - action: Allow from: - podSelector: matchLabels: - app: notClient + app: nginx ports: - protocol: TCP port: 80 + - action: Drop egress: - action: Allow to: - podSelector: matchLabels: - app: dns + app: nginx ports: - - protocol: UDP - port: 53 + - protocol: TCP + port: 80 + - action: Reject ``` -This ACNP is applied to all Pods with the `app: server` label in all -Namespaces. For these Pods, it drops TCP traffic on port 80 from all -Pods which have the `app: notClient` label. In addition to the ingress rules, -this policy also allows egress UDP traffic on port 53 to all Pods with the -label `app: dns`. Similar to K8s NetworkPolicy, Antrea will only install OVS -flows for this ACNP on Nodes for which some of the Pods are the target of the -policy. Thus, we have scheduled three Pods (appServer, appDns, appNotClient) -on the same Node and they have the following IP addresses: - -- appServer: 10.10.1.6 -- appNotClient: 10.10.1.7 -- appDns: 10.10.1.8 +This ACNP is applied to all Pods with the `app: nginx` label in all Namespaces. For these Pods, it only allows TCP traffic +on port 80 from all Pods with the label `app: nginx` and drop others. In addition to the ingress rules, this policy only +allows egress TCP traffic on port 80 to all Pods with the label `app: nginx` and reject others. Similar to Kubernetes +NetworkPolicy, Antrea will only install OVS flows for this policy on Nodes for which some of the Pods are the target of +the policy. We still use `app:nginx` Pods mentioned above as the target of this policy. -## Tables +## OVS Tables -![OVS pipeline](../assets/ovs-pipeline-antrea-proxy.svg) - -### ClassifierTable (0) - -This table is used to determine which "category" of traffic (tunnel, local -gateway or local Pod) the packet belongs to. This is done by matching on the -ingress port for the packet. The appropriate value is then written to bits -[0..3] in NXM_NX_REG0: 0 for tunnel, 1 for local gateway and 2 for local Pod. -This information is used by matches in subsequent tables. For a packet received -from the tunnel port, bit 19 in NXM_NX_REG0 is set to 1, to indicate MAC rewrite -should be performed for the packet in [L3ForwardingTable]. - -If you dump the flows for this table, you may see the following: +![OVS pipeline](../assets/ovs-pipeline.svg) -```text -1. table=0, priority=200,in_port=2 actions=set_field:0x1/0xf->reg0,resubmit(,10) -2. table=0, priority=200,in_port=1 actions=set_field:0/0xf->reg0,load:0x1->NXM_NX_REG0[19],resubmit(,30) -3. table=0, priority=190,in_port=4 actions=set_field:0x2/0xf->reg0,resubmit(,10) -4. table=0, priority=190,in_port=3 actions=set_field:0x2/0xf->reg0,resubmit(,10) -5. table=0, priority=0 actions=drop -``` +### PipelineRootClassifier -Flow 1 is for traffic coming in on the local gateway. Flow 2 is for traffic -coming in through an overlay tunnel (i.e. from another Node). The next two -flows (3 and 4) are for local Pods. - -Local traffic then goes to [SpoofGuardTable], while tunnel traffic from other -Nodes goes to [ConntrackTable]. The table-miss flow entry will drop all -unmatched packets (in practice this flow entry should almost never be used). 
- -### SpoofGuardTable (10) - -This table prevents IP and ARP -[spoofing](https://en.wikipedia.org/wiki/Spoofing_attack) from local Pods. For -each Pod (as identified by the ingress port), we ensure that: - -* for IP traffic, the source IP and MAC addresses are correct, i.e. match the - values configured on the interface when Antrea set-up networking for the Pod. -* for ARP traffic, the advertised IP and MAC addresses are correct, i.e. match - the values configured on the interface when Antrea set-up networking for the - Pod. - -Because Antrea currently relies on kube-proxy to load-balance traffic destined -to Services, implementing that kind of IP spoofing check for traffic coming-in -on the local gateway port is not as trivial. Traffic from local Pods destined to -Services will first go through the gateway, get load-balanced by the kube-proxy -datapath (DNAT) then sent back through the gateway. This means that legitimate -traffic can be received on the gateway port with a source IP belonging to a -local Pod. We may add some fine-grained rules in the future to accommodate for -this, but for now we just allow all IP traffic received from the gateway. We do -have an ARP spoofing check for the gateway however, since there is no reason for -the host to advertise a different MAC address on antrea-gw0. +This table serves as the primary entry point in the pipeline, directing packets to different tables based on their +respective protocols. -If you dump the flows for this table, you may see the following: +If you dump the flows for this table, you should see the following: ```text -1. table=10, priority=200,ip,in_port=2 actions=resubmit(,23) -2. table=10, priority=200,arp,in_port=2,arp_spa=10.10.0.1,arp_sha=3a:dd:79:0f:55:4c actions=resubmit(,20) -3. table=10, priority=200,arp,in_port=4,arp_spa=10.10.0.2,arp_sha=ce:99:ca:bd:62:c5 actions=resubmit(,20) -4. table=10, priority=200,arp,in_port=3,arp_spa=10.10.0.3,arp_sha=3a:41:49:42:98:69 actions=resubmit(,20) -5. table=10, priority=200,ip,in_port=4,dl_src=ce:99:ca:bd:62:c5,nw_src=10.10.0.2 actions=resubmit(,23) -6. table=10, priority=200,ip,in_port=3,dl_src=3a:41:49:42:98:69,nw_src=10.10.0.3 actions=resubmit(,23) -7. table=10, priority=0 actions=drop +1. table=PipelineRootClassifier, priority=200,arp actions=goto_table:ARPSpoofGuard +2. table=PipelineRootClassifier, priority=200,ip actions=goto_table:Classifier +3. table=PipelineRootClassifier, priority=0 actions=drop ``` -After this table, ARP traffic goes to [ARPResponderTable], while IP -traffic goes to [ServiceHairpinTable]. Traffic which does not match -any of the rules described above will be dropped by the table-miss flow entry. +- Flow 1 forwards ARP packets to table [ARPSpoofGuard]. +- Flow 2 forwards IP packets to table [Classifier]. +- Flow 3 is the default drop flow to drop other unsupported protocols, not normally used. -### ARPResponderTable (20) +### ARPSpoofGuard -The main purpose of this table is to reply to ARP requests from the local -gateway asking for the MAC address of a remote peer gateway (another Node's -gateway). This ensures that the local Node can reach any remote Pod, which in -particular is required for Service traffic which has been load-balanced to a -remote Pod backend by kube-proxy. Note that the table is programmed to reply to -such ARP requests with a "Global Virtual MAC" ("Global" because it is used by -all Antrea OVS bridges), and not with the actual MAC address of the remote -gateway. 
This ensures that once the traffic is received by the remote OVS -bridge, it can be directly forwarded to the appropriate Pod without actually -going through the gateway. The Virtual MAC is used as the destination MAC -address for all the traffic being tunnelled. +This table drops ARP [spoofing](https://en.wikipedia.org/wiki/Spoofing_attack) from local Pods and local Antrea gateway. +For each Pod (as identified by the ingress port), we ensure that: the advertised IP and MAC addresses are correct, i.e. +match the values configured on the interface when Antrea set up networking for the Pod. If you dump the flows for this table, you may see the following: ```text -1. table=20, priority=200,arp,arp_tpa=10.10.1.1,arp_op=1 actions=move:NXM_OF_ETH_SRC[]->NXM_OF_ETH_DST[],mod_dl_src:aa:bb:cc:dd:ee:ff,set_field:2->arp_op,move:NXM_NX_ARP_SHA[]->NXM_NX_ARP_THA[],load:0xaabbccddeeff->NXM_NX_ARP_SHA[],move:NXM_OF_ARP_SPA[]->NXM_OF_ARP_TPA[],load:0xa0a0101->NXM_OF_ARP_SPA[],IN_PORT -2. table=20, priority=190,arp actions=NORMAL -3. table=20, priority=0 actions=drop +1. table=ARPSpoofGuard, priority=200,arp,in_port="antrea-gw0",arp_spa=10.10.0.1,arp_sha=ba:5e:d1:55:aa:c0 actions=goto_table:ARPResponder +2. table=ARPSpoofGuard, priority=200,arp,in_port="nginx-d9-cd1533",arp_spa=10.10.0.7,arp_sha=2e:ba:06:b2:44:91 actions=goto_table:ARPResponder +3. table=ARPSpoofGuard, priority=200,arp,in_port="nginx-d9-b93cc5",arp_spa=10.10.0.8,arp_sha=c2:5a:5e:50:95:9b actions=goto_table:ARPResponder +4. table=ARPSpoofGuard, priority=0 actions=drop ``` -Flow 1 is the "ARP responder" for the peer Node whose local Pod subnet is -10.10.1.0/24. If we were to look at the routing table for the local Node, we -would see the following "onlink" route: +- Flow 1 matches ARP packets from local Antrea gateway. +- Flows 2-3 match ARP packets from local Pods. +- Flow 4 is the default flow to drop ARP spoofing packets which are not matched flows 1-4. -```text -10.10.1.0/24 via 10.10.1.1 dev antrea-gw0 onlink -``` +For more details of flows 1-3: -A similar route is installed on the gateway (antrea-gw0) interface every time the -Antrea Node Route Controller is notified that a new Node has joined the -cluster. The route must be marked as "onlink" since the kernel does not have a -route to the peer gateway 10.10.1.1: we trick the kernel into believing that -10.10.1.1 is directly connected to the local Node, even though it is on the -other side of the tunnel. +- Match condition `arp` is to match ARP packets. +- Match condition `in_port=` is to match the OVS port where packets are originated. +- Match condition `arp_spa=` is to match packets with ARP source protocol address, which corresponds to the + IP address of a local Pod or local Antrea gateway. +- Match condition `arp_sha=` matches packets with ARP source hardware address, which corresponds to the MAC + address of a local Pod or local Antrea gateway. -Flow 2 ensures that OVS handle the remainder of ARP traffic as a regular L2 -learning switch (using the `normal` action). In particular, this takes care of -forwarding ARP requests and replies between local Pods. +### ARPResponder -The table-miss flow entry (flow 3) will drop all other packets. This flow should -never be used because only ARP traffic should go to this table, and -ARP traffic will either match flow 1 or flow 2. +The main purpose of this table is to reply to ARP requests from local Antrea gateway asking for the MAC address of a +remote peer gateway (another Node's gateway). 
This ensures that the local Node can reach any remote Pod, which in particular +is required for Service traffic which has been load-balanced to a remote Pod backend by kube-proxy. Note that the table +is programmed to reply to such ARP requests with a "Global Virtual MAC" ("Global" means it is used by all Antrea OVS +bridges, which is `aa:bb:cc:dd:ee:ff`), and not with the actual MAC address of the remote gateway. This ensures that +once the traffic is received by the remote OVS bridge, it can be directly forwarded to the appropriate Pod without actually +going through the gateway. The virtual MAC is used as the destination MAC address for all the traffic being tunnelled. -### ServiceHairpinTable (23) - -When a backend Pod of a Service accesses the Service, and the Pod itself is selected -as the destination, then we have the hairpin case, in which the source IP should be -SNAT'd with a virtual hairpin IP in [hairpinSNATTable]. The source and destination -IP addresses cannot be the same, otherwise the connection will be broken. It will be -explained in detail in [hairpinSNATTable]. For response packets, the -destination IP is the virtual hairpin IP, so the destination IP should be changed back -to the IP of the backend Pod. Then the response packets can be forwarded back correctly. - -If you dump the flows for this table, you should see the flows: +If you dump the flows for this table, you may see the following: ```text -1. table=23, priority=200,ip,nw_dst=169.254.169.252 actions=move:NXM_OF_IP_SRC[]->NXM_OF_IP_DST[],load:0x1->NXM_NX_REG0[18],resubmit(,30) -2. table=23, priority=0 actions=resubmit(,24) +1. table=ARPResponder, priority=200,arp,arp_tpa=10.10.1.1,arp_op=1 actions=move:NXM_OF_ETH_SRC[]->NXM_OF_ETH_DST[],set_field:aa:bb:cc:dd:ee:ff->eth_src,set_field:2->arp_op,move:NXM_NX_ARP_SHA[]->NXM_NX_ARP_THA[],set_field:aa:bb:cc:dd:ee:ff->arp_sha,move:NXM_OF_ARP_SPA[]->NXM_OF_ARP_TPA[],set_field:10.10.1.1->arp_spa,IN_PORT +2. table=ARPResponder, priority=190,arp actions=NORMAL +3. table=ARPResponder, priority=0 actions=drop ``` -Flow 1 is used to match packet whose destination IP is virtual hairpin IP and -change the destination IP of the matched packet by loading register `NXM_OF_IP_SRC` -to `NXM_OF_IP_DST`. Bit 18 in NXM_NX_REG0 is set to 0x1, which indicates that the -packet should be output to the port on which it was received, which is done in -[L2ForwardingOutTable]. - -### ConntrackTable (30) - -The sole purpose of this table is to invoke the `ct` action on all packets and -set the `ct_zone` (connection tracking context) to a hard-coded value, then -forward traffic to [ConntrackStateTable]. If you dump the flows for this table, -you should only see 1 flow: +- Flow 1 matches ARP requests from Antrea gateway asking for the MAC address of a remote peer gateway with IP address + 10.10.1.1. The actions are taken to craft an ARP reply packet and send it back to the port where the ARP request was received. + - Match condition `arp` is to match ARP packets. + - Match condition `arp_tpa=10.10.1.1` is to match packets with ARP target protocol address, which corresponds to the IP + address of a remote peer gateway. + - Match condition `arp_op=1` is to match ARP request packets. + - Action `move:NXM_OF_ETH_SRC[]->NXM_OF_ETH_DST[]` is to set destination MAC address with the source MAC address + of the current packet. + - Action `set_field:aa:bb:cc:dd:ee:ff->eth_src` is to set source MAC address with "Global Virtual MAC" `aa:bb:cc:dd:ee:ff`. 
+ - Action `set_field:2->arp_op` is to set the ARP type to reply. + - Action `move:NXM_NX_ARP_SHA[]->NXM_NX_ARP_THA[]` is to set ARP target hardware address with the ARP source hardware address + of the current packet. + - Action `set_field:aa:bb:cc:dd:ee:ff->arp_sha` is to set ARP source hardware address with global virtual MAC address + `aa:bb:cc:dd:ee:ff`. + - Action `move:NXM_OF_ARP_SPA[]->NXM_OF_ARP_TPA[]` is to set ARP target protocol address with source protocol address + of the current packet. + - Action `set_field:10.10.1.1->arp_spa` is to set ARP source protocol address. + - Action `IN_PORT` is to set output port, ensuring packets are sent to where they were received. +- Flow 2 handles ARP request packets normally. +- Flow 3 is the default drop flow, not normally used in practice. + +### Classifier + +This table is used to determine which "category" of traffic (tunnel, local Antrea gateway or local Pod, etc.) to +which a packet belongs to. This is achieved by matching on the ingress port of the packet. + +If you dump the flows of this table, you may see the following: ```text -1. table=30, priority=200,ip actions=ct(table=31,zone=65520) +1. table=Classifier, priority=210,ip,in_port="antrea-gw0",nw_src=10.10.0.1 actions=set_field:0x2/0xf->reg0,goto_table:SpoofGuard +2. table=Classifier, priority=200,in_port="antrea-gw0" actions=set_field:0x2/0xf->reg0,set_field:0x8000000/0x8000000->reg4,goto_table:SpoofGuard +3. table=Classifier, priority=200,in_port="antrea-tun0" actions=set_field:0x1/0xf->reg0,set_field:0x200/0x200->reg0,goto_table:UnSNAT +4. table=Classifier, priority=190,in_port="nginx-d9-cd1533" actions=set_field:0x3/0xf->reg0,goto_table:SpoofGuard +5. table=Classifier, priority=190,in_port="nginx-d9-b93cc5" actions=set_field:0x3/0xf->reg0,goto_table:SpoofGuard +6. table=Classifier, priority=0 actions=drop ``` -A `ct_zone` is simply used to isolate connection tracking rules. It is similar -in spirit to the more generic Linux network namespaces, but `ct_zone` is -specific to conntrack and has less overhead. - -After invoking the ct action, packets will be in the "tracked" (`trk`) state and -all [connection tracking -fields](https://www.openvswitch.org/support/dist-docs/ovs-fields.7.txt) will be -set to the correct value. Packets will then move on to [ConntrackStateTable]. - -Refer to [this -document](https://docs.openvswitch.org/en/latest/tutorials/ovs-conntrack/) for -more information on connection tracking in OVS. - -### ConntrackStateTable (31) - -This table handles "tracked" packets (packets which are moved to the tracked -state by the previous table [ConntrackTable]) and "untracked" packets (packets -is not in tracked state). - -This table serves the following purposes: +- Flow 1 matches packets originated from local Node through Antrea gateway port. + - Match conditions `ip` and `nw_src=10.10.0.1` are to match packets originated from local Node. Note that, + `nw_src=10.10.0.1` cannot be used alone with ip family match condition like `ip`. + - Match condition `in_port="antrea-gw0"` is to match packets which are received on Antrea gateway port. + - Action `set_field:0x2/0xf->reg0` is to load `FromGatewayRegMark` to mark packet source. + - Action `goto_table:SpoofGuard` is to forward packets to table [SpoofGuard] to validate their legitimacy. +- Flow 2 matches packets originated from external network through Antrea gateway port. 
Since packets originating from + local Node through local Antrea gateway port are matched by flow 1, flow 2 can only match packets originated from + external network. + - Match condition `in_port="antrea-gw0"` is to match packets which are received on local Antrea gateway port. + - Action `set_field:0x2/0xf->reg0` is to load `FromGatewayRegMark` to mark packet source. + - Action `set_field:0x8000000/0x8000000->reg4` is to load `FromExternalRegMark` mark packets that are from external + network, not local Node. + - Action `goto_table:SpoofGuard` is the same as flow 1. +- Flow 3 matches packets through an overlay tunnel (i.e., from another Node). + - Match condition `in_port="antrea-tun0"` is to match packets which are received on Antrea tunnel port. + - Action `set_field:0x1/0xf->reg0` is to load `FromTunnelRegMark` to mark packet source. + - Action `set_field:0x200/0x200->reg0` is to load `RewriteMACRegMark`, which indicates that the source and destination + MAC addresses of the packets should be overwritten. This mark will be used in table [L3Forwarding]. + - Action `goto_table:UnSNAT` is to forward packets to table [UnSNAT], rather than [SpoofGuard] without further verification. + This approach is based on the understanding that tunnel-borne packets stem from remote Nodes, potentially bearing + varying source IP addresses. It's important to note that these packets undergo verification before being tunneled. + As a consequence, packets from the tunnel should be seamlessly forwarded to table [UnSNAT]. +- Flows 4-5 match packets from local Pods. + - Match condition `in_port=` is to match packets which are from local Pods. + - Action `set_field:0x3/0xf->reg0` is to load `FromLocalRegMark` to mark packet source. + - Action `goto_table:SpoofGuard` is the same as flow 1. +- Flow 6 is the default drop flow, not normally used in practice. + +### SpoofGuard + +This table is designed to drop IP [spoofing](https://en.wikipedia.org/wiki/Spoofing_attack) from local Pods. For each +Pod (as identified by the ingress port), we ensure that the source IP and MAC addresses are correct, i.e. match the +values configured on the interface when Antrea set up networking for the Pod. -* For tracked Service packets, bit 19 in NXM_NX_REG0 will be set to 0x1, then - the tracked packet will be forwarded to [EgressRuleTable] directly. -* Drop packets reported as invalid by conntrack. -* Non-Service tracked packets goes to [EgressRuleTable] directly. -* Untracked packets goes to [SessionAffinityTable] and [ServiceLBTable]. - -If you dump the flows for this table, you should see the following: +If you dump the flows for this table, you may see the following: ```text -1. table=31, priority=200,ct_state=-new+trk,ct_mark=0x21,ip actions=load:0x1->NXM_NX_REG0[19],resubmit(,50) -2. table=31, priority=190,ct_state=+inv+trk,ip actions=drop -3. table=31, priority=190,ct_state=-new+trk,ip actions=resubmit(,50) -4. table=31, priority=0 actions=resubmit(,40),resubmit(,41) +1. table=SpoofGuard, priority=200,ip,in_port="antrea-gw0" actions=goto_table:UnSNAT +2. table=SpoofGuard, priority=200,ip,in_port="nginx-d9-cd1533",dl_src=2e:ba:06:b2:44:91,nw_src=10.10.0.7 actions=goto_table:UnSNAT +3. table=SpoofGuard, priority=200,ip,in_port="nginx-d9-b93cc5",dl_src=c2:5a:5e:50:95:9b,nw_src=10.10.0.8 actions=goto_table:UnSNAT +4. table=SpoofGuard, priority=0 actions=drop ``` -Flow 1 is used to forward tracked Service packets to [EgressRuleTable] directly, -without passing [SessionAffinityTable], [ServiceLBTable] and [EndpointDNATTable]. 
-The flow also sets bit 19 in NXM_NX_REG0 to 0x1, which indicates that the destination -and source MACs of the matched packets should be rewritten in [l3ForwardingTable]. - -Flow 2 is used to drop packets which is reported as invalid by conntrack. - -Flow 3 is used to forward tracked non-Service packets to [EgressRuleTable] directly, -without passing [SessionAffinityTable], [ServiceLBTable] and [EndpointDNATTable]. - -Flow 4 is used to match the first packet of untracked connection and forward it to -[SessionAffinityTable] and [ServiceLBTable]. - -### SessionAffinityTable (40) +- Flow 1 matches packets recevied from local Antrea gateway port, but does not check source IP and MAC address. There are + some cases where the source IP of the packets through local Antrea gateway is not the local Antrea gateway IP: + - When Antrea is deployed with kube-proxy and AntreaProxy is not enabled, packets from local Pods destined to Services + will first go through the gateway, get load-balanced by the kube-proxy datapath (DNAT) then sent back through the + gateway. This means that legitimate packets can be received on the gateway port with a source IP belonging to a local + Pod. + - When both AntreaProxy and proxyAll are enabled, packets from external destined to Services will be routed to OVS + through the gateway without changing the source IP of the packets. + - When Antrea is deployed with kube-proxy and AntreaProxy is enabled, packets from external destined to Services will + get load-balanced by the kube-proxy datapath (DNAT), then be routed to OVS through the gateway without SNAT. +- Flows 2-3 matches regular IP packets from local Pods. + - Match condition `dl_src=` is to match packets with source MAC address, which corresponds to the MAC + address of a local Pod. + - Match condition `nw_src=` is to match packets with source IP address, which corresponds to the IP + address of a local Pod. + - Other match conditions and actions are the same as flow 1. +- Flow 4 is the default flow to drop IP spoofing packets. + +### UnSNAT + +This table is used to invoke `ct` action on the responded packets from Service connections that have been committed in +`SNATCtZone` or `SNATCtZoneV6`. After invoking `ct` action, packets will be in the "tracked" state and all [connection +tracking fields](https://www.openvswitch.org//support/dist-docs/ovs-fields.7.txt) will be set to the correct value. + +Ct zone is simply used to isolate connection tracking rules. It is conceptually similar to the more generic Linux network +namespaces, but ct zone is specific to conntrack and has less overhead. Please refer to [this +document](https://docs.openvswitch.org/en/latest/tutorials/ovs-conntrack/) for more information on connection tracking +in OVS. -If `service.spec.sessionAffinity` of a Service is `None`, this table will set the value -of bits [16..18] in NXM_NX_REG4 to 0b001, which indicates that the Service needs to do -Endpoint selection. If you dump the flow, you should see the flow: +If you dump the flows for this table, you may see the following: ```text -table=40, priority=0 actions=load:0x1->NXM_NX_REG4[16..18] +1. table=UnSNAT, priority=200,ip,nw_dst=169.254.0.253 actions=ct(table=ConntrackZone,zone=65521,nat) +2. table=UnSNAT, priority=200,ip,nw_dst=10.10.0.1 actions=ct(table=ConntrackZone,zone=65521,nat) +3. 
table=UnSNAT, priority=0 actions=goto_table:ConntrackZone
```

- Flows 1-2 match reply packets of SNAT'd Service connections.
  - Match condition `nw_dst=169.254.0.253` is to match packets SNAT'd with the virtual IP.
  - Match condition `nw_dst=10.10.0.1` is to match packets SNAT'd with the local Antrea gateway IP. This will also match
    packets which are from local Pods destined to the local Antrea gateway, but it will not cause any side effect since
    such connections will never be committed in `SNATCtZone`.
  - Action `ct(table=ConntrackZone,zone=65521,nat)` is to invoke the `ct` action on matched packets. The packets will be
    forwarded to table [ConntrackZone] with their "tracked" state in `SNATCtZone` restored, so that the destination IP
    is translated back to the original IP before SNAT.
- Flow 3 is the default flow to forward packets to table [ConntrackZone].

### ConntrackZone

This table is used to invoke the `ct` action on packets from all connections. After invoking the `ct` action, packets
will be in the "tracked" state. It is worth noting that once the `ct` action with `CtZone` is invoked on packets that
already have a "tracked" state in `SNATCtZone`, that state becomes inaccessible in `CtZone`. This transition occurs
because the "tracked" state shifts to the current ct zone. As previously mentioned, a ct zone is similar in spirit to
the more generic Linux network namespaces, and a connection's "tracked" state is only visible within its own ct zone.

If you dump the flows for this table, you may see the following:

```text
1. table=ConntrackZone, priority=200,ip actions=ct(table=ConntrackState,zone=65520,nat)
2. table=ConntrackZone, priority=0 actions=goto_table:ConntrackState
```

- Flow 1 invokes the `ct` action on packets from all connections. The packets will be forwarded to table [ConntrackState]
  with their "tracked" state in `CtZone` restored. For Service connections, the original IP before DNAT will be restored.
- Flow 2 is an auto-generated flow that should remain unused.

### ConntrackState

This table handles packets of connections tracked in `CtZone` or `CtZoneV6`, i.e. packets which were moved to the
"tracked" state by the previous table [ConntrackZone].
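To see which connections have actually been committed to these zones, you can also inspect the conntrack table from the
`antrea-ovs` container. This is only a sketch; it assumes the default zone IDs from the CT Zones section above and an
OVS version whose `dpctl/dump-conntrack` command accepts a `zone=` argument.

```bash
# Dump connections tracked in CtZone (IPv4 connections that do not require SNAT).
kubectl exec -n kube-system <antrea-agent-pod-name> -c antrea-ovs -- \
  ovs-appctl dpctl/dump-conntrack zone=65520

# Dump connections tracked in SNATCtZone (IPv4 connections that require SNAT).
kubectl exec -n kube-system <antrea-agent-pod-name> -c antrea-ovs -- \
  ovs-appctl dpctl/dump-conntrack zone=65521
```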
-* `load:0x2->NXM_NX_REG4[16..18]` is used to set the value of bits [16..18] in NXM_NX_REG4 - to 0b002, which indicates that Endpoint selection "is performed". Note that, Endpoint - selection has not really been done yet - it will be done by group action. The current - action should have been done in target OVS group entry after Endpoint selection. However, - we set the bits here, for the purpose of supporting more Endpoints in an OVS group. - Please check PR [#2101](https://github.com/antrea-io/antrea/pull/2101) to learn more information. -* `load:0x1->NXM_NX_REG0[19]` is used to set the value of bit 19 in NXM_NX_REG0 to 0x1, - which means that the source and destination MACs need to be rewritten. -* `group:5` is used to set the target OVS group. Note that, the target group needs to be - created first before the flow is created. - -Dump the group entry with command `ovs-ofctl dump-groups br-int 5`, you should see the -following: +If you dump the flows for this table, you may see the following: ```text -group_id=5,type=select,\ -bucket=bucket_id:0,weight:100,actions=load:0xa0a0002->NXM_NX_REG3[],load:0x23c1->NXM_NX_REG4[0..15],resubmit(,42),\ -bucket=bucket_id:1,weight:100,actions=load:0xa0a0003->NXM_NX_REG3[],load:0x23c1->NXM_NX_REG4[0..15],resubmit(,42),\ -bucket=bucket_id:2,weight:100,actions=load:0xa0a0004->NXM_NX_REG3[],load:0x23c1->NXM_NX_REG4[0..15],resubmit(,42) +1. table=ConntrackState, priority=200,ct_state=+inv+trk,ip actions=drop +2. table=ConntrackState, priority=190,ct_state=-new+trk,ct_mark=0/0x10,ip actions=goto_table:AntreaPolicyEgressRule +3. table=ConntrackState, priority=190,ct_state=-new+trk,ct_mark=0x10/0x10,ip actions=set_field:0x200/0x200->reg0,goto_table:AntreaPolicyEgressRule +4. table=ConntrackState, priority=0 actions=goto_table:PreRoutingClassifier ``` -For the above OVS group, there are three buckets which have the same weight. Every bucket -has the same chance to be selected since they have the same weight. The selected bucket -will load Endpoint IPv4 address to NXM_NX_REG3, Endpoint port number to bits [0..15] -in NXM_NX_REG4. Then the matched packet will be resubmitted to [EndpointDNATTable]. +- Flow 1 drops packets which are reported as invalid by conntrack. + - Match condition `ct_state=+inv+trk` is to match packets which are reported as invalid by conntrack. + - Action `drop` is to drop packets. +- Flow 2 matches packets of non-Service connections which are in "tracked" state and committed to connection tracking module. + - Match condition `ct_state=-new+trk` is to match packets which are tracked in `CtZone` but not new. + - Match condition `ct_mark=0/0x10` is to match `NotServiceCTMark`, indicating that packets are from non-Service connections. + - Action `goto_table:AntreaPolicyEgressRule` is to forward packets to table [AntreaPolicyEgressRule] to do egress + policy enforcement. +- Flow 3 matches packets of Service which are in "tracked" state and committed to connection tracking module. + - Match condition `ct_state=-new+trk` is the same as flow 2. + - Match condition `ct_mark=0x10/0x10` is to match `ServiceCTMark`, indicating that packets are from Service connections. + `ServiceCTMark` is persisted when the corresponding connection is committed in `CtZone`, like sample flows 2-3 in + table [EndpointDNAT]. + - Action `set_field:0x200/0x200->reg0` is to load `RewriteMACRegMark`, which indicates that the destination and source + MACs of packets should be overwritten. This mark will be used in table [L3Forwarding]. 
+ - Action `goto_table:AntreaPolicyEgressRule` is to forward packets to table [AntreaPolicyEgressRule] to do egress + policy enforcement directly, skipping the tables for Service Endpoint selection. +- Flow 4 matches packets which are not matched by flows 1-3 and forwards them to table[PreRoutingClassifier]. + +### PreRoutingClassifier + +This table sequentially resubmits the first packet from untracked connections to table [SessionAffinity] and table +[ServiceLB] to do Service Endpoint selection. -When a ClusterIP Service is created with `service.spec.sessionAffinity` set to `ClientIP`, you may -see the following flows: +If you dump the flows for this table, you should see the following: ```text -1. table=41, priority=200,tcp,reg4=0x10000/0x70000,nw_dst=10.107.100.231,tp_dst=443 actions=load:0x3->NXM_NX_REG4[16..18],load:0x1->NXM_NX_REG0[19],group:5 -2. table=41, priority=190,tcp,reg4=0x30000/0x70000,nw_dst=10.107.100.231,tp_dst=443 actions=\ - learn(table=40,hard_timeout=300,priority=200,delete_learned,cookie=0x2040000000008, \ - eth_type=0x800,nw_proto=6,NXM_OF_TCP_DST[],NXM_OF_IP_DST[],NXM_OF_IP_SRC[],\ - load:NXM_NX_REG3[]->NXM_NX_REG3[],load:NXM_NX_REG4[0..15]->NXM_NX_REG4[0..15],load:0x2->NXM_NX_REG4[16..18],load:0x1->NXM_NX_REG0[19]),\ - load:0x2->NXM_NX_REG4[16..18],\ - resubmit(,42) +1. table=PreRoutingClassifier, priority=200,ip actions=resubmit(,SessionAffinity),resubmit(,ServiceLB) +2. table=PreRoutingClassifier, priority=0 actions=goto_table:SessionAffinity ``` -When a client (assumed that the source IP is 10.10.0.2) accesses the ClusterIP for the first -time, the first packet of the connection will be matched by flow 1. Note that the action -`load:0x3->NXM_NX_REG4[16..18]` indicates that the Service Endpoint selection result needs -to be cached. - -Dump the group entry with command `ovs-ofctl dump-groups br-int 5`, you should see the -following: +- Flow 1 sequentially resubmits packets to table [SessionAffinity] and table [ServiceLB]. +- Flow 2 is the auto-generated flow which is never used. -```text -group_id=5,type=select,\ -bucket=bucket_id:0,weight:100,actions=load:0xa0a0002->NXM_NX_REG3[],load:0x23c1->NXM_NX_REG4[0..15],resubmit(,41),\ -bucket=bucket_id:1,weight:100,actions=load:0xa0a0003->NXM_NX_REG3[],load:0x23c1->NXM_NX_REG4[0..15],resubmit(,41),\ -bucket=bucket_id:2,weight:100,actions=load:0xa0a0004->NXM_NX_REG3[],load:0x23c1->NXM_NX_REG4[0..15],resubmit(,41) -``` +### SessionAffinity -Note the action `resubmit(,41)` resubmits the first packet of a ClusterIP Service connection -back to [ServiceLBTable], not resubmits the packet to [EndpointDNATTable]. Then the -packet will be matched by flow 2 since value of bits [16..18] in NXM_NX_REG4 is 0b011. One -action of the flow is to generate a learned flow in [SessionAffinityTable], the other -action is to resubmit the packet to [EndpointDNATTable]. +This table is used to implement Service session affinity. The learned flows that cache the selected Endpoints are +installed here. -Now if you dump flows of table [SessionAffinityTable], you may see the following flows: +If you dump the flows for this table, you may see the following: ```text -1. table=40, hard_timeout=300, priority=200,tcp,nw_src=10.10.0.2,nw_dst=10.107.100.231,tp_dst=443 \ - actions=load:0xa0a0002->NXM_NX_REG3[],load:0x23c1->NXM_NX_REG4[0..15],load:0x2->NXM_NX_REG4[16..18],load:0x1->NXM_NX_REG0[19] -2. table=40, priority=0 actions=load:0x1->NXM_NX_REG4[16..18] +1. 
table=SessionAffinity, hard_timeout=300, priority=200,tcp,nw_src=10.10.0.7,nw_dst=10.96.76.15,tp_dst=80 \ + actions=set_field:0x50/0xffff->reg4,set_field:0/0x4000000->reg4,set_field:0xa0a0007->reg3,set_field:0x20000/0x70000->reg4,set_field:0x200/0x200->reg0 +2. table=SessionAffinity, priority=0 actions=set_field:0x10000/0x70000->reg4 ``` -Note that, flow 1 (the generated learned flow) has higher priority than flow 2 in table -[SessionAffinityTable]. When a particular client accesses the ClusterIP once again, the first -packet of the connection will be matched by flow 1 due to the match condition `nw_src=10.10.0.2`. - -The actions of flow 1: - -* `load:0xa0a0004->NXM_NX_REG3[]` is used to load Endpoint IPv4 address to NXM_NX_REG3. -* `load:0x50->NXM_NX_REG4[0..15]` is used to load Endpoint port number to bits [0..15] in - NXM_NX_REG4. -* `load:0x2->NXM_NX_REG4[16..18]` is used to set the value of bits [16..18] in NXM_NX_REG4 to - 0b010, which indicates that the Service has done Endpoint selection. -* `load:0x1->NXM_NX_REG0[19]` is used to set the value of bit 19 in NXM_NX_REG0 to 0x1, which - indicates that the source and destination MACs need to be rewritten. - -Note that, if the value of bits [16..18] in NXM_NX_REG4 is 0b010 (set by action `load:0x2->NXM_NX_REG4[16..18]` -in table [SessionAffinityTable]), then packet will not be matched by any flows in table -[ServiceLBTable] except the last one. The last one just forwards the packet to table -[EndpointDNATTable] without selecting target OVS group. Then connections from a particular -client will always access the same backend Pod within the session timeout setting by -`service.spec.sessionAffinityConfig.clientIP.timeoutSeconds`. - -### EndpointDNATTable (42) - -The table implements DNAT for Service traffic after Endpoint selection for the first -packet of a Service connection. +- Flow 1 is a learned flow generated by flow 8 in [ServiceLB]. It matches the first packet of the subsequent connections + (the first is to do Endpoint selection and trigger learned flow) destined to a Service with setting + `service.spec.sessionAffinity` to `ClientIP`. When a client accesses the Service for the first time, this flow with + hard timeout which equals `service.spec.sessionAffinityConfig.clientIP.timeoutSeconds` of the Service will be installed. + - Field `hard_timeout=300` is the hard timeout of the learned flow. After the hard timeout, the learned flow will be deleted. + - Match condition `tcp` is to match TCP packets, generated by `eth_type=0x800` and `nw_proto=6` in table [ServiceLB]. + - Match condition `nw_src=10.10.0.7` is to match packets with source IP address, which corresponds to the IP address + of a client, generated by `NXM_OF_IP_SRC[]` in table [ServiceLB], flow 8. + - Match condition `nw_dst=10.96.76.15` is to match packets with destination IP address, which corresponds to the IP + address of a Service, generated by `NXM_OF_IP_DST[]` in table [ServiceLB], flow 8. + - Match condition `tp_dst=80` is to match packets with destination port, which corresponds to the port of a Service, + generated by `NXM_OF_TCP_DST[]` in table [ServiceLB], flow 8. + - Action `set_field:0x50/0xffff->reg4` is to set `EndpointPortField`, indicating that the packet is to do Endpoint + selection. + - Action `set_field:0/0x4000000->reg4` is to set bit 26 of `NXM_NX_REG4` to 0b0, which indicates that the cached + Endpoint is on remote Node. If the cached Endpoint is on local Node, bit 26 of `NXM_NX_REG4` should be 0b1. 
It is + generated by `load:NXM_NX_REG4[26]->NXM_NX_REG4[26]` in table [ServiceLB], flow 8. + - Action `set_field:0xa0a0007->reg3` is to load the selected Endpoint IP to `EndpointIPField`. It is generated by + `load:NXM_NX_REG3[]->NXM_NX_REG3[]` in table [ServiceLB], flow 8. + - Action `set_field:0x20000/0x70000->reg4` is to load `EpSelectedRegMark`, which indicates that the packet has done + Endpoint selection. + - Action `set_field:0x200/0x200->reg0` is to load `RewriteMACRegMark`, which indicates that the destination and source + MACs of packets should be overwritten. +- Flow 2 is to match the first packet of connections destined to Services. + - Action `set_field:0x10000/0x70000->reg4` is to load `EpToSelectRegMark`, which indicates that the packet is to do + Endpoint selection. + +### ServiceLB + +This table is used to implement Service Endpoint selection. By default, only ClusterIP Service requests from Pods is +supported. NodePort, LoadBalancer and ClusterIP whose client is non-Pod requests are supported when `proxyAll` is enabled. -If you dump the flows for this table, you should see flows like the following: +If you dump the flows for this table, you may see the following: ```text -1. table=42, priority=200,tcp,reg3=0xc0a84d64,reg4=0x2192b/0x7ffff actions=ct(commit,table=45,zone=65520,nat(dst=192.168.77.100:6443),exec(load:0x21->NXM_NX_CT_MARK[])) -2. table=42, priority=200,tcp,reg3=0xc0a84d65,reg4=0x2286d/0x7ffff actions=ct(commit,table=45,zone=65520,nat(dst=192.168.77.101:10349),exec(load:0x21->NXM_NX_CT_MARK[])) -3. table=42, priority=200,tcp,reg3=0xa0a0004,reg4=0x20050/0x7ffff actions=ct(commit,table=45,zone=65520,nat(dst=10.10.0.4:80),exec(load:0x21->NXM_NX_CT_MARK[])) -4. table=42, priority=200,tcp,reg3=0xa0a0102,reg4=0x20050/0x7ffff actions=ct(commit,table=45,zone=65520,nat(dst=10.10.1.2:80),exec(load:0x21->NXM_NX_CT_MARK[])) -5. table=42, priority=200,udp,reg3=0xa0a0002,reg4=0x20035/0x7ffff actions=ct(commit,table=45,zone=65520,nat(dst=10.10.0.2:53),exec(load:0x21->NXM_NX_CT_MARK[])) -6. table=42, priority=190,reg4=0x20000/0x70000 actions=load:0x1->NXM_NX_REG4[16..18],resubmit(,41) -7. table=42, priority=0 actions=resubmit(,45) +1. table=ServiceLB, priority=200,tcp,reg4=0x10000/0x70000,nw_dst=10.101.255.29,tp_dst=80 actions=set_field:0x200/0x200->reg0,set_field:0x20000/0x70000->reg4,set_field:0x9->reg7,group:9 +2. table=ServiceLB, priority=200,tcp,reg4=0x10000/0x70000,nw_dst=10.105.31.235,tp_dst=80 actions=set_field:0x200/0x200->reg0,set_field:0x20000/0x70000->reg4,set_field:0xc->reg7,group:12 +3. table=ServiceLB, priority=200,tcp,reg4=0x10000/0x70000,nw_dst=10.96.76.15,tp_dst=80 actions=set_field:0x200/0x200->reg0,set_field:0x30000/0x70000->reg4,set_field:0xa->reg7,group:10 +4. table=ServiceLB, priority=190,tcp,reg4=0x30000/0x70000,nw_dst=10.96.76.15,tp_dst=80 actions=learn(table=SessionAffinity,hard_timeout=300,priority=200,delete_learned,cookie=0x203000000000a,\ + eth_type=0x800,nw_proto=6,NXM_OF_TCP_DST[],NXM_OF_IP_DST[],NXM_OF_IP_SRC[],load:NXM_NX_REG4[0..15]->NXM_NX_REG4[0..15],load:NXM_NX_REG4[26]->NXM_NX_REG4[26],load:NXM_NX_REG3[]->NXM_NX_REG3[],load:0x2->NXM_NX_REG4[16..18],load:0x1->NXM_NX_REG0[9]),\ + set_field:0x20000/0x70000->reg4,goto_table:EndpointDNAT +5. table=ServiceLB, priority=0 actions=goto_table:EndpointDNAT ``` -For flow 1-5, DNAT is performed with the IPv4 address stored in NXM_NX_REG3 and port number stored in -bits[0..15] in NXM_NX_REG4 by `ct commit` action. Note that, the match condition `reg4=0x2192b/0x7ffff` -is a union value. 
The value of bits [0..15] is port number. The value of bits [16..18] is 0b010, -which indicates that Service has done Endpoint selection. Service ct_mark `0x21` is also marked. +- Flows 1-2 match the first packet of connections destined to Services whose `service.spec.sessionAffinity` set to `None`. + - Match condition `tcp` matches the protocol of Service. This could also be `udp` or `sctp`. + - Match condition `reg4=0x10000/0x70000` is to match packets with `EpToSelectRegMark` (loaded in table [SessionAffinity], + flow 2), which indicates the corresponding Services should do Endpoint selection. + - Match condition `nw_dst=` is to match the IP of Service ClusterIP. + - Match condition `tp_dst=` is to match the port of Service ClusterIP. + - Action `set_field:0x200/0x200->reg0` is to load `RewriteMACRegMark`, which indicates that the destination and source + MACs of packets should be overwritten. This mark will be used in table [L3Forwarding]. + - Action `set_field:0x20000/0x70000->reg4` is to load `EpSelectedRegMark`, which indicates that Endpoint selection "is + performed". This mark will be used in table [EndpointDNAT]. Note that, Endpoint selection has not really been done + yet - it will be done by group action. The current action should have been done in target OVS group entry after + Endpoint selection. However, we set the bits here, for the purpose of supporting more Endpoints in an OVS group. + Please check PR [#2101](https://github.com/antrea-io/antrea/pull/2101) to learn more information. + - Action `set_field:0xa->reg7` loads the value of group ID to `ServiceGroupIDField`, which is used by the implementation + of NetworkPolicy. + - Action `group:10` sets the target OVS group. Note that, the target group needs to be created before the flow is created. +- Flow 3 matches the first packet of connection destined to Services whose `service.spec.sessionAffinity` set to `ClientIP`. + - Action `set_field:0x30000/0x70000->reg4` is to load `EpToLearnRegMark`, which Packet has done Service Endpoint + selection and the selected Endpoint needs to be cached. + - Other match conditions and actions are the same with flows 1-2. +- Flow 4 matches the packet previously matched by flow 3 (sending it to the related OVS group to do Endpoint selection + and resubmitting it back to this table). This flow will generate a learned flow in table [SessionAffinity] to match + the packets of subsequent connections of the same client IP, ensuring that the packets are forwarded to the same + Endpoint selected by the first time. + - Match condition `reg4=0x30000/0x70000` is to match packets with `EpToLearnRegMark` (loaded in table [ServiceLB], + flow 3), which indicates the corresponding Services should do Endpoint selection and cache the selection result. + - Action `learn` is to generate a learned flow in table [SessionAffinity]. + - Field `table=SessionAffinity` is the table where learned flow is generated. + - Field `hard_timeout=300` is the hard timeout of learned flow. + - Field `priority=200` is the priority of learned flow. + - Field `delete_learned` means that learned flow will be deleted after hard timeout. + - Field `cookie=0x203000000000a` is the cookie of learned flow. + - Field `eth_type=0x800` generates a match condition in learned flow to match packets with IPv4 protocol. + - Field `nw_proto=6` generates a match condition in learned flow to match the packets with TCP protocol. 
+ - Field `NXM_OF_TCP_DST[]` generates a match condition in learned flow to match the packets with the TCP destination + port (Service port) of the current packet. In learned flow, it could be like `tcp_dst=80`. This field could be + also `NXM_OF_UDP_DST[]` or `NXM_OF_SCTP_DST[]` if the protocol of the Service is UDP or SCTP. + - Field `NXM_OF_IP_DST[]` generates a match condition in learned flow to match the packets with the destination IP + (Service IP) of the current packet. In learned flow, it could be like `nw_dst=10.96.76.15`. + - Field `NXM_OF_IP_SRC[]` generates a match condition in the learned flow to match the packets with the source IP + (client IP) of the current packet. In learned flow, it could be like `nw_src=10.10.0.7`. + - Field `load:NXM_NX_REG4[0..15]->NXM_NX_REG4[0..15]` generates an action in learned flow, which loads the value of + `EndpointPortField` in current flow to the same bits of `NXM_NX_REG4` in learned flow. In learned flow, it could + be like `set_field:0x50/0xffff->reg4`. The generated action is used to cache the selected Endpoint port. + - Field `load:NXM_NX_REG4[26]->NXM_NX_REG4[26]` generates an action in learned flow, which loads bit 26 of + `NXM_NX_REG4` in current flow to the same bits of `NXM_NX_REG4` in learned flow. In learned flow, it could be like + `set_field:0/0x4000000->reg4` or `set_field:0x4000000/0x4000000->reg4`. The generated action is used to indicate + that the selected Endpoint is on local Node or not. + - Field `load:NXM_NX_REG3[]->NXM_NX_REG3[]` generates an action in the learned flow, which loads `EndpointIPField` + in the current packet to the same bits of `NXM_NX_REG3` in learned flow. In the learned flow, it could be like + `set_field:0xa0a0007->reg3`. The generated action is used to cache the selected Endpoint IP. + - Field `load:0x2->NXM_NX_REG4[16..18]` generates an action in learned flow, which loads `EpSelectedRegMark`. In the + learned flow, it should be `set_field:0x20000/0x70000->reg4`. The generated action is used to indicate that Endpoint + selection "is performed". + - Field `load:0x1->NXM_NX_REG0[9]` generates an action in the learned flow, which loads `RewriteMACRegMark`. In + learned flow, it should be `set_field:0x200/0x200->reg0`. The generated action is used to indicate that the + destination and source MACs of packets should be overwritten. + - Action `set_field:0x20000/0x70000->reg4` is to load `EpSelectedRegMark`, which indicates that Endpoint selection + "is performed". This mark will be used in table [EndpointDNAT]. + - Action `goto_table:EndpointDNAT` is to send the packet to table [EndpointDNAT] after generating learned flow. + - Other match conditions are the same with flows 1-2. +- Flow 4 is the default auto-generated flow. + +The Endpoint selection is performed in OVS groups. If you dump the groups, you may see the following: -If none of the flows described above are hit, flow 6 is used to forward packet back to table [ServiceLBTable] -to select Endpoint again. +```text +9. group_id=9,type=select,\ + bucket=bucket_id:0,weight:100,actions=set_field:0xa0a0007->reg3,set_field:0x50/0xffff->reg4,resubmit(,EndpointDNAT),\ + bucket=bucket_id:1,weight:100,actions=set_field:0xa0a0008->reg3,set_field:0x50/0xffff->reg4,resubmit(,EndpointDNAT) +10. group_id=10,type=select,\ + bucket=bucket_id:0,weight:100,actions=set_field:0xa0a0008->reg3,set_field:0x50/0xffff->reg4,resubmit(,ServiceLB),\ + bucket=bucket_id:1,weight:100,actions=set_field:0xa0a0007->reg3,set_field:0x50/0xffff->reg4,resubmit(,ServiceLB) +12. 
group_id=12,type=select,\ + bucket=bucket_id:0,weight:100,actions=set_field:0x4000/0x4000->reg0,resubmit(,EndpointDNAT) +``` -Flow 7 is used to match non-Service packet. +- Group 9 is the target of flow 6. It is used to select an Endpoint for Service whose `service.spec.sessionAffinity` set + to `None`. There are two buckets in this group. Every bucket has the same chance to be selected since they have the + same weight. + - Action `load:0xa0a0007->NXM_NX_REG3[]` is to load Endpoint IPv4 address `10.10.0.7` to `EndpointIPField`. + - Action `load:0x50->NXM_NX_REG4[0..15]` is to load Endpoint port number `80` to `EndpointPortField`. + - Action `resubmit(,EndpointDNAT)` resubmits packets to table [EndpointDNAT]. +- Group 10 is the target of flow 7. It is used to select an Endpoint for Service whose `service.spec.sessionAffinity` + set to `ClientIP` like group 9. + - Action `load:0xa0a0008->NXM_NX_REG3[]` is to load Endpoint IPv4 address `10.10.0.8` to `EndpointIPField`. + - Action `load:0x50->NXM_NX_REG4[0..15]` is to load Endpoint port number `80` to `EndpointPortField`. + - Action `resubmit(,ServiceLB)` resubmits packets back to table [ServiceLB]. Then the packet will be matched by + flow 8. +- Group 12 is the target of flow 6. The group has only a single bucket. + - Action `load:0x4000/0x4000->reg0` is to load `SvcNoEpRegMark`, which indicates that the Service has no Endpoint. + - Action `resubmit(,EndpointDNAT)` resubmits packets to table [EndpointDNAT]. -### AntreaPolicyEgressRuleTable (45) +### EndpointDNAT -For this table, you will need to keep in mind the ACNP -[specification](#antrea-native-policies-implementation) -that we are using. +The table implements DNAT for Service connection after Endpoint selection in table [ServiceLB] -This table is used to implement the egress rules across all Antrea-native policies, -except for policies that are created in the Baseline Tier. Antrea-native policies -created in the Baseline Tier will be enforced after K8s NetworkPolicies, and their -egress rules are installed in the [EgressDefaultTable] and [EgressRuleTable] -respectively, i.e. +If you dump the flows for this table, you may see the following:: ```text -Baseline Tier -> EgressDefaultTable(60) -K8s NetworkPolicy -> EgressRuleTable(50) -All other Tiers -> AntreaPolicyEgressRuleTable(45) +1. table=EndpointDNAT, priority=200,reg0=0x4000/0x4000 actions=controller(reason=no_match,id=62373,userdata=04) +2. table=EndpointDNAT, priority=200,tcp,reg3=0xa0a0007,reg4=0x20050/0x7ffff actions=ct(commit,table=AntreaPolicyEgressRule,zone=65520,nat(dst=10.10.0.7:80),exec(set_field:0x10/0x10->ct_mark,move:NXM_NX_REG0[0..3]->NXM_NX_CT_MARK[0..3])) +3. table=EndpointDNAT, priority=200,tcp,reg3=0xa0a0008,reg4=0x20050/0x7ffff actions=ct(commit,table=AntreaPolicyEgressRule,zone=65520,nat(dst=10.10.0.8:80),exec(set_field:0x10/0x10->ct_mark,move:NXM_NX_REG0[0..3]->NXM_NX_CT_MARK[0..3])) +4. table=EndpointDNAT, priority=190,reg4=0x20000/0x70000 actions=set_field:0x10000/0x70000->reg4,resubmit(,ServiceLB) +5. table=EndpointDNAT, priority=0 actions=goto_table:AntreaPolicyEgressRule ``` -Since the example ACNP resides in the Application tier, if you dump the flows for -table 45, you should see something like this: +- Flow 1 matches the first packet of connections destined to any Services that have no Endpoint. + - Match condition `reg0=0x4000/0x4000` is to match `SvcNoEpRegMark`, which indicates that the Service has no Endpoint. + The mark is loaded in OVS group, like group 12 mentioned above. 
+ - Action `controller(reason=no_match,id=62373,userdata=04)` forwards the packet to Antrea Agent to do further process. +- Flows 2-3 matches the first packet of connections destined to any Services that have selected the Endpoint whose IPv4 + address stored in `EndpointIPField` and port number stored in `EndpointPortField`. + - Match condition `reg4=0x20050/0x7ffff` is a union matching of `EndpointPortField` and `ServiceEPStateField`. The + value of `ServiceEPStateField` is 0b010 (`EpSelectedRegMark`), which indicates that Service has done Endpoint selection. + - Action `ct` performs DNAT and set some bits of ct mark. After this action, a new packet will be forwarded from the original + packet, with replacing the destination IP address and port with the Endpoint's IP address and port, to table + [AntreaPolicyEgressRule]. + - Field `commit` means to commit connection to the connection tracking module. + - Field `table=AntreaPolicyEgressRule` is the table where packets will be forwarded. + - Field `zone=65520` is to commit connection to `CtZone`. + - Field `nat(dst=:)` is to replace the destination IP and destination port (DNAT). + - Field `exec` sets some bits of ct mark. + - Action `set_field:0x10/0x10->ct_mark` is to load `ServiceCTMark`, which indicates that packet is from Service connections. + - Action `move:NXM_NX_REG0[0..3]->NXM_NX_CT_MARK[0..3]` is load value of `PktSourceField` to `ConnSourceCTMarkField`. +- Flow 4 is forward the packet which is not matched by flows 1-3 back to table [ServiceLB]. +- Flow 5 is the auto-generated flow to match the packet of non-Service connections. + +### AntreaPolicyEgressRule + +This table is used to implement the egress rules across all Antrea-native NetworkPolicies, except for NetworkPolicies +that are created in the Baseline Tier. Antrea-native NetworkPolicies created in the Baseline Tier will be enforced after +Kubernetes NetworkPolicies, and their egress rules are installed in table [EgressDefault] and [EgressRule] respectively, +i.e. ```text -1. table=45, priority=64990,ct_state=-new+est,ip actions=resubmit(,61) -2. table=45, priority=14000,conj_id=1,ip actions=load:0x1->NXM_NX_REG5[],ct(commit,table=61,zone=65520,exec(load:0x1->NXM_NX_CT_LABEL[32..63])) -3. table=45, priority=14000,ip,nw_src=10.10.1.6 actions=conjunction(1,1/3) -4. table=45, priority=14000,ip,nw_dst=10.10.1.8 actions=conjunction(1,2/3) -5. table=45, priority=14000,udp,tp_dst=53 actions=conjunction(1,3/3) -6. table=45, priority=0 actions=resubmit(,50) +Antrea-native NetworkPolicy Baseline Tier -> EgressDefault +Kubernetes NetworkPolicy -> EgressRule +Antrea-native NetworkPolicy other Tiers -> AntreaPolicyEgressRule ``` -Similar to [K8s NetworkPolicy implementation](#egressruletable-50), -AntreaPolicyEgressRuleTable also relies on the OVS built-in `conjunction` action to -implement policies efficiently. - -The above example flows read as follow: if the source IP address is in set -{10.10.1.6}, and the destination IP address is in the set {10.10.1.8}, and the -destination TCP port is in the set {53}, then use the `conjunction` action with -id 1, which stores the `conj_id` 1 in `ct_label[32..63]` for egress metrics collection -purposes, and forwards the packet to EgressMetricsTable, then [L3ForwardingTable]. -Otherwise, go to [EgressRuleTable] if no conjunctive flow above priority 0 is matched. -This corresponds to the case where the packet is not matched by any of the Antrea-native -policy egress rules in any tier (except for the "baseline" tier). 
- -If the `conjunction` action is matched, packets are "allowed" or "dropped" -based on the `action` field of the policy rule. If allowed, they follow a similar -path as described in the following [EgressRuleTable] section. - -Unlike the default of K8s NetworkPolicies, Antrea-native policies have no such -default rules. Hence, they are evaluated as-is, and there is no need for a -AntreaPolicyEgressDefaultTable. - -### EgressRuleTable (50) +Antrea-native NetworkPolicy relies on the OVS built-in `conjunction` action to implement policies efficiently. This +enables us to do a conjunctive match across multiple dimensions (source IP, destination IP, port) efficiently without +"exploding" the number of flows. By definition of a conjunctive match, we have at least 2 dimensions. For our use-case +we have at most 3 dimensions. -For this table, you will need to keep mind the Network Policy -[specification](#network-policy-implementation) that we are using. We have 2 -Pods running on the same Node, with IP addresses 10.10.1.2 to 10.10.1.3. They -are allowed to talk to each other using TCP on port 80, but nothing else. +The only requirements on `conj_id` is for it to be a unique 32-bit integer within the table. At the moment we use a +single custom allocator, which is common to all tables that can have NetworkPolicy flows installed ([AntreaPolicyEgressRule], +[EgressRule], [EgressDefault], [AntreaPolicyIngressRule] , [IngressRule] and [EgressDefault]). -This table is used to implement the egress rules across all Network Policies. If -you dump the flows for this table, you should see something like this: +For this table, you will need to keep in mind the Antrea-native NetworkPolicy [specification] +(#antrea-native-networkpolicy-implementation) that we are using. Since the sample Antrea-native NetworkPolicy resides in +the Application tier. If you dump the flows for this table, you may see the following: ```text -1. table=50, priority=210,ct_state=-new+est,ip actions=goto_table:70 -2. table=50, priority=200,ip,nw_src=10.10.1.2 actions=conjunction(2,1/3) -3. table=50, priority=200,ip,nw_src=10.10.1.3 actions=conjunction(2,1/3) -4. table=50, priority=200,ip,nw_dst=10.10.1.2 actions=conjunction(2,2/3) -5. table=50, priority=200,ip,nw_dst=10.10.1.3 actions=conjunction(2,2/3) -6. table=50, priority=200,tcp,tp_dst=80 actions=conjunction(2,3/3) -7. table=50, priority=190,conj_id=2,ip actions=load:0x2->NXM_NX_REG5[],ct(commit,table=61,zone=65520,exec(load:0x2->NXM_NX_CT_LABEL[32..63])) -8. table=50, priority=0 actions=goto_table:60 +1. table=AntreaPolicyEgressRule, priority=64990,ct_state=-new+est,ip actions=goto_table:EgressMetric +2. table=AntreaPolicyEgressRule, priority=64990,ct_state=-new+rel,ip actions=goto_table:EgressMetric +3. table=AntreaPolicyEgressRule, priority=14000,ip,nw_src=10.10.0.7 actions=conjunction(2,1/3) +4. table=AntreaPolicyEgressRule, priority=14000,ip,nw_src=10.10.0.8 actions=conjunction(2,1/3) +5. table=AntreaPolicyEgressRule, priority=14000,ip,nw_dst=10.10.0.7 actions=conjunction(2,2/3) +6. table=AntreaPolicyEgressRule, priority=14000,ip,nw_dst=10.10.0.8 actions=conjunction(2,2/3) +7. table=AntreaPolicyEgressRule, priority=14000,tcp,tp_dst=80 actions=conjunction(2,3/3) +8. table=AntreaPolicyEgressRule, priority=14000,conj_id=2,ip actions=set_field:0x2->reg5,ct(commit,table=EgressMetric,zone=65520,exec(set_field:0x200000000/0xffffffff00000000->ct_label)) +9. table=AntreaPolicyEgressRule, priority=13999,ip,nw_src=10.10.0.7 actions=conjunction(3,1/2) +10. 
table=AntreaPolicyEgressRule, priority=13999,ip,nw_src=10.10.0.8 actions=conjunction(3,1/2) +11. table=AntreaPolicyEgressRule, priority=13999,ip actions=conjunction(3,2/2) +12. table=AntreaPolicyEgressRule, priority=13999,conj_id=3 actions=set_field:0x3->reg3,set_field:0x400/0x400->reg0,set_field:0x4000000/0xfe000000->reg0,set_field:0xd/0xff->reg2,group:2 +13. table=AntreaPolicyEgressRule, priority=0 actions=goto_table:EgressRule ``` -Notice how we use the OVS built-in `conjunction` action to implement policies -efficiently. This enables us to do a conjunctive match across multiple -dimensions (source IP, destination IP, port) efficiently without "exploding" the -number of flows. By definition of a conjunctive match, we have at least 2 -dimensions. For our use-case we have at most 3 dimensions. - -The only requirements on `conj_id` is for it to be a unique 32-bit integer -within the table. At the moment we use a single custom allocator, which is -common to all tables that can have NetworkPolicy flows installed (45, 50, -60, 85, 90 and 100). This is why `conj_id` is set to 2 in the above example -(1 was allocated for the egress rule of our Antrea-native NetworkPolicy example -in the previous section). - -The above example flows read as follow: if the source IP address is in set -{10.10.1.2, 10.10.1.3}, and the destination IP address is in the set {10.10.1.2, -10.10.1.3}, and the destination TCP port is in the set {80}, then use the -`conjunction` action with id 2, which goes to [EgressMetricsTable], and then -[L3ForwardingTable]. Otherwise, packet goes to [EgressDefaultTable]. - -If the Network Policy specification includes exceptions (`except` field), then -the table will include multiple flows with conjunctive match, corresponding to -each CIDR that is present in `from` or `to` fields, but not in `except` field. -Network Policy implementation details are not covered in this document. - -If the `conjunction` action is matched, packets are "allowed" and forwarded -directly to [L3ForwardingTable]. Other packets go to [EgressDefaultTable]. If a -connection is established - as a reminder all connections are committed in -[ConntrackCommitTable] - its packets go straight to [L3ForwardingTable], with no -other match required (see flow 1 above, which has the highest priority). In -particular, this ensures that reply traffic is never dropped because of a -Network Policy rule. However, this also means that ongoing connections are not -affected if the K8s Network Policies are updated. - -One thing to keep in mind is that for Service traffic, these rules are applied -after the packets have gone through the local gateway and through kube-proxy. At -this point the ingress port is no longer the Pod port, but the local gateway -port. Therefore we cannot use the port as the match condition to identify if the -Pod has been applied a Network Policy - which is what we do for the -[IngressRuleTable] -, but instead have to use the source IP address. - -### EgressDefaultTable (60) - -This table complements [EgressRuleTable] for Network Policy egress rule -implementation. In K8s, when a Network Policy is applied to a set of Pods, the -default behavior for these Pods become "deny" (it becomes an [isolated Pod]( -https://kubernetes.io/docs/concepts/services-networking/network-policies/#isolated-and-non-isolated-pods)). -This table is in charge of dropping traffic originating from Pods to which a Network -Policy (with an egress rule) is applied, and which did not match any of the -allowlist rules. 
-
-Accordingly, based on our Network Policy example, we would expect to see flows
-to drop traffic originating from our 2 Pods (10.10.1.2 and 10.10.1.3), which is
-confirmed by dumping the flows:
+- Flows 1-2 match packets of connections with states `ESTABLISHED` or `RELATED` but not `NEW`, forwarding them to table
+  [EgressMetric] to skip the egress-related flows.
+- Flows 3-4 match packets sourced from `10.10.0.7` or `10.10.0.8`. Action `conjunction(2,1/3)` signifies the first
+  dimension of the three dimensions for `conj_id` 2.
+- Flows 5-6 match packets destined to `10.10.0.7` or `10.10.0.8`. Action `conjunction(2,2/3)` signifies the second
+  dimension of the three dimensions for `conj_id` 2.
+- Flow 7 matches packets destined to TCP port `80`. Action `conjunction(2,3/3)` signifies the third dimension of the
+  three dimensions for `conj_id` 2.
+- Flow 8 matches packets meeting all three dimensions for `conj_id` 2 (the rule's action is `Allow`).
+  - Action `set_field:0x2->reg5` is to load `conj_id` to `APConjIDField`, which is used by feature Traceflow.
+  - Action `ct` is to persist `conj_id` to the ct label in `CtZone`.
+    - Field `commit` means to commit the connection to the connection tracking module.
+    - Field `table=EgressMetric` is the table where packets will be forwarded.
+    - Field `zone=65520` is to commit the connection to `CtZone`.
+    - Field `exec` sets some bits of the ct label.
+      - Action `set_field:0x200000000/0xffffffff00000000->ct_label` is to load the current `conj_id` value to `EgressRuleCTLabel`
+        for egress metrics collection purposes.
+- Flows 9-10 match packets sourced from `10.10.0.7` or `10.10.0.8`. Action `conjunction(3,1/2)` signifies the first
+  dimension of the two dimensions for `conj_id` 3.
+- Flow 11 matches IPv4 packets. Action `conjunction(3,2/2)` signifies the second dimension of the two dimensions for
+  `conj_id` 3.
+- Flow 12 matches packets meeting both dimensions for `conj_id` 3 (the rule's action is `Reject`).
+  - Action `set_field:0x3->reg3` is to load the `conj_id` value to `APConjIDField`, which is used by feature Traceflow.
+  - Action `set_field:0x400/0x400->reg0` is to load `APDenyRegMark`, indicating that the packet was denied (Drop / Reject),
+    so that the Kubernetes default drop will not be recorded in this register.
+  - Action `set_field:0x4000000/0xfe000000->reg0` is to load 0b10 to `PacketInOperationField`, indicating that the packet
+    should be rejected.
+  - Action `set_field:0xd/0xff->reg2` is to load the current table ID to `PacketInTableField` for logging.
+  - Action `group:2` is to send the packet to group 2 for logging.
+- Flow 13 is the auto-generated table-miss flow, forwarding packets that are not matched by the flows above to table [EgressRule].
+
+If a connection is established, its packets go straight to table [EgressMetric], with no other match required (see flows
+1-2 above, which have the highest priority). In particular, this ensures that reply traffic is never dropped because of an
+Antrea-native NetworkPolicy or Kubernetes NetworkPolicy rule. However, this also means that ongoing connections are not
+affected if the NetworkPolicy is updated.
+
+Unlike Kubernetes NetworkPolicies, which isolate the Pods they select by default, Antrea-native NetworkPolicies have no
+such default rules. Hence, they are evaluated as-is, and there is no need for a table [AntreaPolicyEgressDefault].
+
+### EgressRule
+
+For this table, you will need to keep in mind the Kubernetes NetworkPolicy [specification](#kubernetes-networkpolicy-implementation)
+that we are using. We have 2 Pods running on the same Node, with IP addresses 10.10.0.7 and 10.10.0.8.
They are allowed +to talk to each other using TCP on port 80, but nothing else. + +This table is used to implement the egress rules across all Kubernetes NetworkPolicies. If you dump the flows for this table, +you may see the following: ```text -1. table=60, priority=200,ip,nw_src=10.10.1.2 actions=drop -2. table=60, priority=200,ip,nw_src=10.10.1.3 actions=drop -3. table=60, priority=0 actions=goto_table:61 +1. table=EgressRule, priority=200,ip,nw_src=10.10.0.8 actions=conjunction(5,1/3) +2. table=EgressRule, priority=200,ip,nw_src=10.10.0.7 actions=conjunction(5,1/3) +3. table=EgressRule, priority=200,ip,nw_dst=10.10.0.8 actions=conjunction(5,2/3) +4. table=EgressRule, priority=200,ip,nw_dst=10.10.0.7 actions=conjunction(5,2/3) +5. table=EgressRule, priority=200,tcp,tp_dst=80 actions=conjunction(5,3/3) +6. table=EgressRule, priority=190,conj_id=5,ip actions=set_field:0x5->reg5,ct(commit,table=EgressMetric,zone=65520,exec(set_field:0x500000000/0xffffffff00000000->ct_label)) +7. table=EgressRule, priority=0 actions=goto_table:EgressDefaultRule ``` -This table is also used to implement Antrea-native policy egress rules that are -created in the Baseline Tier. Since the Baseline Tier is meant to be enforced -after K8s NetworkPolicies, the corresponding flows will be created at a lower -priority than K8s default drop flows. For example, a baseline rule to drop -egress traffic to 10.0.10.0/24 for a Namespace will look like the following: +- Flows 1-2 matches packets sourced from `10.10.0.7` or `10.10.0.8`. Action `conjunction(5,1/3)` signifies that the first + dimension of all three dimensions for `conj_id` 5. +- Flows 3-4 matches packets destined to `10.10.0.7` or `10.10.0.8` . Action `conjunction(5,2/3)` signifies that the second + dimension of all three dimensions for `conj_id` 5. +- Flow 5 matches packets destined to TCP port `80`. Action `conjunction(5,3/3)` signifies that the third dimension of all + three dimensions for `conj_id` 5. +- Flow 6 matches packets meeting all three dimensions for `conj_id` 5. + - Action `set_field:0x5->reg5` is to load `conj_id` to `APConjIDField`, which is used by feature Traceflow. + - Action `ct` is to persist `conj_id` to ct label in `CtZone`. + - Field `commit` means to commit connection to the connection tracking module. + - Field `table=EgressMetric` is the table where packets will be forwarded. + - Field `zone=65520` is to commit connection to `CtZone`. + - Field `exec` sets some bits of ct label. + - Action `set_field:0x500000000/0xffffffff00000000->ct_label` is to load current `conj_id` value to `EgressRuleCTLabel` + for egress metrics collection purposes. +- Flow 7 is the auto-generated flow. + +### EgressDefault + +This table complements table [EgressRule] for Kubernetes NetworkPolicy egress rule implementation. In Kubernetes, when a +NetworkPolicy is applied to a set of Pods, the default behavior for these Pods become "deny" (it becomes an [isolated Pod]( +https://kubernetes.io/docs/concepts/services-networking/network-policies/#isolated-and-non-isolated-pods)). +This table is in charge of dropping traffic originating from Pods to which a NetworkPolicy (with an egress rule) is +applied, and which did not match any of the allow list rules. + +Accordingly, based on our NetworkPolicy example, we would expect to see flows to drop traffic originating from our +2 Pods (10.10.0.7 and 10.10.0.8), which is confirmed by dumping the flows: ```text -1. table=60, priority=80,ip,nw_src=10.10.1.11 actions=conjunction(5,1/2) -2. 
table=60, priority=80,ip,nw_src=10.10.1.10 actions=conjunction(5,1/2) -3. table=60, priority=80,ip,nw_dst=10.0.10.0/24 actions=conjunction(5,2) -4. table=60, priority=80,conj_id=5,ip actions=load:0x3->NXM_NX_REG5[],load:0x1->NXM_NX_REG0[20],resubmit(,61) +1. table=EgressDefaultRule, priority=200,ip,nw_src=10.10.0.8 actions=drop +2. table=EgressDefaultRule, priority=200,ip,nw_src=10.10.0.7 actions=drop +3. table=EgressDefaultRule, priority=0 actions=goto_table:EgressMetric ``` -The table-miss flow entry, which is used for non-isolated Pods, forwards -traffic to the next table EgressMetricsTable, then ([L3ForwardingTable]). +This table is also used to implement Antrea-native NetworkPolicy egress rules that are created in the Baseline Tier. +Since the Baseline Tier is meant to be enforced after Kubernetes NetworkPolicies, the corresponding flows will be created +at a lower priority than Kubernetes NetworkPolicy default drop flows. These flows are like flows 3-12 in table +[AntreaPolicyEgressRule]. -### L3ForwardingTable (70) +### EgressMetric -This is the L3 routing table. It implements the following functionality: +This table is used to collect egress metrics for Antrea-native NetworkPolicies and Kubernetes NetworkPolicies. -* Tunnelled traffic coming-in from a peer Node and destined to a local Pod is - directly forwarded to the Pod. This requires setting the source MAC to the MAC - of the local gateway interface and setting the destination MAC to the Pod's - MAC address. Then the packets will go to [L3DecTTLTable] for decrementing - the IP TTL value. Such packets can be identified by bit 19 of the NXM_NX_REG0 - register (which was set to 1 in the [ClassifierTable]) and the destination IP - address (which should match the IP address of a local Pod). We therefore - install one flow for each Pod created locally on the Node. For example: +If you dump the flows for this table, you may see the following: ```text -table=70, priority=200,ip,reg0=0x80000/0x80000,nw_dst=10.10.0.2 actions=mod_dl_src:e2:e5:a4:9b:1c:b1,mod_dl_dst:12:9e:a6:47:d0:70,goto_table:72 +1. table=EgressMetric, priority=200,ct_state=+new,ct_label=0x200000000/0xffffffff00000000,ip actions=goto_table:L3Forwarding +2. table=EgressMetric, priority=200,ct_state=-new,ct_label=0x200000000/0xffffffff00000000,ip actions=goto_table:L3Forwarding +3. table=EgressMetric, priority=200,ct_state=+new,ct_label=0x500000000/0xffffffff00000000,ip actions=goto_table:L3Forwarding +4. table=EgressMetric, priority=200,ct_state=-new,ct_label=0x500000000/0xffffffff00000000,ip actions=goto_table:L3Forwarding +5. table=EgressMetric, priority=200,reg0=0x400/0x400,reg3=0x3 actions=drop +6. table=EgressMetric, priority=0 actions=goto_table:L3Forwarding ``` -* All tunnelled traffic destined to the local gateway (i.e. for which the - destination IP matches the local gateway's IP) is forwarded to the gateway - port by rewriting the destination MAC (from the Global Virtual MAC to the - local gateway's MAC). - -```text -table=70, priority=200,ip,reg0=0x80000/0x80000,nw_dst=10.10.0.1 actions=mod_dl_dst:e2:e5:a4:9b:1c:b1,goto_table:80 -``` +- Flows 1-2 matches packets from the sample Antrea-native NetworkPolicy egress rule with action `Allow`. + - Match conditions `ct_state=+new` or `ct_state=-new` are to match packets with state `NEW` or not `NEW`. + - Match condition `ct_label=0x200000000/0xffffffff00000000` is to match `EgressRuleCTLabel` with value 2, which is + loaded in table [AntreaPolicyEgressRule], flow 8. 
+ - Action `goto_table:L3Forwarding` is to forward packets to table [L3Forwarding] since the action of the sample rule + is `Allow`. +- Flows 3-4 matches packets from the sample Kubernetes NetworkPolicy egress rule with action `Allow`. + - Match conditions `ct_state=+new` or `ct_state=-new` are to match packets with state `NEW` or not `NEW`. + - Match condition `ct_label=0x500000000/0xffffffff00000000` is to match `EgressRuleCTLabel` with value 5, which is + loaded in table [EgressRule], flow 6. + - Action `goto_table:L3Forwarding` is the same as flows 1-2. +- Flow 5 matches packets of the sample Antrea-native NetworkPolicy egress rule with action `Drop` or `Reject`, then + drop them. + - Match condition `reg0=0x400/0x400` is to match `APDenyRegMark`, which is loaded in table [AntreaPolicyEgressRule], + flow 12, indicating that the packets should be denied. + - Match condition `reg3=0x3` is to match `APConjIDField`, which is loaded in table [AntreaPolicyEgressRule], flow 12. + - Action `drop` is to drop packets. +- Flow 6 is the auto-generated flow. + +### L3Forwarding + +This is the L3 routing table. -* All reply traffic of connections initiated through the gateway port, i.e. for - which the first packet of the connection (SYN packet for TCP) was received - through the gateway. Such packets can be identified by the packet's direction - in `ct_state` and the `ct_mark` value `0x20` which is committed in - [ConntrackCommitTable] when the first packet of the connection was handled. - A flow will overwrite the destination MAC to the local gateway MAC to ensure - that they get forwarded through the gateway port. This is required to handle - the following cases: - - reply traffic for connections from a local Pod to a ClusterIP Service, which - are handled by kube-proxy and go through DNAT. In this case the destination - IP address of the reply traffic is the Pod which initiated the connection to - the Service (no SNAT by kube-proxy). We need to make sure that these packets - are sent back through the gateway so that the source IP can be rewritten to - the ClusterIP ("undo" DNAT). If we do not use connection tracking and do not - rewrite the destination MAC, reply traffic from the backend will go directly - to the originating Pod without going first through the gateway and - kube-proxy. This means that the reply traffic will arrive at the originating - Pod with the incorrect source IP (it will be set to the backend's IP instead - of the Service IP). - - when hair-pinning is involved, i.e. connections between 2 local Pods, for - which NAT is performed. One example is a Pod accessing a NodePort Service - for which `externalTrafficPolicy` is set to `Local` using the local Node's - IP address, as there will be no SNAT for such traffic. Another example could - be `hostPort` support, depending on how the feature is implemented. +If you dump the flows for this table, you may see the following: ```text -table=70, priority=210,ct_state=+rpl+trk,ct_mark=0x20,ip actions=mod_dl_dst:e2:e5:a4:9b:1c:b1,goto_table:80 +1. table=L3Forwarding, priority=210,ip,nw_dst=10.10.0.1 actions=set_field:ba:5e:d1:55:aa:c0->eth_dst,set_field:0x20/0xf0->reg0,goto_table:L3DecTTL +2. table=L3Forwarding, priority=210,ct_state=+rpl+trk,ct_mark=0x2/0xf,ip actions=set_field:ba:5e:d1:55:aa:c0->eth_dst,set_field:0x20/0xf0->reg0,goto_table:L3DecTTL +3. table=L3Forwarding, priority=200,ip,reg0=0/0x200,nw_dst=10.10.0.0/24 actions=goto_table:L2ForwardingCalc +4. 
table=L3Forwarding, priority=200,ip,nw_dst=10.10.1.0/24 actions=set_field:ba:5e:d1:55:aa:c0->eth_src,set_field:aa:bb:cc:dd:ee:ff->eth_dst,set_field:192.168.77.103->tun_dst,set_field:0x10/0xf0->reg0,goto_table:L3DecTTL +5. table=L3Forwarding, priority=200,ip,reg0=0x200/0x200,nw_dst=10.10.0.7 actions=set_field:ba:5e:d1:55:aa:c0->eth_src,set_field:2e:ba:06:b2:44:91->eth_dst,goto_table:L3DecTTL +6. table=L3Forwarding, priority=200,ip,reg0=0x200/0x200,nw_dst=10.10.0.8 actions=set_field:ba:5e:d1:55:aa:c0->eth_src,set_field:c2:5a:5e:50:95:9b->eth_dst,goto_table:L3DecTTL +7. table=L3Forwarding, priority=190,ct_mark=0x10/0x10,reg0=0x202/0x20f actions=set_field:ba:5e:d1:55:aa:c0->eth_dst,set_field:0x20/0xf0->reg0,goto_table:L3DecTTL +8. table=L3Forwarding, priority=190,ct_state=-rpl+trk,ip,reg0=0x3/0xf,reg4=0/0x100000 actions=goto_table:EgressMark +9. table=L3Forwarding, priority=190,ct_state=-rpl+trk,ip,reg0=0x1/0xf actions=set_field:ba:5e:d1:55:aa:c0->eth_dst,goto_table:EgressMark +10. table=L3Forwarding, priority=0 actions=set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc ``` -* All traffic destined to a remote Pod is forwarded through the appropriate - tunnel. This means that we install one flow for each peer Node, each one - matching the destination IP address of the packet against the Pod subnet for - the Node. In case of a match the source MAC is set to the local gateway MAC, - the destination MAC is set to the Global Virtual MAC and we set the OF - `tun_dst` field to the appropriate value (i.e. the IP address of the remote - gateway). Traffic then goes to [L3DecTTLTable]. - For a given peer Node, the flow may look like this: +- Flow 1 matches packets destined to local Antrea gateway IP. + - Match condition `nw_dst=10.10.0.1` is to match packets destined to local gateway IP. + - Action `set_field:ba:5e:d1:55:aa:c0->eth_dst` is to rewrite the destination MAC address of packets to the local + gateway MAC address. Note that, this action is not necessary for Pod-to-gateway request packets because the + destination MAC address is already the local gateway MAC address. However, it is used by feature like AntreaIPAM which + is not enabled by default. + - Action `set_field:0x20/0xf0->reg0` is to load `ToGatewayRegMark`, which indicates that the output port is local + Antrea gateway. + - Action `goto_table:L3DecTTL` is to forward packets to table [L3DecTTL] to decrease TTL value. +- Flow 2 matches reply packets from connections initiated through the local Antrea gateway, i.e. for which the first + packet of the connection (SYN packet for TCP) was received through the gateway, to ensure that reply packets can be + forwarded back to the local gateway, guaranteeing the availability of the connection. This is required to handle the + following cases: + - Reply traffic for connections from a local Pod to a ClusterIP Service, which are handled by kube-proxy and go through + DNAT. In this case the destination IP address of the reply traffic is the Pod which initiated the connection to the + Service (no SNAT by kube-proxy). We need to make sure that these packets are sent back through the gateway so that + the source IP can be rewritten to the ClusterIP ("undo" DNAT). If we do not use connection tracking and do not rewrite + the destination MAC, reply traffic from the backend will go directly to the originating Pod without going first + through the gateway and kube-proxy. 
This means that the reply traffic will arrive at the originating Pod with the + incorrect source IP (it will be set to the backend's IP instead of the Service IP). + - when hair-pinning is involved, i.e. connections between 2 local Pods, for which NAT is performed. One example is a + Pod accessing a NodePort Service for which externalTrafficPolicy is set to Local using the local Node's IP address, + as there will be no SNAT for such traffic. Another example could be hostPort support, depending on how the feature + is implemented. + For match conditions and actions: + - Match condition `ct_state=+rpl+trk` is to match reply "tracked" packets. + - Match condition `ct_mark=0x2/0xf` is to match `FromGatewayCTMark`, indicating that packets are from connections + originated from the local Antrea gateway port. + - Actions `set_field:ba:5e:d1:55:aa:c0->eth_dst`, `set_field:0x20/0xf0->reg0` and `goto_table:L3DecTTL` are the same + with flow 1. +- Flow 3 matches packets from intra-Node connections (not including Service connections). + - Match condition `reg0=0/0x200` is to match `NotRewriteMACRegMark`, indicating that the destination and source MACs + of packets should not be overwritten. For Service or inter-Node connections, `RewriteMACRegMark` is loaded, like + flows 4-6. + - Match condition `nw_dst=10.10.0.0/24` is to match packets whose destination IP is in local Pod CIDR. + - Action `goto_table:L2ForwardingCalc` is to forward packets to table [L2ForwardingCalc], rather than table [L3DecTTL], + since it is no need to decrease TTL value of packets from connections among local Pods. +- Flow 4 matches packets destined to remote Pod CIDR. This means that we install one flow for each peer Node, each one + matching the destination IP address of the packet against the Pod subnet for the Node. + - Match condition `nw_dst=10.10.1.0/24` is to match packets destined to remote Pod CIDR. + - Action `set_field:ba:5e:d1:55:aa:c0->eth_src` is to rewrite the source MAC address of packets to the local gateway + MAC address. + - Action `set_field:aa:bb:cc:dd:ee:ff->eth_dst` is to rewrite the destination MAC address of packets to the Global + Virtual MAC address. + - Action `set_field:192.168.77.103->tun_dst` is to set destination IP address (i.e. the IP address of the remote gateway) + of tunnel to remote Node. + - Action `set_field:0x10/0xf0->reg0` is to load `ToRemoteRegMark`, which indicates that the output port is tunnel. +- Flow 5-6 matches packets destined to local Pods. The packets could be from Service or inter-Node connections. + - Match condition `reg0=0x200/0x200` is to match `RewriteMACRegMark`, indicating that the destination and source MACs + of packets should be overwritten. This is to match Service or inter-Node packets from connections. + - Match condition `nw_dst=` is to match packets whose destination IP is the IP address of a local Pod. + - Action `set_field:ba:5e:d1:55:aa:c0->eth_src` is to rewrite the source MAC address of packets to the local gateway + MAC address. + - Action `set_field:->eth_dst` is to rewrite the destination MAC address of packets to the local Pod + MAC address. + - Action `goto_table:L3DecTTL` is the same as flow 1. +- Flow 7 is to match packets of Service connections sourced from local Antrea gateway and destined to external network. + - Match condition `ct_mark=0x10/0x10` is to match `ServiceCTMark`, indicating that packets are from Service connections. 
+ - Match condition `reg0=0x202/0x20f` is to match `RewriteMACRegMark` and `FromGatewayRegMark`, indicating that packets + are from Service connections originated from local Antrea gateway. + - Action `set_field:ba:5e:d1:55:aa:c0->eth_dst` is to rewrite the destination MAC address of packets to the local + gateway MAC address. + - Action `set_field:0x20/0xf0->reg0` is to load `ToGatewayRegMark`, which indicates that the output port is local + Antrea gateway. + - Action `goto_table:L3DecTTL` is the same as flow 1. +- Flow 8 is to match packets that are from local Pods and destined to external network, and send them to table [EgressMark], + where SNAT IPs are looked up for the packets. + - Match condition `ct_state=-rpl+trk` is to match request "tracked" packets. + - Match condition `reg0=0x3/0xf` is to match `FromLocalRegMark`, indicating that packets are from connections + originated from local Pods. + - Match condition `reg4=0/0x100000` is to match `NotAntreaFlexibleIPAMRegMark` since Egress can only be applied to + non-AntreaIPAM Pods. For packets from AntreaIPAM Pods, `AntreaFlexibleIPAMRegMark` will be loaded. + - Action `goto_table:EgressMark` is to forward packets to table [EgressMark] to lookup SNAT IPs. +- Flow 9 is to match packets that are from remote Pods and destined to external network, which needs to perform SNAT for + feature Egress, and send them to table [EgressMark], where SNAT IPs are looked up for the packets. + - Match condition `ct_state=-rpl+trk` is the same as flow 10. + - Match condition `reg0=0x1/0xf` is to match `FromTunnelRegMark`, indicating that packets are from remote Pods through + tunnel. + - Match condition `set_field:ba:5e:d1:55:aa:c0->eth_dst` is to rewrite the destination MAC address of packets to the + local gateway MAC address. This is because the packets are from remote Pods through tunnel, and the destination MAC + is the Global Virtual MAC address. + - Action `goto_table:EgressMark` is the same as flow 10. +- Flow 10 is the table-miss flow entry to match packets which are from local Pods and destined to external network. + - Action `set_field:0x20/0xf0->reg0` is the same as flow 1 or 2. + +### EgressMark + +This table is created only when the Egress feature is enabled. It includes flows to implement Egresses and select the +right SNAT IPs for egress traffic from Pods to external network. -```text -table=70, priority=200,ip,nw_dst=10.10.1.0/24 actions=mod_dl_src:e2:e5:a4:9b:1c:b1,mod_dl_dst:aa:bb:cc:dd:ee:ff,load:0x1->NXM_NX_REG1[],set_field:0x10000/0x10000->reg0,load:0xc0a80102->NXM_NX_TUN_IPV4_DST[],goto_table:72 -``` - -If none of the flows described above are hit, traffic goes directly to -[L2ForwardingCalcTable]. This is the case for external traffic, whose -destination is outside the cluster (such traffic has already been -forwarded to the local gateway by the local source Pod, and only L2 switching -is required), as well as for local Pod-to-Pod traffic. +If you dump the flows for this table, you may see the following: ```text -table=70, priority=0 actions=goto_table:80 + 1. table=EgressMark, priority=210,ip,nw_dst=192.168.77.102 actions=set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc + 2. table=EgressMark, priority=210,ip,nw_dst=192.168.77.103 actions=set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc + 3. table=EgressMark, priority=210,ip,nw_dst=10.96.0.0/12 actions=set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc + 4. 
table=EgressMark, priority=200,ip,in_port="nginx-d9-dfc134" actions=set_field:ba:5e:d1:55:aa:c0->eth_src,set_field:aa:bb:cc:dd:ee:ff->eth_dst,set_field:192.168.77.103->tun_dst,set_field:0x10/0xf0->reg0,set_field:0x80000/0x80000->reg0,goto_table:L2ForwardingCalc + 5. table=EgressMark, priority=200,ct_state=+new+trk,ip,tun_dst=192.168.77.102 actions=set_field:0x1/0xff->pkt_mark,set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc + 6. table=EgressMark, priority=200,ct_state=+new+trk,ip,in_port="nginx-d9-b93cc5" actions=set_field:0x1/0xff->pkt_mark,set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc + 7. table=EgressMark, priority=190,ct_state=+new+trk,ip,reg0=0x1/0xf actions=drop + 8. table=EgressMark, priority=0 actions=set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc ``` -When the Egress feature is enabled, extra flows will be added to -[L3ForwardingTable], which send the egress traffic from Pods to external network -to [SNATTable]. The following two flows match traffic to local Pods and traffic -to the local Node IP respectively, and keep them in the normal forwarding path -(to [L2ForwardingCalcTable]), so they will not be sent to [SNATTable]: - -```text -table=70, priority=200,ip,reg0=0/0x80000,nw_dst=10.10.1.0/24 actions=goto_table:80 -table=70, priority=200,ip,reg0=0x2/0xffff,nw_dst=192.168.1.1 actions=goto_table:80 -``` +- Flows 1-2 match packets which are from local Pods and destined to the transport IP of remote Nodes to skip Egress SNAT. + - Match condition `nw_dst=` is to match packets destined to transport IP of remote Nodes. + - Action `set_field:0x20/0xf0->reg0` is to load `ToGatewayRegMark`, which indicates that the output port is local + Antrea gateway. + - Action `goto_table:L2ForwardingCalc` is to forward packets to table [L2ForwardingCalc]. +- Flow 3 match packets which are from local Pods and destined to Services to skip Egress SNAT. + - Match condition `nw_dst=10.96.0.0/12` is to match packets destined to Service CIDR. + - Actions `set_field:0x20/0xf0->reg0` and `goto_table:L2ForwardingCalc` are the same as flows 1-2. +- Flow 4 match packets which are from local Pods selected by an Egress and its SNAT IP of the Egress is configured on + remote Node. + - Match condition `nginx-d9-dfc134` is to match packets from the local Pod. + - Action `set_field:ba:5e:d1:55:aa:c0->eth_src` is to rewrite the source MAC address of packets to the local gateway + MAC address. + - Action `set_field:aa:bb:cc:dd:ee:ff->eth_dst` is to rewrite the destination MAC address of packets to the Global + Virtual MAC address. + - Action `set_field:192.168.77.203->tun_dst` is to set destination IP address (transport IP of remote Nodes) of tunnel + to remote Node. + - Action `set_field:0x10/0xf0->reg0` is to load `ToRemoteRegMark`, which indicates that the output port is tunnel. + - Action `set_field:0x80000/0x80000->reg0` is to load `EgressSNATRegMark`, which indicates that packets should be + SNAT'd on a remote Node. +- Flow 5 match packets which are from remote Pods selected by an Egress and its SNAT IP of the Egress is configured on + local Node and sets an 8 bits ID allocated for the SNAT IP to pkt_mark. + - Match condition `ct_state=+new+trk` is to match the first "tracked" packet. + - Match condition `tun_dst=` is to match packets destined to transport IP of local Node, ensure + that the packets are from remote Nodes. + - Action `set_field:0x1/0xff->pkt_mark` is to set the 8 bits ID allocated for SNAT IP to pkt_mark. 
The ID is for + iptables SNAT rules to match the packets and perform SNAT with the right SNAT IP (Antrea Agent adds an iptables SNAT + rule for each local SNAT IP that matches the ID). + - Actions `set_field:0x20/0xf0->reg0` and `goto_table:L2ForwardingCalc` are the same as flows 1-2. +- Flow 6 match packets which are from local Pods selected by an Egress and its SNAT IP of the Egress is configured on + local Node. + - Match condition `ct_state=+new+trk` is the same as flow 5. + - Match condition `nginx-d9-b93cc5` is to match packets from a local Pod. + - Actions `set_field:0x1/0xff->pkt_mark`, `set_field:0x20/0xf0->reg0` and `goto_table:L2ForwardingCalc` are the same as + flow 5. +- Flow 7 is to drop egress traffic tunnelled from remote Nodes that does not match any SNAT IP configured on local Node. + - Match condition `ct_state=+new+trk` is the same as flow 5. + - Match condition `reg0=0x1/0xf` is to match `FromTunnelRegMark`, indicating that packets are from remote Pods through + tunnel. +- Flow 8 is to match "tracked" but not the first packets from Egress connections and forward them to table [L2ForwardingCalc]. + - Actions `set_field:0x20/0xf0->reg0` and `goto_table:L2ForwardingCalc` are the same as flows 1-2. + +Note that, when no Egress applies to Pods on the Node, and no SNAT IP is configured on the Node, [SNATMark] just has +flows 1-3, 7-8; when there is an Egress applied to a Pod on local Node, and the SNAT IP of the Egress is configured on a +remote Node, flow 4 will be added; when there is an Egress applied to a Pod on remote Node, and the SNAT IP of the Egress +is configured on local Node, flow 5 will be added; when there is an Egress applied to a Pod on local Node, and the SNAT +IP of the Egress is configured on local Node, flow 6 will be added. + +### L3DecTTL + +This is the table to decrement TTL for IP packets. -The following two flows send the traffic not matched by other flows to -[SNATTable]. One of the flows is for egress traffic from local Pods; another -one is for egress traffic from remote Pods, which is tunnelled to this Node to -be SNAT'd with a SNAT IP configured on the Node. In the latter case, the flow -also rewrites the destination MAC to the local gateway interface MAC. +If you dump the flows for this table, you may see the following: ```text -table=70, priority=190,ip,reg0=0x2/0xf actions=goto_table:71 -table=70, priority=190,ip,reg0=0/0xf actions=mod_dl_dst:e2:e5:a4:9b:1c:b1,goto_table:71 +1. table=L3DecTTL, priority=210,ip,reg0=0x2/0xf actions=goto_table:SNATMark +2. table=L3DecTTL, priority=200,ip actions=dec_ttl,goto_table:SNATMark +3. table=L3DecTTL, priority=0 actions=goto_table:SNATMark ``` -### SNATTable (71) +- Flow 1 matches packets which enter OVS pipeline from local Antrea gateway, as the host IP stack should have decremented + the TTL already for such packets, TTL should not be decremented again. + - Match condition `reg0=0x2/0xf` is to match `FromGatewayRegMark`, indicating that packets are from local Antrea gateway. +- Flow 2 is to decrement TTL for packets which are not matched by flow 1. +- Flow 3 is an auto-generated flow that should remain unused. -This table is created only when the Egress feature is enabled. It includes flows -to implement Egresses and select the right SNAT IPs for egress traffic from Pods -to external network. +### SNATMark -When no Egress applies to Pods on the Node, and no SNAT IP is configured on the -Node, [SNATTable] just has two flows. 
One drops egress traffic tunnelled from -remote Nodes that does not match any SNAT IP configured on this Node, and the -default flow that sends egress traffic from local Pods, which do not have any -Egress applied, to [L2ForwardingCalcTable]. Such traffic will be SNAT'd with -the default SNAT IP (by an iptables masquerade rule). +This table marks connections requiring SNAT within the OVS pipeline, distinct from Egress SNAT handled by iptables. -```text -table=71, priority=190,ct_state=+new+trk,ip,reg0=0/0xf actions=drop -table=71, priority=0 actions=goto_table:80 -``` - -When there is an Egress applied to a Pod on the Node, a flow will be added for -the Pod's egress traffic. If the SNAT IP of the Egress is configured on the -local Node, the flow sets an 8 bits ID allocated for the SNAT IP to pkt_mark. -The ID is for iptables SNAT rules to match the packets and perfrom SNAT with -the right SNAT IP (Antrea Agent adds an iptables SNAT rule for each local SNAT -IP that matches the ID). +If you dump the flows for this table, you may see the following: ```text -table=71, priority=200,ct_state=+new+trk,ip,in_port="pod1-7e503a" actions=set_field:0x1/0xff->pkt_mark,goto_table:80 +1. table=SNATMark, priority=200,ct_state=+new+trk,ip,reg0=0x22/0xff actions=ct(commit,table=SNAT,zone=65520,exec(set_field:0x20/0x20->ct_mark,set_field:0x40/0x40->ct_mark)) +2. table=SNATMark, priority=200,ct_state=+new+trk,ip,reg0=0x12/0xff,reg4=0x200000/0x2200000 actions=ct(commit,table=SNAT,zone=65520,exec(set_field:0x20/0x20->ct_mark)) +3. table=SNATMark, priority=190,ct_state=+new+trk,ip,nw_src=10.10.0.7,nw_dst=10.10.0.7 actions=ct(commit,table=SNAT,zone=65520,exec(set_field:0x20/0x20->ct_mark,set_field:0x40/0x40->ct_mark)) +4. table=SNATMark, priority=190,ct_state=+new+trk,ip,nw_src=10.10.0.8,nw_dst=10.10.0.8 actions=ct(commit,table=SNAT,zone=65520,exec(set_field:0x20/0x20->ct_mark,set_field:0x40/0x40->ct_mark)) +5. table=SNATMark, priority=0 actions=goto_table:SNAT ``` -When the SNAT IP of the Egress is on a remote Node, the flow will tunnel the -packets to the remote Node with the tunnel's destination IP to be the SNAT IP. -The packets will be SNAT'd on the remote Node. The same as a normal tunnel flow -in [L3ForwardingTable], the flow will rewrite the packets' source and -destination MAC addresses, load the SNAT IP to NXM_NX_TUN_IPV4_DST, and send the -packets to [L3DecTTLTable]. +- Flow 1 matches packets whose source and destination are both local Antrea gateway port. Such hair-pin connection should + be SNAT'd with the virtual Service IP. + - Match condition `ct_state=+new+trk` is to match the first packet tracked in `CtZone`. + - Match condition `reg0=0x22/0xff` is to match `FromGatewayRegMark` and `ToGatewayRegMark`, indicating that packets + are from local Antrea gateway port and also destined to it. + - Action `ct` is applied to matched packets with the commit parameter in `CtZone` to persist some ct marks. + - Field `commit` means to commit connection to the connection tracking module. Note that, a packet can be committed + in the same ct zone multiple times. For Service connections, the first `commit` is performed table [EndpointDNAT]. + - Field `table=SNAT` is the table where packets will be forwarded. + - Field `zone=65520` is to commit connection to `CtZone`. + - Field `exec` is to persist some ct marks. + - Action `set_field:0x20/0x20->ct_mark` is to load `ConnSNATCTMark`, indicating that the connection requires SNAT. 
+ - Action `set_field:0x40/0x40->ct_mark` is to load `HairpinCTMark`, indicating that this is a hair-pin connection. +- Flow 2 matches packets whose source is local Antrea gateway port and destination is a remote Pod. Such connection should + be SNAT'd with the IP address of local Antrea gateway. + - Match condition `ct_state=+new+trk` is the same as flow 1. + - Match condition `reg0=0x12/0xff` is to match `FromGatewayRegMark` and `ToTunnelRegMark`, indicating that packets are + from local Antrea gateway port and destined to a remote Pod through tunnel. + - Match condition `reg4=0x200000/0x2200000` is to match `ToExternalAddressRegMark` and `NotDSRServiceRegMark`, + indicating that packets are destined to a Service's external IP, like NodePort, LoadBalancerIP or ExternalIP, but it + is not DSR mode. + - Action `ct` is the same as flow 1 except that `HairpinCTMark` is not loaded since this is not a hair-pin connection. +- Flow 3-4 match packets whose source and destination are the same local Pod. Such hair-pin connection should be SNAT'd + with the IP address of local Antrea gateway. + - Match condition `ct_state=+new+trk` is the same as flow 1. + - Match condition `nw_src=` and `nw_dst=` are to match packets whose source and + destination are both the IP address of a local Pod. + - Action `ct` is the same as flow 1. +- Flow 5 is the auto-generated flow. + +### SNAT + +This table performs SNAT for connections requiring SNAT within the OVS pipeline. -```text -table=71, priority=200,ct_state=+new+trk,ip,in_port="pod2-357c21" actions=mod_dl_src:e2:e5:a4:9b:1c:b1,mod_dl_dst:aa:bb:cc:dd:ee:ff,load:0x1->NXM_NX_REG1[],set_field:0x10000/0x10000->reg0,load:0xc0a80a66->NXM_NX_TUN_IPV4_DST[],goto_table:72 -``` - -Last, when a SNAT IP configured for Egresses is on the local Node, an additional -flow is added in [SNATTable] for egress traffic from remote Node that should -use the SNAT IP. The flow matches the tunnel destination IP (which should be -equal to the SNAT IP), and sets the 8 bits ID of the SNAT IP to pkt_mark. +If you dump the flows for this table, you should see the following: ```text -table=71, priority=200,ct_state=+new+trk,ip,tun_dst="192.168.10.101" actions=set_field:0x1/0xff->pkt_mark,goto_table:80 +1. table=SNAT, priority=200,ct_state=+new+trk,ct_mark=0x40/0x40,ip,reg0=0x2/0xf actions=ct(commit,table=L2ForwardingCalc,zone=65521,nat(src=169.254.0.253),exec(set_field:0x10/0x10->ct_mark,set_field:0x40/0x40->ct_mark)) +2. table=SNAT, priority=200,ct_state=+new+trk,ct_mark=0x40/0x40,ip,reg0=0x3/0xf actions=ct(commit,table=L2ForwardingCalc,zone=65521,nat(src=10.10.0.1),exec(set_field:0x10/0x10->ct_mark,set_field:0x40/0x40->ct_mark)) +3. table=SNAT, priority=200,ct_state=-new-rpl+trk,ct_mark=0x20/0x20,ip actions=ct(table=L2ForwardingCalc,zone=65521,nat) +4. table=SNAT, priority=190,ct_state=+new+trk,ct_mark=0x20/0x20,ip,reg0=0x2/0xf actions=ct(commit,table=L2ForwardingCalc,zone=65521,nat(src=10.10.0.1),exec(set_field:0x10/0x10->ct_mark)) +5. table=SNAT, priority=0 actions=goto_table:L2ForwardingCalc ``` -### L3DecTTLTable (72) +- Flow 1 is to match packets from hair-pin connections initiated through the local Antrea gateway port. Such connection + should be SNAT'd with the virtual Service IP. + - Match condition `ct_state=+new+trk` is to match the first packet from connections tracked in `CtZone`. + - Match condition `ct_mark=0x40/0x40` is to match `HairpinCTMark` in `CtZone`, indicating that this is hair-pin connection. 
+ - Match condition `reg0=0x2/0xf` is to match `FromGatewayRegMark`, indicating that packets from connections initiated + through the local Antrea gateway port + - Action `ct` is applied to matched packets with the commit parameter to perform SNAT and persist some ct marks in + `SNATCtZone`. + - Field `commit` means to commit connection to the connection tracking module. + - Field `table=L2ForwardingCalc` is the table where packets will be forwarded. + - Field `zone=65521` is to commit connection to `SNATCtZone`. + - Field `nat(src=169.254.0.253)` is to perform SNAT with virtual Service IP `169.254.0.253`. + - Field `exec` is to persist some ct marks in `SNATCtZone`. + - Action `set_field:0x10/0x10->ct_mark` is to load `ServiceCTMark` in `SNATCtZone`, indicating that this is a + Service connection. + - Action `set_field:0x40/0x40->ct_mark` is to load `HairpinCTMark` in `SNATCtZone`, indicating that this is a + hair-pin connection. +- Flow 2 is to match packets from hair-pin connections initiated through a local Pod. Such connection should be SNAT'd + with the IP address of local Antrea gateway. + - Match conditions `ct_state=+new+trk` and `ct_mark=0x40/0x40` are the same as flow 1. + - Match condition `reg0=0x3/0xf` is to match `FromLocalRegMark`, indicating that packets from connections initiated + through a local Pod. + - Action `ct` is the same as flow 1 except that `nat(src=10.10.0.1)` is used instead of `nat(src=169.254.0.253)` since + the connection should be SNAT'd with the IP address of local Antrea gateway. +- Flow 3 is to match the subsequent request packets of connection whose first request packet has been committed in + `SNATCtZone`, then invoke `ct` action on the packets again to recover "tracked" state in `SNATCtZone`. + - Match condition `ct_state=-new-rpl+trk` is to match request "tracked" packets, but not new (the first packet. + - Match condition `ct_mark=0x20/0x20` is to match `ConnSNATCTMark`, indicating that the connection requires SNAT. + - Action `ct` is applied to matched packets to recover "tracked" state in `SNATCtZone`. +- Flow 4 is to match the first packet of connections (non-hairpin) destined to external Service IP initiated through the + Antrea gateway, and the Endpoint is a remote Pod, then perform SNAT in `SNATCtZone` with the Antrea gateway IP. + - Match conditions `ct_state=+new+trk` and `ct_mark=0x20/0x20` are the same as flow 3. + - Match condition `reg0=0x2/0xf` is the same as flow 2. + - Action `ct` is the same as flow 2 except that `HairpinCTMark` is not loaded since this is not a hair-pin connection. +- Flow 5 is the auto-generated flow. + +### L2ForwardingCalc + +This is essentially the "dmac" table of the switch. We program one flow for each port (tunnel port, local Antrea gateway +port, and local Pod ports). -This is the table to decrement TTL for the IP packets destined to remote Nodes -through a tunnel, or the IP packets received from a tunnel. But for the packets -that enter the OVS pipeline from the local gateway and are destined to a remote -Node, TTL should not be decremented in OVS on the source Node, because the host -IP stack should have already decremented TTL if that is needed. - -If you dump the flows for this table, you should see flows like the following: +If you dump the flows for this table, you may see the following: ```text -1. table=72, priority=210,ip,reg0=0x1/0xf, actions=goto_table:80 -2. table=72, priority=200,ip, actions=dec_ttl,goto_table:80 -3. table=72, priority=0, actions=goto_table:80 +1. 
cookie=0x2010000000000, table=L2ForwardingCalc, priority=200,dl_dst=ba:5e:d1:55:aa:c0 actions=set_field:0x2->reg1,set_field:0x200000/0x600000->reg0,goto_table:IngressSecurityClassifier +2. cookie=0x2010000000000, table=L2ForwardingCalc, priority=200,dl_dst=aa:bb:cc:dd:ee:ff actions=set_field:0x1->reg1,set_field:0x200000/0x600000->reg0,goto_table:IngressSecurityClassifier +3. cookie=0x2010000000000, table=L2ForwardingCalc, priority=200,dl_dst=2e:ba:06:b2:44:91 actions=set_field:0x8->reg1,set_field:0x200000/0x600000->reg0,goto_table:IngressSecurityClassifier +4. cookie=0x2010000000000, table=L2ForwardingCalc, priority=200,dl_dst=c2:5a:5e:50:95:9b actions=set_field:0x9->reg1,set_field:0x200000/0x600000->reg0,goto_table:IngressSecurityClassifier +5. cookie=0x2000000000000, table=L2ForwardingCalc, priority=0 actions=goto_table:IngressSecurityClassifier ``` -The first flow is to bypass the TTL decrement for the packets from the gateway -port. - -### L2ForwardingCalcTable (80) +- Flow 1 is to match packets destined to the local Antrea gateway. + - Match condition `dl_dst=ba:5e:d1:55:aa:c0` is to match packets destined to the local Antrea gateway MAC address. + - Action `set_field:0x2->reg1` is to load output OVS port number to `TargetOFPortField`. + - Action `set_field:0x200000/0x600000->reg0` is to load `OutputToOFPortRegMark`, indicating that packets should output + to an OVS port. + - Action `goto_table:IngressSecurityClassifier` is to forward packets to table [IngressSecurityClassifier]. +- Flow 2 is to match packets destined to tunnel. + - Match condition `dl_dst=aa:bb:cc:dd:ee:ff` is to match packets destined to the Global Virtual MAC address, which is + used for tunnel traffic. + - Actions are the same as flow 1. +- Flows 3-4 are to match packets destined to local Pods. + - Match conditions `dl_dst=2e:ba:06:b2:44:91` and `dl_dst=c2:5a:5e:50:95:9b` are to match packets destined to the MAC + addresses of local Pods. + - Actions are the same as flow 1. +- Flow 4 is the auto-generated flow. + +In above flows 1-5, we load `OutputToOFPortRegMark` to indicate that there was a matching entry for the destination MAC +address and that the packet must be forwarded. We also use the `TargetOFPortField` to store the egress port for packet, +which will be used as a parameter to the `output` OpenFlow action in table [Output]. + +### IngressSecurityClassifier + +This table is to classify packets before entering the tables for ingress security. -This is essentially the "dmac" table of the switch. We program one flow for each -port (tunnel port, gateway port, and local Pod ports), as you can see if you -dump the flows: +If you dump the flows for this table, you should see the following: ```text -1. table=80, priority=200,dl_dst=aa:bb:cc:dd:ee:ff actions=set_field:0x1->reg1,set_field:0x10000/0x10000->reg0,goto_table:105 -2. table=80, priority=200,dl_dst=e2:e5:a4:9b:1c:b1 actions=set_field:0x2->reg1,set_field:0x10000/0x10000->reg0,goto_table:105 -3. table=80, priority=200,dl_dst=12:9e:a6:47:d0:70 actions=set_field:0x3->reg1,set_field:0x10000/0x10000->reg0,goto_table:90 -4. table=80, priority=200,dl_dst=ba:a8:13:ca:ed:cf actions=set_field:0x4->reg1,set_field:0x10000/0x10000->reg0,goto_table:90 -5. table=80, priority=0 actions=goto_table:105 +1. table=IngressSecurityClassifier, priority=210,pkt_mark=0x80000000/0x80000000,ct_state=-rpl+trk,ip actions=goto_table:ConntrackCommit +2. table=IngressSecurityClassifier, priority=200,reg0=0x20/0xf0 actions=goto_table:IngressMetric +3. 
table=IngressSecurityClassifier, priority=200,reg0=0x10/0xf0 actions=goto_table:IngressMetric
+4. table=IngressSecurityClassifier, priority=200,reg0=0x40/0xf0 actions=goto_table:IngressMetric
+5. table=IngressSecurityClassifier, priority=200,ct_mark=0x40/0x40 actions=goto_table:ConntrackCommit
+6. table=IngressSecurityClassifier, priority=0 actions=goto_table:AntreaPolicyIngressRule
+```
+
+- Flow 1 is to match locally generated request packets and forward them to table [ConntrackCommit] directly to bypass
+  all tables for ingress security.
+  - Match condition `pkt_mark=0x80000000/0x80000000` is to match packets with iptables fwmark 0x80000000, which is set
+    by iptables rules in the host network namespace to mark locally generated packets.
+  - Match condition `ct_state=-rpl+trk` is to match request packets.
+- Flows 2-4 match packets destined to the local Antrea gateway, tunnel or uplink port, by matching `ToGatewayRegMark`,
+  `ToTunnelRegMark` or `ToUplinkRegMark` respectively, and forward them to table [IngressMetric] directly to bypass
+  the tables for ingress security rules.
+  - Match condition `reg0=0x20/0xf0` is to match `ToGatewayRegMark`, indicating that packets are destined to local
+    Antrea gateway.
+  - Match condition `reg0=0x10/0xf0` is to match `ToTunnelRegMark`, indicating that packets are destined to tunnel.
+  - Match condition `reg0=0x40/0xf0` is to match `ToUplinkRegMark`, indicating that packets are destined to uplink.
+- Flow 5 is to match packets from hair-pin connections and forward them to table [ConntrackCommit] directly to bypass + all tables for ingress security. + - Match condition `ct_mark=0x40/0x40` is to match `HairpinCTMark`, indicating that packets are from hair-pin connections. + +### AntreaPolicyIngressRule + +This table is very similar to table [AntreaPolicyEgressRule], but implements the ingress rules of Antrea-native +NetworkPolicies. Depending on the tier to which the policy belongs to, the rules will be installed in a table +corresponding to that tier. The ingress table to tier mappings is as follows: ```text -Baseline Tier -> IngressDefaultTable(100) -K8s NetworkPolicy -> IngressRuleTable(90) -All other Tiers -> AntreaPolicyIngressRuleTable(85) +Antrea-native NetworkPolicy Baseline Tier -> IngressDefault +Kubernetes NetworkPolicy -> IngressRule +Antrea-native NetworkPolicy other Tiers -> AntreaPolicyIngressRule ``` -Again for this table, you will need to keep in mind the ACNP -[specification](#antrea-native-policies-implementation) that we are using. -Since the example ACNP resides in the Application tier, if you dump the flows -for table 85, you should see something like this: +Again this table, you will need to keep in mind the Antrea-native NetworkPolicy [specification](#antrea-native-networkpolicy-implementation) +that we are using. Since the sample Antrea-native NetworkPolicy resides in the Application tier. One more tip, the OVS +ports of the sample Pods we use are `8` and `9`. If you dump the flows for this table, you may see the following: ```text -1. table=85, priority=64990,ct_state=-new+est,ip actions=resubmit(,105) -2. table=85, priority=14000,conj_id=4,ip actions=load:0x4->NXM_NX_REG3[],load:0x1->NXM_NX_REG0[20],resubmit(,101) -3. table=85, priority=14000,ip,nw_src=10.10.1.7 actions=conjunction(4,1/3) -4. table=85, priority=14000,ip,reg1=0x19c actions=conjunction(4,2/3) -5. table=85, priority=14000,tcp,tp_dst=80 actions=conjunction(4,3/3) -6. table=85, priority=0 actions=resubmit(,90) +1. table=AntreaPolicyIngressRule, priority=64990,ct_state=-new+est,ip actions=goto_table:IngressMetric +2. table=AntreaPolicyIngressRule, priority=64990,ct_state=-new+rel,ip actions=goto_table:IngressMetric +3. table=AntreaPolicyIngressRule, priority=14000,ip,nw_src=10.10.0.8 actions=conjunction(6,1/3) +4. table=AntreaPolicyIngressRule, priority=14000,ip,nw_src=10.10.0.7 actions=conjunction(6,1/3) +5. table=AntreaPolicyIngressRule, priority=14000,reg1=0x8 actions=conjunction(6,2/3) +6. table=AntreaPolicyIngressRule, priority=14000,reg1=0x9 actions=conjunction(6,2/3) +7. table=AntreaPolicyIngressRule, priority=14000,tcp,tp_dst=80 actions=conjunction(6,3/3) +8. table=AntreaPolicyIngressRule, priority=14000,conj_id=6,ip actions=set_field:0x6->reg6,ct(commit,table=IngressMetric,zone=65520,exec(set_field:0x6/0xffffffff->ct_label)) +9. table=AntreaPolicyIngressRule, priority=13999,reg1=0x8 actions=conjunction(4,1/2) +10. table=AntreaPolicyIngressRule, priority=13999,reg1=0x9 actions=conjunction(4,1/2) +11. table=AntreaPolicyIngressRule, priority=13999,ip actions=conjunction(4,2/2) +12. table=AntreaPolicyIngressRule, priority=13999,conj_id=4 actions=set_field:0x4->reg3,set_field:0x400/0x400->reg0,goto_table:IngressMetric +13. 
table=AntreaPolicyIngressRule, priority=0 actions=goto_table:IngressRule ``` -As for [AntreaPolicyEgressRuleTable], flow 1 (highest priority) ensures that for -established connections packets go straight to IngressMetricsTable, -then [L2ForwardingOutTable], with no other match required. - -The rest of the flows read as follows: if the source IP address is in set -{10.10.1.7}, and the destination OF port is in the set {412} (which -correspond to IP addresses {10.10.1.6}), and the destination TCP port -is in the set {80}, then use `conjunction` action with id 4, which loads -the `conj_id` 4 into NXM_NX_REG3, a register used by Antrea internally to -indicate the disposition of the packet is Drop, and forward the packet to -IngressMetricsTable for it to be dropped. - -Otherwise, go to [IngressRuleTable] if no conjunctive flow above priority 0 is matched. -This corresponds to the case where the packet is not matched by any of the Antrea-native -policy ingress rules in any tier (except for the "baseline" tier). -One notable difference is how we use OF ports to identify the destination of -the traffic, while we use IP addresses in [AntreaPolicyEgressRuleTable] to -identify the source of the traffic. More details regarding this can be found -in the following [IngressRuleTable] section. - -As seen in [AntreaPolicyEgressRuleTable], the default action is to evaluate K8s -Network Policy [IngressRuleTable] and a AntreaPolicyIngressDefaultTable does not exist. - -### IngressRuleTable (90) - -This table is very similar to [EgressRuleTable], but implements ingress rules -for Network Policies. Once again, you will need to keep mind the Network Policy -[specification](#network-policy-implementation) that we are using. We have 2 -Pods running on the same Node, with IP addresses 10.10.1.2 to 10.10.1.3. They -are allowed to talk to each other using TCP on port 80, but nothing else. +- Flows 1-2 matches packets of connections with states `ESTABLISHED` or `RELATED` but not `NEW`, forwarding them to table + [IngressMetric] to skip ingress-related flows. +- Flows 3-4 matches packets sourced from `10.10.0.7` or `10.10.0.8`. Action `conjunction(6,1/3)` signifies that the first + dimension of all three dimensions for `conj_id` 6. +- Flows 5-6 matches packets destined to port `8` or `9`. Action `conjunction(6,2/3)` signifies that the second dimension of + all three dimensions for `conj_id` 6. +- Flow 7 matches packets destined to TCP port `80`. Action `conjunction(6,3/3)` signifies that the third dimension of all + three dimensions for `conj_id` 6. +- Flow 8 matches packets meeting all three dimensions for `conj_id` 6 (rule's action is `Allow`). + - Action `set_field:0x6->reg5` is to load `conj_id` to `APConjIDField`, which is used by feature Traceflow. + - Action `ct` is to persist `conj_id` to ct label in `CtZone`. + - Field `commit` means to commit connection to the connection tracking module. + - Field `table=IngressMetric` is the table where packets will be forwarded. + - Field `zone=65520` is to commit connection to `CtZone`. + - Field `exec` sets some bits of ct label. + - Action `set_field:0x600000000/0xffffffff00000000->ct_label` is to load current `conj_id` value to `IngressRuleCTLabel` + for ingress metrics collection purposes. +- Flows 9-10 matches packets destined to port 8 or 9. Action `conjunction(4,1/2)` signifies that the first dimension of + all two dimensions for `conj_id` 4. +- Flow 11 matches IPv4 packets. 
Action `conjunction(4,2/2)` signifies that the second dimension of all two dimensions for + `conj_id` 4. +- Flow 12 matches packets meeting all two dimensions for `conj_id` 4 (rule's action is `Drop`). + - Action `set_field:0x4->reg3` is to load `conj_id` value to `APConjIDField`, which is used by feature Traceflow. + - Action `set_field:0x400/0x400->reg0` is to load `APDenyRegMark`, indicating that the packet was denied (Drop / Reject), + then Kubernetes default drop will not be recorded in this reg. +- Flow 13 is the auto-generated flow. + +### IngressRule + +This table is very similar to table [EgressRule], but implements ingress rules for Kubernetes NetworkPolicies. Once again, +you will need to keep mind the Kubernetes NetworkPolicy [specification](#kubernetes-networkpolicy-implementation) that +we are using. We have 2 Pods running on the same Node, with IP addresses 10.10.1.2 to 10.10.1.3. They are allowed to talk +to each other using TCP on port 80, but nothing else. If you dump the flows for this table, you should see something like this: ```text -1. table=90, priority=210,ct_state=-new+est,ip actions=goto_table:101 -2. table=90, priority=210,pkt_mark=0x1/0x1 actions=goto_table:105 -3. table=90, priority=200,ip,nw_src=10.10.1.2 actions=conjunction(3,1/3) -4. table=90, priority=200,ip,nw_src=10.10.1.3 actions=conjunction(3,1/3) -5. table=90, priority=200,ip,reg1=0x3 actions=conjunction(3,2/3) -6. table=90, priority=200,ip,reg1=0x4 actions=conjunction(3,2/3) -7. table=90, priority=200,tcp,tp_dst=80 actions=conjunction(3,3/3) -8. table=90, priority=190,conj_id=3,ip actions=load:0x3->NXM_NX_REG6[],ct(commit,table=101,zone=65520,exec(load:0x3->NXM_NX_CT_LABEL[0..31])) -9. table=90, priority=0 actions=goto_table:100 +1. table=IngressRule, priority=200,ip,nw_src=10.10.0.7 actions=conjunction(7,1/3) +2. table=IngressRule, priority=200,ip,nw_src=10.10.0.8 actions=conjunction(7,1/3) +3. table=IngressRule, priority=200,reg1=0x8 actions=conjunction(7,2/3) +4. table=IngressRule, priority=200,reg1=0x9 actions=conjunction(7,2/3) +5. table=IngressRule, priority=200,tcp,tp_dst=80 actions=conjunction(7,3/3) +6. table=IngressRule, priority=190,conj_id=7,ip actions=set_field:0x7->reg6,ct(commit,table=IngressMetric,zone=65520,exec(set_field:0x7/0xffffffff->ct_label)) +7. table=IngressRule, priority=0 actions=goto_table:IngressDefaultRule ``` -As for [EgressRuleTable], flow 1 (highest priority) ensures that for established -connections - as a reminder all connections are committed in -[ConntrackCommitTable] - packets go straight to IngressMetricsTable, -then [L2ForwardingOutTable], with no other match required. - -Flow 2 ensures that the traffic initiated from the host network namespace cannot -be dropped because of Network Policies. This ensures that K8s [liveness -probes](https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/) -can go through. An iptables rule in the mangle table of the host network -namespace is responsible for marking the locally-generated packets with the -`0x1/0x1` mark. Note that the flow will be different for Windows worker Node or -when OVS userspace (netdev) datapath is used. This is because either there is no -way to add mark for particular traffic (i.e. Windows) or matching the mark in -OVS is not properly supported (i.e. netdev datapath). As a result, the flow will -match source IP instead, however, NodePort Service access by external clients -will be masqueraded as a local gateway IP to bypass Network Policies. 
This may -be fixed after AntreaProxy can serve NodePort traffic. - -The rest of the flows read as follows: if the source IP address is in set -{10.10.1.2, 10.10.1.3}, and the destination OF port is in the set {3, 4} (which -correspond to IP addresses {10.10.1.2, 10.10.1.3}, and the destination TCP port -is in the set {80}, then use `conjunction` action with id 3, which stores the -`conj_id` 3 in `ct_label[0..31]` for egress metrics collection purposes, and forwards -the packet to IngressMetricsTable, then [L2ForwardingOutTable]. Otherwise, go to -[IngressDefaultTable]. One notable difference is how we use OF ports to identify -the destination of the traffic, while we use IP addresses in [EgressRuleTable] -to identify the source of the traffic. We do this as an increased security measure -in case a local Pod is misbehaving and trying to access another local Pod using -the correct destination MAC address but a different destination IP address to bypass -an egress Network Policy rule. This is also why the Network Policy ingress rules -are enforced after the egress port has been determined. - -### IngressDefaultTable (100) - -This table is similar in its purpose to [EgressDefaultTable], and it complements -[IngressRuleTable] for Network Policy ingress rule implementation. In K8s, when -a Network Policy is applied to a set of Pods, the default behavior for these -Pods become "deny" (it becomes an [isolated +- Flows 1-2 matches packets sourced from `10.10.0.7` or `10.10.0.8`. Action `conjunction(7,1/3)` signifies that the first + dimension of all three dimensions for `conj_id` 7. +- Flows 3-4 matches packets destined to port `8` or `9` . Action `conjunction(7,2/3)` signifies that the second dimension + of all three dimensions for `conj_id` 7. +- Flow 5 matches packets destined to TCP port `80`. Action `conjunction(7,3/3)` signifies that the third dimension of all + three dimensions for `conj_id` 7. +- Flow 6 matches packets meeting all three dimensions for `conj_id` 7. + - Action `set_field:0x7->reg5` is to load `conj_id` to `APConjIDField`, which is used by feature Traceflow. + - Action `ct` is to persist `conj_id` to ct label in `CtZone`. + - Field `commit` means to commit connection to the connection tracking module. + - Field `table=IngressMetric` is the table where packets will be forwarded. + - Field `zone=65520` is to commit connection to `CtZone`. + - Field `exec` sets some bits of ct label. + - Action `set_field:0x7/0xffffffff->ct_label` is to load current `conj_id` value to `IngressRuleCTLabel` for ingress + metrics collection purposes. +- Flow 7 is the auto-generated flow. + +### IngressDefault + +This table is similar in its purpose to table [IngressDefault], and it complements table [IngressRule] for Kubernetes +NetworkPolicy ingress rule implementation. In Kubernetes, when a NetworkPolicy is applied to a set of Pods, the default +behavior for these Pods become "deny" (it becomes an [isolated Pod](https://kubernetes.io/docs/concepts/services-networking/network-policies/#isolated-and-non-isolated-pods)). This -table is in charge of dropping traffic destined to Pods to which a Network -Policy (with an ingress rule) is applied, and which did not match any of the -allowlist rules. - -Accordingly, based on our Network Policy example, we would expect to see flows -to drop traffic destined to our 2 Pods (3 and 4), which is confirmed by dumping -the flows: - -```text -1. table=100, priority=200,ip,reg1=0x3 actions=drop -2. table=100, priority=200,ip,reg1=0x4 actions=drop -3. 
table=100, priority=0 actions=goto_table:105 -``` - -Similar to the [EgressDefaultTable], this table is also used to implement -Antrea-native policy ingress rules that are created in the Baseline Tier. -Since the Baseline Tier is meant to be enforced after K8s NetworkPolicies, the -corresponding flows will be created at a lower priority than K8s default drop flows. -For example, a baseline rule to isolate ingress traffic for a Namespace will look -like the following: - -```text -table=100, priority=80,ip,reg1=0xb actions=conjunction(6,2/3) -table=100, priority=80,ip,reg1=0xc actions=conjunction(6,2/3) -table=100, priority=80,ip,nw_src=10.10.1.9 actions=conjunction(6,1/3) -table=100, priority=80,ip,nw_src=10.10.1.7 actions=conjunction(6,1/3) -table=100, priority=80,tcp,tp_dst=8080 actions=conjunction(6,3/3) -table=100, priority=80,conj_id=6,ip actions=load:0x6->NXM_NX_REG3[],load:0x1->NXM_NX_REG0[20],resubmit(,101) -``` - -The table-miss flow entry, which is used for non-isolated Pods, forwards -traffic to the next table ([ConntrackCommitTable]). - -### ConntrackCommitTable (105) +table is in charge of dropping traffic destined to Pods to which a NetworkPolicy (with an ingress rule) is applied, +and which did not match any of the allow list rules. -As mentioned before, this table is in charge of committing all new connections -which are not dropped because of Network Policies. If you dump the flows for this -table, you should see something like this: +Accordingly, based on our Kubernetes NetworkPolicy example, we would expect to see flows to drop traffic destined to +our 2 Pods (port `8` and `9`), which is confirmed by dumping the flows: ```text -1. table=105, priority=200,ct_state=+new+trk,ip,reg0=0x1/0xf actions=ct(commit,table=108,zone=65520,exec(load:0x20->NXM_NX_CT_MARK[])) -2. table=105, priority=190,ct_state=+new+trk,ip actions=ct(commit,table=108,zone=65520) -3. table=105, priority=0 actions=goto_table:108 +1. table=IngressDefaultRule, priority=200,ip,reg1=0x8 actions=drop +2. table=IngressDefaultRule, priority=200,ip,reg1=0x9 actions=drop +3. table=IngressDefaultRule, priority=0 actions=goto_table:IngressMetric ``` -Flow 1 ensures that we commit connections initiated through the gateway -interface and mark them with a `ct_mark` of `0x20`. This ensures that -[ConntrackStateTable] can perform its functions correctly and rewrite the -destination MAC address to the gateway's MAC address for connections which -require it. Such connections include Pod-to-ClusterIP traffic. Note that the -`0x20` mark is applied to *all* connections initiated through the gateway -(i.e. for which the first packet of the connection was received through the -gateway) and that [ConntrackStateTable] will perform the destination MAC address -for the reply traffic of *all* such connections. In some cases (the ones -described for [ConntrackStateTable]), this rewrite is necessary. For others -(e.g. a connection from the host to a local Pod), this rewrite is not necessary -but is also harmless, as the destination MAC is already correct. +Similar to table [EgressDefault], this table is also used to implement Antrea-native NetworkPolicy ingress rules that +are created in the Baseline Tier. Since the Baseline Tier is meant to be enforced after Kubernetes NetworkPolicies, the +corresponding flows will be created at a lower priority than Kubernetes NetworkPolicy default drop flows. These flows +are like flows 3-12 in table [AntreaPolicyIngressRule]. -Flow 2 commits all other new connections. 
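+As a quick, illustrative way to check which of the default-drop flows above apply to a given Pod, you can dump just this
+table and look for the Pod's OVS port number (the value stored in `reg1`). The commands below are only a sketch: they
+assume the Antrea OVS bridge is named `br-int`, that they are run inside the `antrea-ovs` container of the Antrea Agent
+Pod on the Node, and that the sample Pod's port number is `9`; adjust these values for your own setup.
+
+```text
+# List local interfaces with their OpenFlow port numbers to find the Pod's port.
+ovs-vsctl --columns=name,ofport list Interface
+
+# Dump only this table's flows. If your ovs-ofctl cannot resolve Antrea's table
+# names, use the table number shown by "ovs-ofctl -O OpenFlow15 dump-tables br-int" instead.
+ovs-ofctl -O OpenFlow15 dump-flows br-int table=IngressDefaultRule
+```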
+
+### IngressMetric
+
-All traffic then goes to [HairpinSNATTable].
+This table is very similar to table [EgressMetric], but is used to collect ingress metrics for Antrea-native NetworkPolicies.

-### HairpinSNATTable (108)
-
-The table is used to handle Service hairpin case, which indicates that the
-packet should be output to the port on which it was received.
-
-If you dump the flows for this table, you should see the flows:
+If you dump the flows for this table, you may see the following:

```text
-1. table=108, priority=200,ip,nw_src=10.10.0.4,nw_dst=10.10.0.4 actions=mod_nw_src:169.254.169.252,load:0x1->NXM_NX_REG0[18],resubmit(,110)
-2. table=108, priority=200,ip,nw_src=10.10.0.2,nw_dst=10.10.0.2 actions=mod_nw_src:169.254.169.252,load:0x1->NXM_NX_REG0[18],resubmit(,110)
-3. table=108, priority=200,ip,nw_src=10.10.0.3,nw_dst=10.10.0.3 actions=mod_nw_src:169.254.169.252,load:0x1->NXM_NX_REG0[18],resubmit(,110)
-4. table=108, priority=0 actions=resubmit(,110)
+1. table=IngressMetric, priority=200,ct_state=+new,ct_label=0x6/0xffffffff,ip actions=goto_table:ConntrackCommit
+2. table=IngressMetric, priority=200,ct_state=-new,ct_label=0x6/0xffffffff,ip actions=goto_table:ConntrackCommit
+3. table=IngressMetric, priority=200,ct_state=+new,ct_label=0x7/0xffffffff,ip actions=goto_table:ConntrackCommit
+4. table=IngressMetric, priority=200,ct_state=-new,ct_label=0x7/0xffffffff,ip actions=goto_table:ConntrackCommit
+5. table=IngressMetric, priority=200,reg0=0x400/0x400,reg3=0x4 actions=drop
+6. table=IngressMetric, priority=0 actions=goto_table:ConntrackCommit
```

-Flow 1-3 are used to match Service packets from Pods. The source IP of the matched
-packets by flow 1-3 should be SNAT'd with a virtual hairpin IP since the source and
-destination IP addresses should not be the same. Without SNAT, response packets from
-a Pod will not be forwarded back to OVS pipeline as the destination IP is the Pod's
-own IP, then the connection is interrupted because the conntrack state is only stored
-in OVS ct zone, not in the Pod. With SNAT, the destination IP will be the virtual
-hairpin IP and forwarded back to OVS pipeline. Note that, bit 18 in NXM_NX_REG0 is
-set to 0x1, and it is consumed in [L2ForwardingOutTable] to output the packet
-to the port on which it was received with action `IN_PORT`.
+- Flows 1-2 match packets from the sample Antrea-native NetworkPolicy ingress rule whose action is `Allow`.
+  - Match conditions `ct_state=+new` or `ct_state=-new` are to match packets with state `NEW` or not `NEW`.
+  - Match condition `ct_label=0x6/0xffffffff` is to match `IngressRuleCTLabel` with value 6, which is loaded in table
+    [AntreaPolicyIngressRule], flow 8.
+  - Action `goto_table:ConntrackCommit` is to forward packets to table [ConntrackCommit] since the action of the sample
+    rule is `Allow`.
+- Flows 3-4 match packets from the sample Kubernetes NetworkPolicy ingress rule, which allows the traffic.
+  - Match conditions `ct_state=+new` or `ct_state=-new` are to match packets with state `NEW` or not `NEW`.
+  - Match condition `ct_label=0x7/0xffffffff` is to match `IngressRuleCTLabel` with value 7, which is loaded in table
+    [IngressRule], flow 6.
+  - Action `goto_table:ConntrackCommit` is the same as flows 1-2.
+- Flow 5 matches packets of the sample Antrea-native NetworkPolicy ingress rule whose action is `Drop` or `Reject`, and
+  drops them.
+  - Match condition `reg0=0x400/0x400` is to match `APDenyRegMark`, which is loaded in table [AntreaPolicyIngressRule],
+    flow 12, indicating that the packets should be denied.
+ - Match condition `reg3=0x4` is to match `APConjIDField`, which is loaded in table [AntreaPolicyIngressRule], flow 12 + - Action `drop` is to drop packets. +- Flow 6 is the auto-generated flow. + +### ConntrackCommit + +This table is in charge of committing all new non-Service connections. -### L2ForwardingOutTable (110) - -It is a simple table and if you dump the flows for this table, you should only -see 2 flows: +If you dump the flows for this table, you should see the following: ```text -1. table=110, priority=200,ip,reg0=0x10000/0x10000 actions=output:NXM_NX_REG1[] -2. table=110, priority=0, actions=drop +1. table=ConntrackCommit, priority=200,ct_state=+new+trk-snat,ct_mark=0/0x10,ip actions=ct(commit,table=Output,zone=65520,exec(move:NXM_NX_REG0[0..3]->NXM_NX_CT_MARK[0..3])) +2. table=ConntrackCommit, priority=0 actions=goto_table:Output ``` -The first flow outputs all unicast packets to the correct port (the port was -resolved by the "dmac" table, [L2ForwardingCalcTable]). IP packets for which -[L2ForwardingCalcTable] did not set bit 16 of NXM_NX_REG0 will be dropped. +- Flow 1 is to match new non-Service connections and commit them to the connection tracking module. + - Match condition `ct_state=+new+trk-snat` is to match the first packet from connections tracked in `CtZone`. + - Match condition `ct_mark=0x0/0x10` is to match `NotServiceCTMark` in `CtZone`, indicating that packets are from + non-Service connection. + - Action `ct` is applied to matched packets with the commit parameter to persist a ct mark in `CtZone`. + - Field `commit` means to commit connection to the connection tracking module. + - Field `table=Output` is the table where packets will be forwarded. + - Field `zone=65521` is to commit connection to `CtZone`. + - Field `exec` is to persist some ct marks in `CtZone`. + - Action `move:NXM_NX_REG0[0..3]->NXM_NX_CT_MARK[0..3]` is load value of `PktSourceField` to `ConnSourceCTMarkField`. +- Flow 2 is the auto-generated flow that should remain unused. -## Tables (AntreaProxy is disabled) - -![OVS pipeline](../assets/ovs-pipeline.svg) +### Output -### DNATTable (40) +This is the table to output packets to an OVS port, controller or drop them. -This table is created only when AntreaProxy is disabled. Its only job is to -send traffic destined to Services through the local gateway interface, without any -modifications. kube-proxy will then take care of load-balancing the connections -across the different backends for each Service. - -If you dump the flows for this table, you should see something like this: +If you dump the flows for this table, you should see the following: ```text -1. table=40, priority=200,ip,nw_dst=10.96.0.0/12 actions=set_field:0x2->reg1,load:0x1->NXM_NX_REG0[16],goto_table:105 -2. table=40, priority=0 actions=goto_table:45 +1. table=Output, priority=210,ct_mark=0x40/0x40 actions=IN_PORT +2. table=Output, priority=200,reg0=0x200000/0x600000 actions=output:NXM_NX_REG1[] +3. table=Output, priority=200,reg0=0x2400000/0xfe600000 actions=meter:256,controller(reason=no_match,id=62373,userdata=01.01) +4. table=Output, priority=200,reg0=0x4400000/0xfe600000 actions=meter:256,controller(reason=no_match,id=62373,userdata=01.02) +5. table=Output, priority=0 actions=drop ``` -In the example above, 10.96.0.0/12 is the Service CIDR (this is the default -value used by `kubeadm init`). This flow is not actually required for -forwarding, but to bypass [EgressRuleTable] and [EgressDefaultTable] for Service -traffic on its way to kube-proxy through the gateway. 
If we omitted this flow, -such traffic would be unconditionally dropped if a Network Policy is applied on -the originating Pod. For such traffic, we instead enforce Network Policy egress -rules when packets come back through the gateway and the destination IP has been -rewritten by kube-proxy (DNAT to a backend for the Service). We cannot output -the Service traffic to the gateway port directly as we haven't committed the -connection yet; instead we store the port in NXM_NX_REG1 - similarly to how we -process non-Service traffic in [L2ForwardingCalcTable] - and forward it to -[ConntrackCommitTable]. By committing the connection we ensure that reply -traffic (traffic from the Service backend which has already gone through -kube-proxy for source IP rewrite) will not be dropped because of Network -Policies. - -The table-miss flow entry (flow 2) for this table forwards all non-Service -traffic to the next table, [AntreaPolicyEgressRuleTable]. - -[ClassifierTable]: #classifiertable-0 -[SpoofGuardTable]: #spoofguardtable-10 -[ARPResponderTable]: #arprespondertable-20 -[ServiceHairpinTable]: #servicehairpintable-23 -[ConntrackTable]: #conntracktable-30 -[ConntrackStateTable]: #conntrackstatetable-31 -[DNATTable]: #dnattable-40 -[SessionAffinityTable]: #sessionaffinitytable-40 -[ServiceLBTable]: #servicelbtable-41 -[EndpointDNATTable]: #endpointdnattable-42 -[AntreaPolicyEgressRuleTable]: #antreapolicyegressruletable-45 -[EgressRuleTable]: #egressruletable-50 -[EgressDefaultTable]: #egressdefaulttable-60 -[L3ForwardingTable]: #l3forwardingtable-70 -[SNATTable]: #snattable-71 -[L3DecTTLTable]: #l3decttltable-72 -[L2ForwardingCalcTable]: #l2forwardingcalctable-80 -[AntreaPolicyIngressRuleTable]: #antreapolicyingressruletable-85 -[IngressRuleTable]: #ingressruletable-90 -[IngressDefaultTable]: #ingressdefaulttable-100 -[ConntrackCommitTable]: #conntrackcommittable-105 -[HairpinSNATTable]: #hairpinsnattable-108 -[L2ForwardingOutTable]: #l2forwardingouttable-110 +- Flow 1 is to output packets from hair-pin connections to the ingress port. + - Match condition `ct_mark=0x40/0x40` is to match `HairpinCTMark`, indicating that packets are from hair-pin connections. + - Action `IN_PORT` is to output packets to the ingress port. +- Flow 2 is to output packets to an OVS port. + - Match condition `reg0=0x200000/0x600000` is to match `OutputToOFPortRegMark`, indicating that packets should output + to an OVS port. + - Action `output:NXM_NX_REG1[]` is to output packets to the OVS port stored in `TargetOFPortField`. +- Flow 3-4 are to output packets to the controller. + - Match condition `reg0=0x2400000/0xfe600000` is to match `OutputToControllerRegMark`, indicating that packets should + output to the controller. + - Action `meter:256` is to meter packets with meter ID 256. + - Action `controller(reason=no_match,id=62373,userdata=01.01)` is to output packets to the controller with reason + `no_match`, ID 62373 and userdata 01.01. +- Flow 5 is to drop packets. 
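+Throughout this document, matches such as `reg0=0x200000/0x600000` are value/mask pairs over a 32-bit register: the mask
+selects the bits that form a field, and the value gives that field's content, so one register can carry several
+independent marks. The decomposition below is a small worked example for the `Output` flows above; the bit positions are
+derived from the masks themselves, the names are the ones already used in this document, and the split into "output"
+bits plus a "reason code" is an inference from flows 3-4, not a statement of the authoritative field layout in
+`pkg/agent/openflow/fields.go`.
+
+```text
+reg0=0x200000/0x600000      mask 0x600000   -> bits 21-22 of reg0
+                            value 0x200000  -> bit 21 set, bit 22 clear   => OutputToOFPortRegMark
+reg0=0x2400000/0xfe600000   mask 0xfe600000 -> bits 21-22 plus bits 25-31
+                            value 0x2400000 -> bit 22 set, code 1 in bits 25-31  => OutputToControllerRegMark
+reg0=0x4400000/0xfe600000   value 0x4400000 -> bit 22 set, code 2 in bits 25-31 (flows 3 and 4 differ only in this code)
+```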
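+To see how a single packet traverses the pipeline described above, `ovs-appctl ofproto/trace` can replay a synthetic
+packet and print every table it hits together with the actions applied. The example below is only a sketch: it assumes
+the bridge is named `br-int`, that the command runs inside the `antrea-ovs` container, and that the client Pod sits on
+OVS port `9` with MAC `c2:5a:5e:50:95:9b` and IP `10.10.0.8` while the other sample Pod uses MAC `2e:ba:06:b2:44:91`,
+IP `10.10.0.7` and TCP port `80`; the mapping of the sample IPs to ports `8`/`9` is assumed here, not taken from the
+flow dumps.
+
+```text
+# Trace a TCP packet from the Pod on port 9 to the other sample Pod on TCP port 80.
+# The output lists each table consulted (Classifier, ..., Output) and the actions applied.
+ovs-appctl ofproto/trace br-int \
+  'in_port=9,tcp,dl_src=c2:5a:5e:50:95:9b,dl_dst=2e:ba:06:b2:44:91,nw_src=10.10.0.8,nw_dst=10.10.0.7,nw_ttl=64,tp_dst=80'
+```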
+ +[ARPSpoofGuard]: #arpspoofguard +[Classifier]: #classifier +[SpoofGuard]: #spoofguard +[UnSNAT]: #unsnat +[PreRoutingClassifier]: #preroutingclassifier +[SessionAffinity]: #sessionaffinity +[ServiceLB]: #servicelb +[EndpointDNAT]: #endpointdnat +[AntreaPolicyEgressRule]: #antreapolicyegressrule +[EgressRule]: #egressrule +[EgressDefault]: #egressdefault +[EgressMetric]: #egressmetric +[L3Forwarding]: #l3forwarding +[EgressMark]: #egressmark +[L3DecTTL]: #l3decttl +[SNATMark]: #snatmark +[L2ForwardingCalc]: #l2forwardingcalc +[IngressSecurityClassifier]: #ingresssecurityclassifier +[IngressRule]: #ingressrule +[IngressDefault]: #ingressdefault +[IngressMetric]: #ingressmetric +[Output]: #output diff --git a/pkg/agent/openflow/fields.go b/pkg/agent/openflow/fields.go index 87d0521af2e..073ab31432f 100644 --- a/pkg/agent/openflow/fields.go +++ b/pkg/agent/openflow/fields.go @@ -109,12 +109,12 @@ var ( APConjIDField = binding.NewRegField(3, 0, 31) // reg4(NXM_NX_REG4) - // reg4[0..15]: Field to store the selected Service Endpoint port. + // reg4[0..15]: Field to store the selected Service Endpoint port number. EndpointPortField = binding.NewRegField(4, 0, 15) // reg4[16..18]: Field to store the state of a packet accessing a Service. Marks in this field include: - // - 0b001: packet need to do service selection. - // - 0b010: packet has done service selection. - // - 0b011: packet has done service selection and the selection result needs to be cached. + // - 0b001: packet needs to do Endpoint selection. + // - 0b010: packet has done Endpoint selection. + // - 0b011: packet has done Endpoint selection and the selection result needs to be cached. ServiceEPStateField = binding.NewRegField(4, 16, 18) EpToSelectRegMark = binding.NewRegMark(ServiceEPStateField, 0b001) EpSelectedRegMark = binding.NewRegMark(ServiceEPStateField, 0b010)