From 9a2b76c5f8e9196d2f7516284ca183f102090bd4 Mon Sep 17 00:00:00 2001 From: Hongliang Liu Date: Fri, 4 Aug 2023 16:36:06 +0800 Subject: [PATCH 1/5] Update OVS pipeline document Resolves #5200 Signed-off-by: Hongliang Liu --- docs/assets/ovs-pipeline-antrea-proxy.svg | 4835 ---------------- docs/assets/ovs-pipeline.svg | 6069 +++++++-------------- docs/design/ovs-pipeline.md | 2461 ++++++--- pkg/agent/openflow/fields.go | 8 +- 4 files changed, 3515 insertions(+), 9858 deletions(-) delete mode 100644 docs/assets/ovs-pipeline-antrea-proxy.svg diff --git a/docs/assets/ovs-pipeline-antrea-proxy.svg b/docs/assets/ovs-pipeline-antrea-proxy.svg deleted file mode 100644 index 7016a665305..00000000000 --- a/docs/assets/ovs-pipeline-antrea-proxy.svg +++ /dev/null @@ -1,4835 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/docs/assets/ovs-pipeline.svg b/docs/assets/ovs-pipeline.svg index c60576a18e1..6630ac656f0 100644 --- a/docs/assets/ovs-pipeline.svg +++ b/docs/assets/ovs-pipeline.svg @@ -2,14 +2,14 @@ - + style="overflow:visible" + inkscape:isstock="true" + viewBox="0 0 17.773854 10.156488" + markerWidth="17.773853" + markerHeight="10.156487" + preserveAspectRatio="xMidYMid"> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + 
style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> - - - + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> - - - - - - - - - + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> - - - - - - - - - - - - - - - - - - + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + style="overflow:visible" + inkscape:isstock="true" + viewBox="0 0 17.773854 10.156488" + markerWidth="17.773853" + markerHeight="10.156488" + preserveAspectRatio="xMidYMid"> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + style="overflow:visible" + inkscape:isstock="true" + viewBox="0 0 17.773854 10.156488" + markerWidth="17.773853" + markerHeight="10.156488" + preserveAspectRatio="xMidYMid"> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + style="overflow:visible" + inkscape:isstock="true" + viewBox="0 0 17.773854 10.156488" + markerWidth="17.773853" + markerHeight="10.156488" + preserveAspectRatio="xMidYMid"> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + style="overflow:visible" + inkscape:isstock="true" + viewBox="0 0 17.773854 10.156488" + markerWidth="17.773853" + markerHeight="10.156488" + preserveAspectRatio="xMidYMid"> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + style="overflow:visible" + inkscape:isstock="true" + viewBox="0 0 17.773854 10.156488" + markerWidth="17.773853" + markerHeight="10.156488" + preserveAspectRatio="xMidYMid"> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + 
style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + style="overflow:visible" + inkscape:isstock="true" + viewBox="0 0 17.773854 10.156488" + markerWidth="17.773853" + markerHeight="10.156488" + preserveAspectRatio="xMidYMid"> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + viewBox="0 0 17.773854 10.156488" + markerWidth="17.773853" + markerHeight="10.156488" + preserveAspectRatio="xMidYMid"> @@ -610,26 +490,25 @@ orient="auto" refY="0" refX="0" - id="marker1534-5" + id="marker1488-1" style="overflow:visible" inkscape:isstock="true"> @@ -639,3886 +518,1844 @@ orient="auto" refY="0" refX="0" - id="marker5914-9-9" + id="marker1644-8" style="overflow:visible" - inkscape:isstock="true" - inkscape:collect="always"> + inkscape:isstock="true"> - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + id="path1642-1" + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + + - - - - - + id="path1642-1-3-0-7-2-4-8" + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + 
transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + + + + + + + + + image/svg+xml + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - EgressRule + EgressDefaultRule + EgressMetric + EgressMark + L3DecTTL + SNATMark + L2ForwardingCalc + SessionAffinity + NodePortMark + L3Forwarding + SNAT + TrafficControl + IngressSecurityClassifier + IngressRule + AntreaPolicyIngressRule + IngressDefaultRule + IngressMetric + ConntrackCommit + Output + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - ARP packets + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + y="30.55513" + x="39.744644" + id="tspan1018-5" + sodipodi:role="line">IP packets + + + + diff --git a/docs/design/ovs-pipeline.md b/docs/design/ovs-pipeline.md index a967b123086..356531f56cf 100644 --- a/docs/design/ovs-pipeline.md +++ b/docs/design/ovs-pipeline.md @@ -1,1197 +1,1852 @@ # Antrea OVS Pipeline +## Introduction + +This document outlines the Open vSwitch (OVS) pipeline Antrea uses to implement its networking functionalities. The +following assumptions are currently in place: + +- Antrea is deployed in encap mode, establishing an overlay network across all Nodes. +- All the Nodes are Linux Nodes. +- IPv6 is disabled. +- Option `antreaProxy.proxyAll` (referred to as `proxyAll` later in this document) is enabled. +- Two Alpha features `TrafficControl` and `L7NetworkPolicy` are enabled. +- Default settings are maintained for other features and options. + +The document references version v1.15 of Antrea. + ## Terminology -* *Node Route Controller*: the [K8s - controller](https://kubernetes.io/docs/concepts/architecture/controller/) - which is part of the Antrea Agent and watches for updates to Nodes. When a - Node is added, it updates the local networking configuration (e.g. configure - the tunnel to the new Node). When a Node is deleted, it performs the necessary - clean-ups. -* *peer Node*: this is how we refer to other Nodes in the cluster, to which the - local Node is connected through a Geneve, VXLAN, GRE, or STT tunnel. -* *Global Virtual MAC*: a virtual MAC address which is used as the destination - MAC for all tunnelled traffic across all Nodes. 
This simplifies networking by - enabling all Nodes to use this MAC address instead of the actual MAC address - of the appropriate remote gateway. This enables each vSwitch to act as a - "proxy" for the local gateway when receiving tunnelled traffic and directly - take care of the packet forwarding. At the moment, we use an hard-coded value - of `aa:bb:cc:dd:ee:ff`. -* *Antrea-native Policies*: Antrea ClusterNetworkPolicy and Antrea NetworkPolicy - CRDs, as documented [here](../antrea-network-policy.md). -* *`normal` action*: OpenFlow defines this action to submit a packet to "the - traditional non-OpenFlow pipeline of the switch". That is, if a flow uses this - action, then the packets in the flow go through the switch in the same way - that they would if OpenFlow was not configured on the switch. Antrea uses this - action to process ARP traffic as a regular learning L2 switch would. -* *table-miss flow entry*: a "catch-all" entry in a OpenFlow table, which is - used if no other flow is matched. If the table-miss flow entry does not exist, - by default packets unmatched by flow entries are dropped (discarded). -* *conjunctive match fields*: an efficient way in OVS to implement conjunctive - matches, that is a match for which we have multiple fields, each one with a - set of acceptable values. See [OVS - fields](http://www.openvswitch.org/support/dist-docs/ovs-fields.7.txt) for - more information. -* *conntrack*: a connection tracking module that can be used by OVS to match on +### Antrea / Kubernetes + +- *Node Route Controller*: the [Kubernetes controller](https://kubernetes.io/docs/concepts/architecture/controller/) + which is a part of antrea-agent and watches for updates to Nodes. When a Node is added, it updates the local + networking configurations (e.g. configure the tunnel to the new Node). When a Node is deleted, it performs the + necessary clean-ups. +- *peer Node*: this is how we refer to other Nodes in the cluster, to which the local Node is connected through a Geneve, + VXLAN, GRE, or STT tunnel. +- *Antrea-native NetworkPolicy*: Antrea ClusterNetworkPolicy and Antrea NetworkPolicy CRDs, as documented + [here](../antrea-network-policy.md). +- *Service session affinity*: a Service attribute that selects the same backend Pods for connections from a particular + client. For a K8s Service, session affinity can be enabled by setting `service.spec.sessionAffinity` to `ClientIP` + (default is `None`). See [Kubernetes Service](https://kubernetes.io/docs/concepts/services-networking/service/) for + more information about session affinity. + +### OpenFlow + +- *table-miss flow*: a "catch-all" flow in an OpenFlow table, which is used if no other flow is matched. If the table-miss + flow does not exist, by default packets unmatched by flows are dropped (discarded). +- *action `conjunction`*: an efficient way in OVS to implement conjunctive matches, is a match for which multiple fields + are required to match conjunctively, each within a set of acceptable values. See [OVS + fields](http://www.openvswitch.org/support/dist-docs/ovs-fields.7.txt) for more information. +- *action `normal`*: OpenFlow defines this action to submit a packet to "the traditional non-OpenFlow pipeline of + the switch". In other words, if a flow uses this action, the packets matched by the flow traverse the switch in + the same manner as they would if OpenFlow were not configured on the switch. Antrea uses this action to process + ARP packets as a regular learning L2 switch would. 
+- *action `group`*: an action used to process forwarding decisions on multiple OVS ports. Examples include: + load-balancing, multicast, and active/standby. See [OVS group + action](https://docs.openvswitch.org/en/latest/ref/ovs-actions.7/#the-group-action) for more information. +- *action `IN_PORT`*: an action to output packets to the port on which they were received. This is the only standard way + to output the packets to the input port. +- *action `ct`*: an action to commit connections to the connection tracking module, which OVS can use to match the state of a TCP, UDP, ICMP, etc., connection. See the [OVS Conntrack - tutorial](https://docs.openvswitch.org/en/latest/tutorials/ovs-conntrack/) for - more information. -* *dmac table*: a traditional L2 switch has a "dmac" table which maps - learned destination MAC address to the appropriate egress port. It is often - the same physical table as the "smac" table (which matches on the source MAC - address and initiate MAC learning if the address is unknown). -* *group action*: an action which is used to process forwarding decisions - on multiple OVS ports. Examples include: load-balancing, multicast, and active/standby. - See [OVS group action](https://docs.openvswitch.org/en/latest/ref/ovs-actions.7/#the-group-action) - for more information. -* *IN_PORT action*: an action to output the packet to the port on which it was - received. This is the only standard way to output the packet to the input port. -* *session affinity*: a load balancer feature that always selects the same backend - Pod for connections from a particular client. For a K8s Service, session - affinity can be enabled by setting `service.spec.sessionAffinity` to `ClientIP` - (default is `None`). See [K8s Service](https://kubernetes.io/docs/concepts/services-networking/service/) - for more information about session affinity. - -**This document currently makes the following assumptions:** - -* Antrea is used in encap mode (an overlay network is created between all Nodes) -* All the Nodes are Linux Nodes -* IPv6 is disabled -* AntreaProxy is enabled -* AntreaPolicy is enabled - -## Dumping the Flows - -This guide includes a representative flow dump for every table in the pipeline, -in order to illustrate the function of each table. If you have a cluster running -Antrea, you can dump the flows for a given Node as follows: + tutorial](https://docs.openvswitch.org/en/latest/tutorials/ovs-conntrack/) for more information. +- *reg mark*: a value stored in an OVS register conveying information for a packet across the pipeline. Explore all reg + marks in the pipeline in the [OVS Registers] section. +- *ct mark*: a value stored in the field `ct_mark` of OVS conntrack, conveying information for a connection throughout + its entire lifecycle across the pipeline. Explore all values used in the pipeline in the [Ct Marks] section. +- *ct label*: a value stored in the field `ct_label` of OVS conntrack, conveying information for a connection throughout + its entire lifecycle across the pipeline. Explore all values used in the pipeline in the [Ct Labels] section. +- *ct zone*: a zone is to isolate connection tracking rules stored in the field `ct_zone` of OVS conntrack. It is + conceptually similar to the more generic Linux network namespace but is specific to conntrack and has less + overhead. Explore all the zones used in the pipeline in the [Ct Zones] section. 
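+
+As a complement to the flow and group dumps shown throughout this document, note that ct marks, ct labels, and ct
+zones live in the OVS connection tracking module rather than in the flow tables, so they do not appear in a flow dump.
+The sketch below is only an illustration of how they can be inspected directly, assuming a placeholder antrea-agent
+Pod name and using `65520`, the `CtZone` ID listed in the [Ct Zones] section.
+
+```bash
+# Dump the conntrack entries tracked in CtZone (65520); the ct_mark and ct_label
+# values attached to each connection are included in the output.
+kubectl exec -n kube-system <antrea-agent-pod-name> -c antrea-ovs -- ovs-appctl dpctl/dump-conntrack zone=65520
+```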
+ +### Misc + +- *dmac table*: a traditional L2 switch has a "dmac" table that maps the learned destination MAC address to the appropriate + egress port. It is often the same physical table as the "smac" table (which matches the source MAC address and + initiates MAC learning if the address is unknown). +- *Global Virtual MAC*: a virtual MAC address that is used as the destination MAC for all tunneled traffic across all + Nodes. This simplifies networking by enabling all Nodes to use this MAC address instead of the actual MAC address of + the appropriate remote gateway. This allows each OVS to act as a "proxy" for the local gateway when receiving + tunneled traffic and directly take care of the packet forwarding. Currently, we use a hard-coded value of + `aa:bb:cc:dd:ee:ff`. +- *Virtual Service IP*: a virtual IP address used as the source IP address for hairpin Service connections through the + Antrea gateway port. Currently, we use a hard-coded value of `169.254.0.253`. +- *Virtual NodePort DNAT IP*: a virtual IP address used as a DNAT IP address for NodePort Service connections through + Antrea gateway port. Currently, we use a hard-coded value of `169.254.0.252`. + +## Dumping the Flows / Groups + +This guide includes a representative flow dump for every table in the pipeline, to illustrate the function of each +table. If you have a cluster running Antrea, you can dump the flows or groups on a given Node as follows: + +```bash +# Dump all flows. +kubectl exec -n kube-system -c antrea-ovs -- ovs-ofctl dump-flows -O Openflow15 [--no-stats] [--names] + +# Dump all groups. +kubectl exec -n kube-system -c antrea-ovs -- ovs-ofctl dump-groups -O Openflow15 [--names] +``` + +where `` is the name of the antrea-agent Pod running on that Node, and `` is the name +of the bridge created by Antrea (`br-int` by default). + +You can also dump the flows for a specific table or group as follows: ```bash -kubectl exec -n kube-system -c antrea-ovs -- ovs-ofctl dump-flows [--no-stats] [--names] +# Dump flows of a table. +kubectl exec -n kube-system -c antrea-ovs -- ovs-ofctl dump-flows table= -O Openflow15 [--no-stats] [--names] + +# Dump a group. +kubectl exec -n kube-system -c antrea-ovs -- ovs-ofctl dump-groups -O Openflow15 [--names] ``` -where `` is the name of the Antrea Agent Pod running on -that Node and `` is the name of the bridge created by Antrea -(`br-int` by default). - -## Registers - -We use 2 32-bit OVS registers to carry information throughout the pipeline: - -* reg0 (NXM_NX_REG0): - - bits [0..3] are used to store the traffic source (from tunnel: 0, from - local gateway: 1, from local Pod: 2). It is set in [ClassifierTable]. - - bit 16 is used to indicate whether the destination MAC address of a packet - is "known", i.e. corresponds to an entry in [L2ForwardingCalcTable], which - is essentially a "dmac" table. - - bit 18 is used to indicate whether the packet should be output to the port - on which it was received. It is consumed in [L2ForwardingOutTable] - to output the packet with action `IN_PORT`. - - bit 19 is used to indicate whether the destination and source MACs of the - packet should be rewritten in [l3ForwardingTable]. The bit is set for - packets received from the tunnel port in [ClassifierTable]. The - destination MAC of such packets is the Global Virtual MAC and should be - rewritten to the destination port's MAC before output to the port. When such - a packet is destined to a Pod, its source MAC should be rewritten to the - local gateway port's MAC too. 
-* reg1 (NXM_NX_REG1): it is used to store the egress OF port for the packet. It - is set in [DNATTable] for traffic destined to Services and in - [L2ForwardingCalcTable] otherwise. It is consumed in [L2ForwardingOutTable] to - output each packet to the correct port. -* reg3 (NXM_NX_REG3): it is used to store selected Service Endpoint IPv4 address - in OVS group entry. It is consumed in [EndpointDNATTable]. -* reg4 (NXM_NX_REG4): - * bits [0..16] are used to store selected Service Endpoint port number in OVS - group entry. They are consumed in [EndpointDNATTable]. - * bits [17..18] are used to store the state of a Service request packet. - Marks in this field include, - * 0b001: packet needs to do Endpoint selection. - * 0b010: packet has done Endpoint selection. - * 0b011: packet has done Endpoint selection and the selection result needs to - be cached. - -## Network Policy Implementation - -Several tables of the pipeline are dedicated to [K8s Network -Policy](https://kubernetes.io/docs/concepts/services-networking/network-policies/) -implementation ([EgressRuleTable], [EgressDefaultTable], [IngressRuleTable] and -[IngressDefaultTable]). - -The Antrea implementation of K8s Network Policy, including the communication -channel between the Controller and Agents, and how a Network Policy is mapped to -OVS flows at each Node, will be described in details in a separate document. For -the present document, we will use the Network Policy example below, and explain -how these simple ingress and egress rules map to individual flows as we describe -the relevant tables of our pipeline. +where `` is the name of a table in the pipeline, and `` is the ID of a group. + +## OVS Registers + +We use some OVS registers to carry information throughout the pipeline. To enhance usability, we assign friendly names +to the registers we use. + +| Register | Field Range | Field Name | Reg Mark Value | Reg Mark Name | Description | +|---------------|-------------|---------------------------------|----------------|---------------------------------|------------------------------------------------------------------------------------------------------| +| NXM_NX_REG0 | bits 0-3 | PktSourceField | 0x1 | FromTunnelRegMark | Packet source is tunnel port. | +| | | | 0x2 | FromGatewayRegMark | Packet source is the local Antrea gateway port. | +| | | | 0x3 | FromPodRegMark | Packet source is local Pod port. | +| | | | 0x4 | FromUplinkRegMark | Packet source is uplink port. | +| | | | 0x5 | FromBridgeRegMark | Packet source is local bridge port. | +| | | | 0x6 | FromTCReturnRegMark | Packet source is TrafficControl return port. | +| | bits 4-7 | PktDestinationField | 0x1 | ToTunnelRegMark | Packet destination is tunnel port. | +| | | | 0x2 | ToGatewayRegMark | Packet destination is the local Antrea gateway port. | +| | | | 0x3 | ToLocalRegMark | Packet destination is local Pod port. | +| | | | 0x4 | ToUplinkRegMark | Packet destination is uplink port. | +| | | | 0x5 | ToBridgeRegMark | Packet destination is local bridge port. | +| | bit 9 | | 0b0 | NotRewriteMACRegMark | Packet's source/destination MAC address does not need to be rewritten. | +| | | | 0b1 | RewriteMACRegMark | Packet's source/destination MAC address needs to be rewritten. | +| | bit 10 | | 0b1 | APDenyRegMark | Packet denied (Drop/Reject) by Antrea NetworkPolicy. | +| | bits 11-12 | APDispositionField | 0b00 | DispositionAllowRegMark | Indicates Antrea NetworkPolicy disposition: allow. 
| +| | | | 0b01 | DispositionDropRegMark | Indicates Antrea NetworkPolicy disposition: drop. | +| | | | 0b11 | DispositionPassRegMark | Indicates Antrea NetworkPolicy disposition: pass. | +| | bit 13 | | 0b1 | GeneratedRejectPacketOutRegMark | Indicates packet is a generated reject response packet-out. | +| | bit 14 | | 0b1 | SvcNoEpRegMark | Indicates packet towards a Service without Endpoint. | +| | bit 19 | | 0b1 | RemoteSNATRegMark | Indicates packet needs SNAT on a remote Node. | +| | bit 22 | | 0b1 | L7NPRedirectRegMark | Indicates L7 Antrea NetworkPolicy disposition of redirect. | +| | bits 21-22 | OutputRegField | 0b01 | OutputToOFPortRegMark | Output packet to an OVS port. | +| | | | 0b10 | OutputToControllerRegMark | Send packet to Antrea Agent. | +| | bits 25-32 | PacketInOperationField | | | Field to store NetworkPolicy packetIn operation. | +| NXM_NX_REG1 | bits 0-31 | TargetOFPortField | | | Egress OVS port of packet. | +| NXM_NX_REG2 | bits 0-31 | SwapField | | | Swap values in flow fields in OpenFlow actions. | +| | bits 0-7 | PacketInTableField | | | OVS table where it was decided to send packets to the controller (Antrea Agent). | +| NXM_NX_REG3 | bits 0-31 | EndpointIPField | | | Field to store IPv4 address of the selected Service Endpoint. | +| | | APConjIDField | | | Field to store Conjunction ID for Antrea Policy. | +| NXM_NX_REG4 | bits 0-15 | EndpointPortField | | | Field store TCP/UDP/SCTP port of a Service's selected Endpoint. | +| | bits 16-18 | ServiceEPStateField | 0b001 | EpToSelectRegMark | Packet needs to do Service Endpoint selection. | +| | bits 16-18 | ServiceEPStateField | 0b010 | EpSelectedRegMark | Packet has done Service Endpoint selection. | +| | bits 16-18 | ServiceEPStateField | 0b011 | EpToLearnRegMark | Packet has done Service Endpoint selection and the selected Endpoint needs to be cached. | +| | bits 0-18 | EpUnionField | | | The union value of EndpointPortField and ServiceEPStateField. | +| | bit 19 | | 0b1 | ToNodePortAddressRegMark | Packet is destined for a Service of type NodePort. | +| | bit 20 | | 0b1 | AntreaFlexibleIPAMRegMark | Packet is from local Antrea IPAM Pod. | +| | bit 20 | | 0b0 | NotAntreaFlexibleIPAMRegMark | Packet is not from local Antrea IPAM Pod. | +| | bit 21 | | 0b1 | ToExternalAddressRegMark | Packet is destined for a Service's external IP. | +| | bits 22-23 | TrafficControlActionField | 0b01 | TrafficControlMirrorRegMark | Indicates packet needs to be mirrored (used by TrafficControl). | +| | | | 0b10 | TrafficControlRedirectRegMark | Indicates packet needs to be redirected (used by TrafficControl). | +| | bit 24 | | 0b1 | NestedServiceRegMark | Packet is destined for a Service using other Services as Endpoints. | +| | bit 25 | | 0b1 | DSRServiceRegMark | Packet is destined for a Service working in DSR mode. | +| | | | 0b0 | NotDSRServiceRegMark | Packet is destined for a Service working in non-DSR mode. | +| | bit 26 | | 0b1 | RemoteEndpointRegMark | Packet is destined for a Service selecting a remote non-hostNetwork Endpoint. | +| | bit 27 | | 0b1 | FromExternalRegMark | Packet is from Antrea gateway, but its source IP is not the gateway IP. | +| | bit 28 | | 0b1 | FromLocalRegMark | Packet is from a local Pod or the Node. | +| NXM_NX_REG5 | bits 0-31 | TFEgressConjIDField | | | Egress conjunction ID hit by TraceFlow packet. | +| NXM_NX_REG6 | bits 0-31 | TFIngressConjIDField | | | Ingress conjunction ID hit by TraceFlow packet. 
| +| NXM_NX_REG7 | bits 0-31 | ServiceGroupIDField | | | GroupID corresponding to the Service. | +| NXM_NX_REG8 | bits 0-11 | VLANIDField | | | VLAN ID. | +| | bits 12-15 | CtZoneTypeField | 0b0001 | IPCtZoneTypeRegMark | Ct zone type is IPv4. | +| | | | 0b0011 | IPv6CtZoneTypeRegMark | Ct zone type is IPv6. | +| | bits 0-15 | CtZoneField | | | Ct zone ID is a combination of VLANIDField and CtZoneTypeField. | +| NXM_NX_REG9 | bits 0-31 | TrafficControlTargetOFPortField | | | Field to cache the OVS port to output packets to be mirrored or redirected (used by TrafficControl). | +| NXM_NX_XXREG3 | bits 0-127 | EndpointIP6Field | | | Field to store IPv6 address of the selected Service Endpoint. | + +Note that reg marks that have overlapped bits will not be used at the same time, such as `SwapField` and `PacketInTableField`. + +## OVS Ct Mark + +We use some bits of the `ct_mark` field of OVS conntrack to carry information throughout the pipeline. To enhance +usability, we assign friendly names to the bits we use. + +| Field Range | Field Name | Ct Mark Value | Ct Mark Name | Description | +|-------------|-----------------------|---------------|--------------------|-----------------------------------------------------------------| +| bits 0-3 | ConnSourceCTMarkField | 0b0010 | FromGatewayCTMark | Connection source is the Antrea gateway port. | +| | | 0b0101 | FromBridgeCTMark | Connection source is the local bridge port. | +| bit 4 | | 0b1 | ServiceCTMark | Connection is for Service. | +| | | 0b0 | NotServiceCTMark | Connection is not for Service. | +| bit 5 | | 0b1 | ConnSNATCTMark | SNAT'd connection for Service. | +| bit 6 | | 0b1 | HairpinCTMark | Hair-pin connection. | +| bit 7 | | 0b1 | L7NPRedirectCTMark | Connection should be redirected to an application-aware engine. | + +## OVS Ct Label + +We use some bits of the `ct_label` field of OVS conntrack to carry information throughout the pipeline. To enhance +usability, we assign friendly names to the bits we use. + +| Field Range | Field Name | Description | +|-------------|-----------------------|------------------------------------| +| bits 0-31 | IngressRuleCTLabel | Ingress rule ID. | +| bits 32-63 | EgressRuleCTLabel | Egress rule ID. | +| bits 64-75 | L7NPRuleVlanIDCTLabel | VLAN ID for L7 NetworkPolicy rule. | + +## OVS Ct Zone + +We use some OVS conntrack zones to isolate connection tracking rules. To enhance usability, we assign friendly names to +the ct zones. + +| Zone ID | Zone Name | Description | +|---------|--------------|----------------------------------------------------| +| 65520 | CtZone | Tracking IPv4 connections that don't require SNAT. | +| 65521 | SNATCtZone | Tracking IPv4 connections that require SNAT. | + +## Kubernetes NetworkPolicy Implementation + +Several tables of the pipeline are dedicated to [Kubernetes +NetworkPolicy](https://kubernetes.io/docs/concepts/services-networking/network-policies/) implementation (tables +[EgressRule], [EgressDefaultRule], [IngressRule], and [IngressDefaultRule]). + +Throughout this document, the following K8s NetworkPolicy example is used to demonstrate how simple ingress and egress +policy rules are mapped to OVS flows. + +This K8s NetworkPolicy is applied to Pods with the label `app: web` in the `default` Namespace. For these Pods, only TCP +traffic on port 80 from Pods with the label `app: client` and to Pods with the label `app: db` is allowed. 
Because +Antrea will only install OVS flows for this K8s NetworkPolicy on Nodes that have Pods selected by the policy, we have +scheduled an `app: web` Pod on the current Node from which the sample flows in this document are dumped. The Pod has +been assigned an IP address `10.10.0.19` from the Antrea CNI, so you will see the IP address shown in the associated +flows. ```yaml apiVersion: networking.k8s.io/v1 kind: NetworkPolicy metadata: - name: test-network-policy + name: web-app-db-network-policy namespace: default spec: podSelector: matchLabels: - app: nginx + app: web policyTypes: - - Ingress - - Egress + - Ingress + - Egress ingress: - - from: - - podSelector: - matchLabels: - app: nginx - ports: + - from: + - podSelector: + matchLabels: + app: client + ports: + - protocol: TCP + port: 80 + egress: + - to: + - podSelector: + matchLabels: + app: db + ports: + - protocol: TCP + port: 3306 +``` + +## Kubernetes Service Implementation + +Like K8s NetworkPolicy, several tables of the pipeline are dedicated to [Kubernetes +Service](https://kubernetes.io/docs/concepts/services-networking/service/) implementation (tables [NodePortMark], +[SessionAffinity], [ServiceLB], and [EndpointDNAT]). + +By enabling `proxyAll`, ClusterIP, NodePort, LoadBalancer, and ExternalIP are all handled by AntreaProxy. Otherwise, +only in-cluster ClusterIP is handled. In this document, we use the sample K8s Services below. These Services select Pods +with the label `app: web` as Endpoints. + +### ClusterIP without Endpoint + +A sample Service with `clusterIP` set to `10.101.255.29` does not have any associated Endpoint. + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: sample-clusterip-no-ep +spec: + ports: - protocol: TCP port: 80 - egress: - - to: - - podSelector: - matchLabels: - app: nginx - ports: + targetPort: 80 + clusterIP: 10.101.255.29 +``` + +### ClusterIP + +A sample ClusterIP Service with `clusterIP` set to `10.105.31.235`. + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: sample-clusterip +spec: + selector: + app: web + ports: - protocol: TCP port: 80 + targetPort: 80 + clusterIP: 10.105.31.235 ``` -This Network Policy is applied to all Pods with the `nginx` app label in the -`default` Namespace. For these Pods, it only allows TCP traffic on port 80 from -and to Pods which also have the `nginx` app label. Because Antrea will only -install OVS flows for this Network Policy on Nodes for which some of the Pods -are the target of the policy, we have scheduled 2 `nginx` Pods on the same -Node. They received IP addresses 10.10.1.2 and 10.10.1.3 from the Antrea CNI, so -you will see these addresses show up in the OVS flows. +### NodePort -## Antrea-native Policies Implementation +A sample NodePort Service with `nodePort` set to `30004`. -In addition to the above tables created for K8s NetworkPolicy, Antrea creates -additional dedicated tables to support the [Antrea-native policies](../antrea-network-policy.md) -([AntreaPolicyEgressRuleTable] and [AntreaPolicyIngressRuleTable]). +```yaml +apiVersion: v1 +kind: Service +metadata: + name: sample-nodeport +spec: + selector: + app: web + ports: + - protocol: TCP + port: 80 + targetPort: 80 + nodePort: 30004 + type: NodePort +``` -Consider the following Antrea ClusterNetworkPolicy (ACNP) in the Application tier as an -example for the remainder of this document. +### LoadBalancer + +A sample LoadBalancer Service with ingress IP `192.168.77.150` assigned by an ingress controller. 
+ +```yaml +apiVersion: v1 +kind: Service +metadata: + name: sample-loadbalancer +spec: + selector: + app: web + ports: + - protocol: TCP + port: 80 + targetPort: 80 + type: LoadBalancer +status: + loadBalancer: + ingress: + - ip: 192.168.77.150 +``` + +### Service with ExternalIP + +A sample Service with external IP `192.168.77.200`. + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: sample-service-externalip +spec: + selector: + app: web + ports: + - protocol: TCP + port: 80 + targetPort: 80 + externalIPs: + - 192.168.77.200 +``` + +### Service with Session Affinity + +A sample Service configured with session affinity. + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: sample-service-session-affinity +spec: + selector: + app: web + ports: + - protocol: TCP + port: 80 + targetPort: 80 + clusterIP: 10.96.76.15 + sessionAffinity: ClientIP + sessionAffinityConfig: + clientIP: + timeoutSeconds: 300 +``` + +### Service with ExternalTrafficPolicy Local + +A sample Service configured `externalTrafficPolicy` to `Local`. Only `externalTrafficPolicy` of NodePort/LoadBalancer +Service can be configured with `Local`. + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: sample-service-etp-local +spec: + selector: + app: web + ports: + - protocol: TCP + port: 80 + targetPort: 80 + type: LoadBalancer + externalTrafficPolicy: Local +status: + loadBalancer: + ingress: + - ip: 192.168.77.151 +``` + +## Antrea-native NetworkPolicy Implementation + +In addition to the tables created for K8s NetworkPolicy, Antrea creates additional dedicated tables to support +[Antrea-native NetworkPolicy](../antrea-network-policy.md) (tables [AntreaPolicyEgressRule] and +[AntreaPolicyIngressRule]). + +Consider the following Antrea ClusterNetworkPolicy (ACNP) in the Application Tier as an example for the remainder of +this document. + +This ACNP is applied to all Pods with the label `app: web` in all Namespaces. For these Pods, only TCP traffic on port +80 from the Pods with the label `app: client` and to the Pods with the label `app: db` is allowed. Similar to K8s +NetworkPolicy, Antrea will only install OVS flows for this policy on Nodes that have Pods selected by the policy. + +This policy has very similar rules as the K8s NetworkPolicy example shown previously. This is intentional to simplify +this document and to allow easier comparison between the flows generated for both types of policies. Additionally, we +should emphasize that this policy applies to Pods across all Namespaces, while a K8s NetworkPolicy is always scoped to +a specific Namespace (in the case of our example, the default Namespace). ```yaml apiVersion: crd.antrea.io/v1beta1 kind: ClusterNetworkPolicy metadata: - name: cnp0 + name: web-app-db-network-policy spec: - priority: 10 - tier: application # defaults to application tier if not specified + priority: 5 + tier: application appliedTo: - podSelector: matchLabels: - app: server + app: web ingress: - - action: Drop + - action: Allow from: - podSelector: matchLabels: - app: notClient + app: client ports: - protocol: TCP port: 80 + name: AllowFromClient + - action: Drop egress: - action: Allow to: - podSelector: matchLabels: - app: dns + app: db ports: - - protocol: UDP - port: 53 + - protocol: TCP + port: 3306 + name: AllowToDB + - action: Drop ``` -This ACNP is applied to all Pods with the `app: server` label in all -Namespaces. For these Pods, it drops TCP traffic on port 80 from all -Pods which have the `app: notClient` label. 
In addition to the ingress rules, -this policy also allows egress UDP traffic on port 53 to all Pods with the -label `app: dns`. Similar to K8s NetworkPolicy, Antrea will only install OVS -flows for this ACNP on Nodes for which some of the Pods are the target of the -policy. Thus, we have scheduled three Pods (appServer, appDns, appNotClient) -on the same Node and they have the following IP addresses: +## Antrea-native L7 NetworkPolicy Implementation + +In addition to layer 3 and layer 4 policies mentioned above, [Antrea-native Layer 7 +NetworkPolicy](../antrea-l7-network-policy.md) is also supported in Antrea. The main difference is that Antrea-native L7 +NetworkPolicy uses layer 7 protocol to filter traffic, not layer 3 or layer 4 protocol. + +Consider the following Antrea-native L7 NetworkPolicy in the Application Tier as an example for the remainder of this +document. + +This ACNP is applied to all Pods with the label `app: web` in all Namespaces. It allows only HTTP ingress traffic on +port 8080 from Pods with the label `app: client`, limited to the `GET` method and `/api/v2/*` path. Any other HTTP +ingress traffic on port 8080 from Pods the label `app: client` will be dropped. -- appServer: 10.10.1.6 -- appNotClient: 10.10.1.7 -- appDns: 10.10.1.8 +```yaml +apiVersion: crd.antrea.io/v1beta1 +kind: ClusterNetworkPolicy +metadata: + name: ingress-allow-http-request-to-api-v2 +spec: + priority: 4 + tier: application + appliedTo: + - podSelector: + matchLabels: + app: web + ingress: + - name: AllowFromClientL7 + action: Allow + from: + - podSelector: + matchLabels: + app: client + ports: + - protocol: TCP + port: 8080 + l7Protocols: + - http: + path: "/api/v2/*" + method: "GET" +``` -## Tables +## TrafficControl Implementation -![OVS pipeline](../assets/ovs-pipeline-antrea-proxy.svg) +[TrafficControl](../traffic-control.md) is a CRD API that manages and manipulates the transmission of Pod traffic. +Antrea creates a dedicated table [TrafficControl] to implement feature `TrafficControl`. We will use the following +TrafficControls as examples for the remainder of this document. -### ClassifierTable (0) +This is a TrafficControl applied to Pods with the label `app: web`. For these Pods, both ingress and egress traffic will +be redirected to port `antrea-tc-tap0`, and returned back through port `antrea-tc-tap1`. -This table is used to determine which "category" of traffic (tunnel, local -gateway or local Pod) the packet belongs to. This is done by matching on the -ingress port for the packet. The appropriate value is then written to bits -[0..3] in NXM_NX_REG0: 0 for tunnel, 1 for local gateway and 2 for local Pod. -This information is used by matches in subsequent tables. For a packet received -from the tunnel port, bit 19 in NXM_NX_REG0 is set to 1, to indicate MAC rewrite -should be performed for the packet in [L3ForwardingTable]. +```yaml +apiVersion: crd.antrea.io/v1alpha2 +kind: TrafficControl +metadata: + name: redirect-web-to-local +spec: + appliedTo: + podSelector: + matchLabels: + app: web + direction: Both + action: Redirect + targetPort: + ovsInternal: + name: antrea-tc-tap0 + returnPort: + ovsInternal: + name: antrea-tc-tap1 +``` -If you dump the flows for this table, you may see the following: +This is a TrafficControl applied to Pods with the label `app: db`. For these Pods, both ingress and egress will be +mirrored (duplicated) to port `antrea-tc-tap2`. -```text -1. table=0, priority=200,in_port=32769 actions=set_field:0x1/0xf->reg0,resubmit(,10) -2. 
table=0, priority=200,in_port=32768 actions=set_field:0/0xf->reg0,load:0x1->NXM_NX_REG0[19],resubmit(,30) -3. table=0, priority=190,in_port=4 actions=set_field:0x2/0xf->reg0,resubmit(,10) -4. table=0, priority=190,in_port=32770 actions=set_field:0x2/0xf->reg0,resubmit(,10) -5. table=0, priority=0 actions=drop +```yaml +apiVersion: crd.antrea.io/v1alpha2 +kind: TrafficControl +metadata: + name: mirror-db-to-local +spec: + appliedTo: + podSelector: + matchLabels: + app: db + direction: Both + action: Mirror + targetPort: + ovsInternal: + name: antrea-tc-tap2 +``` + +## Egress Implementation + +Table [EgressMark] is dedicated to the implementation of feature `Egress`. + +Consider the following Egresses as examples for the remainder of this document. + +This is an Egress applied to Pods with the label `app: web`. For these Pods, all egress traffic will be SNAT'd on the +Node `k8s-node-control-plane` from which we dumped flows in the document with the Egress IP `192.168.77.112`. + +```yaml +apiVersion: crd.antrea.io/v1beta1 +kind: Egress +metadata: + name: egress-web +spec: + appliedTo: + podSelector: + matchLabels: + app: web + egressIP: 192.168.77.112 +status: + egressNode: k8s-node-control-plane ``` -Flow 1 is for traffic coming in on the local gateway. Flow 2 is for traffic -coming in through an overlay tunnel (i.e. from another Node). The next two -flows (3 and 4) are for local Pods. +This is an Egress applied to Pods with the label `app: client`. For these Pods, all egress traffic will be SNAT'd on the +Node `k8s-node-worker-1` with the Egress IP `192.168.77.113`. -Local traffic then goes to [SpoofGuardTable], while tunnel traffic from other -Nodes goes to [ConntrackTable]. The table-miss flow entry will drop all -unmatched packets (in practice this flow entry should almost never be used). +```yaml +apiVersion: crd.antrea.io/v1beta1 +kind: Egress +metadata: + name: egress-client +spec: + appliedTo: + podSelector: + matchLabels: + app: client + egressIP: 192.168.77.113 +status: + egressNode: k8s-node-worker-1 +``` -### SpoofGuardTable (10) +## OVS Tables -This table prevents IP and ARP -[spoofing](https://en.wikipedia.org/wiki/Spoofing_attack) from local Pods. For -each Pod (as identified by the ingress port), we ensure that: +![OVS pipeline](../assets/ovs-pipeline.svg) -* for IP traffic, the source IP and MAC addresses are correct, i.e. match the - values configured on the interface when Antrea set-up networking for the Pod. -* for ARP traffic, the advertised IP and MAC addresses are correct, i.e. match - the values configured on the interface when Antrea set-up networking for the - Pod. +### PipelineRootClassifier -Because Antrea currently relies on kube-proxy to load-balance traffic destined -to Services, implementing that kind of IP spoofing check for traffic coming-in -on the local gateway port is not as trivial. Traffic from local Pods destined to -Services will first go through the gateway, get load-balanced by the kube-proxy -datapath (DNAT) then sent back through the gateway. This means that legitimate -traffic can be received on the gateway port with a source IP belonging to a -local Pod. We may add some fine-grained rules in the future to accommodate for -this, but for now we just allow all IP traffic received from the gateway. We do -have an ARP spoofing check for the gateway however, since there is no reason for -the host to advertise a different MAC address on antrea-gw0. 
+This table serves as the primary entry point in the pipeline, forwarding packets to different tables based on their +respective protocols. -If you dump the flows for this table, you may see the following: +If you dump the flows of this table, you may see the following: ```text -1. table=10, priority=200,ip,in_port=32769 actions=resubmit(,23) -2. table=10, priority=200,arp,in_port=32769,arp_spa=10.10.0.1,arp_sha=3a:dd:79:0f:55:4c actions=resubmit(,20) -3. table=10, priority=200,arp,in_port=4,arp_spa=10.10.0.2,arp_sha=ce:99:ca:bd:62:c5 actions=resubmit(,20) -4. table=10, priority=200,arp,in_port=32770,arp_spa=10.10.0.3,arp_sha=3a:41:49:42:98:69 actions=resubmit(,20) -5. table=10, priority=200,ip,in_port=4,dl_src=ce:99:ca:bd:62:c5,nw_src=10.10.0.2 actions=resubmit(,23) -6. table=10, priority=200,ip,in_port=32770,dl_src=3a:41:49:42:98:69,nw_src=10.10.0.3 actions=resubmit(,23) -7. table=10, priority=0 actions=drop +1. table=PipelineRootClassifier, priority=200,arp actions=goto_table:ARPSpoofGuard +2. table=PipelineRootClassifier, priority=200,ip actions=goto_table:Classifier +3. table=PipelineRootClassifier, priority=0 actions=drop ``` -After this table, ARP traffic goes to [ARPResponderTable], while IP -traffic goes to [ServiceHairpinTable]. Traffic which does not match -any of the rules described above will be dropped by the table-miss flow entry. +Flow 1 forwards ARP packets to table [ARPSpoofGuard]. + +Flow 2 forwards IP packets to table [Classifier]. -### ARPResponderTable (20) +Flow 3 is the table-miss flow to drop other unsupported protocols, not normally used. -The main purpose of this table is to reply to ARP requests from the local -gateway asking for the MAC address of a remote peer gateway (another Node's -gateway). This ensures that the local Node can reach any remote Pod, which in -particular is required for Service traffic which has been load-balanced to a -remote Pod backend by kube-proxy. Note that the table is programmed to reply to -such ARP requests with a "Global Virtual MAC" ("Global" because it is used by -all Antrea OVS bridges), and not with the actual MAC address of the remote -gateway. This ensures that once the traffic is received by the remote OVS -bridge, it can be directly forwarded to the appropriate Pod without actually -going through the gateway. The Virtual MAC is used as the destination MAC -address for all the traffic being tunnelled. +### ARPSpoofGuard -If you dump the flows for this table, you may see the following: +This table is designed to drop ARP [spoofing](https://en.wikipedia.org/wiki/Spoofing_attack) packets from local Pods or +the local Antrea gateway. We ensure that the advertised IP and MAC addresses are correct, meaning they match the values +configured on the interface when Antrea sets up networking for a local Pod or the local Antrea gateway. + +If you dump the flows of this table, you may see the following: ```text -1. table=20, priority=200,arp,arp_tpa=10.10.1.1,arp_op=1 actions=move:NXM_OF_ETH_SRC[]->NXM_OF_ETH_DST[],mod_dl_src:aa:bb:cc:dd:ee:ff,set_field:2->arp_op,move:NXM_NX_ARP_SHA[]->NXM_NX_ARP_THA[],load:0xaabbccddeeff->NXM_NX_ARP_SHA[],move:NXM_OF_ARP_SPA[]->NXM_OF_ARP_TPA[],load:0xa0a0101->NXM_OF_ARP_SPA[],IN_PORT -2. table=20, priority=190,arp actions=NORMAL -3. table=20, priority=0 actions=drop +1. table=ARPSpoofGuard, priority=200,arp,in_port="antrea-gw0",arp_spa=10.10.0.1,arp_sha=ba:5e:d1:55:aa:c0 actions=goto_table:ARPResponder +2. 
table=ARPSpoofGuard, priority=200,arp,in_port="client-6-3353ef",arp_spa=10.10.0.26,arp_sha=5e:b5:e3:a6:90:b7 actions=goto_table:ARPResponder +3. table=ARPSpoofGuard, priority=200,arp,in_port="web-7975-274540",arp_spa=10.10.0.24,arp_sha=fa:b7:53:74:21:a6 actions=goto_table:ARPResponder +4. table=ARPSpoofGuard, priority=200,arp,in_port="db-755c6-5080e3",arp_spa=10.10.0.25,arp_sha=36:48:21:a2:9d:b4 actions=goto_table:ARPResponder +5. table=ARPSpoofGuard, priority=0 actions=drop ``` -Flow 1 is the "ARP responder" for the peer Node whose local Pod subnet is -10.10.1.0/24. If we were to look at the routing table for the local Node, we -would see the following "onlink" route: +Flow 1 matches legitimate ARP packets from the local Antrea gateway. + +Flows 2-4 match legitimate ARP packets from local Pods. + +Flow 5 is the table-miss flow to drop ARP spoofing packets, which are not matched by flows 1-4. + +### ARPResponder + +The purpose of this table is to handle ARP requests from the local Antrea gateway or local Pods, addressing specific cases: + +1. Responding to ARP requests from the local Antrea gateway seeking the MAC address of a remote Antrea gateway located + on a different Node. This ensures that the local Node can reach any remote Pods. +2. Ensuring the normal layer 2 (L2) learning among local Pods and the local Antrea gateway. + +If you dump the flows of this table, you may see the following: + +```text +1. table=ARPResponder, priority=200,arp,arp_tpa=10.10.1.1,arp_op=1 actions=move:NXM_OF_ETH_SRC[]->NXM_OF_ETH_DST[],set_field:aa:bb:cc:dd:ee:ff->eth_src,set_field:2->arp_op,move:NXM_NX_ARP_SHA[]->NXM_NX_ARP_THA[],set_field:aa:bb:cc:dd:ee:ff->arp_sha,move:NXM_OF_ARP_SPA[]->NXM_OF_ARP_TPA[],set_field:10.10.1.1->arp_spa,IN_PORT +2. table=ARPResponder, priority=190,arp actions=NORMAL +3. table=ARPResponder, priority=0 actions=drop +``` + +Flow 1 is designed for case 1, matching ARP request packets for the MAC address of a remote Antrea gateway with IP address +`10.10.1.1`. It programs an ARP reply packet and sends it back to the port where the request packet was received. Note +that both the source hardware address and the source MAC address in the ARP reply packet are set with the *Global Virtual +MAC* `aa:bb:cc:dd:ee:ff`, not the actual MAC address of the remote Antrea gateway. This ensures that once the traffic is +received by the remote OVS bridge, it can be directly forwarded to the appropriate Pod without actually going through +the local Antrea gateway. The *Global Virtual MAC* is used as the destination MAC address for all the traffic being +tunneled or routed. + +This flow serves as the "ARP responder" for the peer Node whose local Pod subnet is `10.10.1.0/24`. If we were to look +at the routing table for the local Node, we would find the following "onlink" route: ```text 10.10.1.0/24 via 10.10.1.1 dev antrea-gw0 onlink ``` -A similar route is installed on the gateway (antrea-gw0) interface every time the -Antrea Node Route Controller is notified that a new Node has joined the -cluster. The route must be marked as "onlink" since the kernel does not have a -route to the peer gateway 10.10.1.1: we trick the kernel into believing that -10.10.1.1 is directly connected to the local Node, even though it is on the -other side of the tunnel. +A similar route is installed on the local Antrea gateway (antrea-gw0) interface every time the Antrea *Node Route Controller* +is notified that a new Node has joined the cluster. 
The route must be marked as "onlink" since the kernel does not have +a route to the peer gateway `10.10.1.1`. we trick the kernel into believing that `10.10.1.1` is directly connected to +the local Node, even though it is on the other side of the tunnel. + +Flow 2 is designed for case 2, ensuring that OVS handles the remainder of ARP traffic as a regular L2 learning switch +(using the `normal` action). In particular, this takes care of forwarding ARP requests and replies among local Pods. -Flow 2 ensures that OVS handle the remainder of ARP traffic as a regular L2 -learning switch (using the `normal` action). In particular, this takes care of -forwarding ARP requests and replies between local Pods. +Flow 3 is the table-miss flow, which should never be used since ARP packets will be matched by either flow 1 or 2. -The table-miss flow entry (flow 3) will drop all other packets. This flow should -never be used because only ARP traffic should go to this table, and -ARP traffic will either match flow 1 or flow 2. +### Classifier -### ServiceHairpinTable (23) +This table is designed to determine the "category" of packets by matching the ingress port of the packets. It +addresses specific cases: -When a backend Pod of a Service accesses the Service, and the Pod itself is selected -as the destination, then we have the hairpin case, in which the source IP should be -SNAT'd with a virtual hairpin IP in [hairpinSNATTable]. The source and destination -IP addresses cannot be the same, otherwise the connection will be broken. It will be -explained in detail in [hairpinSNATTable]. For response packets, the -destination IP is the virtual hairpin IP, so the destination IP should be changed back -to the IP of the backend Pod. Then the response packets can be forwarded back correctly. +1. Packets originating from the local Node through the local Antrea gateway port, requiring IP spoof legitimacy + verification. +2. Packets originating from the external network through the Antrea gateway port. +3. Packets received through an overlay tunnel. +4. Packets received through a return port defined in a user-provided TrafficControl CR (for feature `TrafficControl`). +5. Packets returned back from an application-aware engine through a specific port (for feature `L7NetworkPolicy`). +6. Packets originating from local Pods, requiring IP spoof legitimacy verification. -If you dump the flows for this table, you should see the flows: +If you dump the flows of this table, you may see the following: ```text -1. table=23, priority=200,ip,nw_dst=169.254.169.252 actions=move:NXM_OF_IP_SRC[]->NXM_OF_IP_DST[],load:0x1->NXM_NX_REG0[18],resubmit(,30) -2. table=23, priority=0 actions=resubmit(,24) +1. table=Classifier, priority=210,ip,in_port="antrea-gw0",nw_src=10.10.0.1 actions=set_field:0x2/0xf->reg0,set_field:0x10000000/0x10000000->reg4,goto_table:SpoofGuard +2. table=Classifier, priority=200,in_port="antrea-gw0" actions=set_field:0x2/0xf->reg0,set_field:0x8000000/0x8000000->reg4,goto_table:SpoofGuard +3. table=Classifier, priority=200,in_port="antrea-tun0" actions=set_field:0x1/0xf->reg0,set_field:0x200/0x200->reg0,goto_table:UnSNAT +4. table=Classifier, priority=200,in_port="antrea-tc-tap2" actions=set_field:0x6/0xf->reg0,goto_table:L3Forwarding +5. table=Classifier, priority=200,in_port="antrea-l7-tap1",vlan_tci=0x1000/0x1000 actions=pop_vlan,set_field:0x6/0xf->reg0,goto_table:L3Forwarding +6. 
table=Classifier, priority=190,in_port="client-6-3353ef" actions=set_field:0x3/0xf->reg0,set_field:0x10000000/0x10000000->reg4,goto_table:SpoofGuard +7. table=Classifier, priority=190,in_port="web-7975-274540" actions=set_field:0x3/0xf->reg0,set_field:0x10000000/0x10000000->reg4,goto_table:SpoofGuard +8. table=Classifier, priority=190,in_port="db-755c6-5080e3" actions=set_field:0x3/0xf->reg0,set_field:0x10000000/0x10000000->reg4,goto_table:SpoofGuard +9. table=Classifier, priority=0 actions=drop ``` -Flow 1 is used to match packet whose destination IP is virtual hairpin IP and -change the destination IP of the matched packet by loading register `NXM_OF_IP_SRC` -to `NXM_OF_IP_DST`. Bit 18 in NXM_NX_REG0 is set to 0x1, which indicates that the -packet should be output to the port on which it was received, which is done in -[L2ForwardingOutTable]. +Flow 1 is designed for case 1, matching the source IP address `10.10.0.1` to ensure that the packets are originating from +the local Antrea gateway. The following reg marks are loaded: -### ConntrackTable (30) +- `FromGatewayRegMark`, indicating that the packets are received on the local Antrea gateway port, which will be + consumed in tables [L3Forwarding], [L3DecTTL], [SNATMark] and [SNAT]. +- `FromLocalRegMark`, indicating that the packets are from the local Node, which will be consumed in table [ServiceLB]. -The sole purpose of this table is to invoke the `ct` action on all packets and -set the `ct_zone` (connection tracking context) to a hard-coded value, then -forward traffic to [ConntrackStateTable]. If you dump the flows for this table, -you should only see 1 flow: +Flow 2 is designed for case 2, matching packets originating from the external network through the Antrea gateway port +and forwarding them to table [SpoofGuard]. Since packets originating from the local Antrea gateway are matched by flow +1, flow 2 can only match packets originating from the external network. The following reg marks are loaded: -```text -1. table=30, priority=200,ip actions=ct(table=31,zone=65520) -``` +- `FromGatewayRegMark`, the same as flow 1. +- `FromExternalRegMark`, indicating that the packets are from the external network, not the local Node. + +Flow 3 is for case 3, matching packets through an overlay tunnel (i.e., from another Node) and forwarding them to table +[UnSNAT]. This approach is based on the understanding that these packets originate from remote Nodes, potentially +bearing varying source IP addresses. These packets undergo legitimacy verification before being tunneled. As a consequence, +packets from the tunnel should be seamlessly forwarded to table [UnSNAT]. The following reg marks are loaded: + +- `FromTunnelRegMark`, indicating that the packets are received on a tunnel, consumed in table [L3Forwarding]. +- `RewriteMACRegMark`, indicating that the source and destination MAC addresses of the packets should be rewritten, + and consumed in table [L3Forwarding]. -A `ct_zone` is simply used to isolate connection tracking rules. It is similar -in spirit to the more generic Linux network namespaces, but `ct_zone` is -specific to conntrack and has less overhead. +Flow 4 is for case 4, matching packets from a TrafficControl return port and forwarding them to table [L3Forwarding] +to decide the egress port. It's important to note that both the source and destination MAC addresses of the packets have +been set to the expected state before redirecting the packets to the TrafficControl target port in table [Output]. 
The +only purpose of forwarding the packets to table [L3Forwarding] is to load tunnel destination IP for packets destined for +remote Nodes. This ensures that the returned packets destined for remote Nodes are forwarded through the tunnel. +`FromTCReturnRegMark` that will be used in table [TrafficControl] is loaded to mark the packet source. -After invoking the ct action, packets will be in the "tracked" (`trk`) state and -all [connection tracking -fields](https://www.openvswitch.org/support/dist-docs/ovs-fields.7.txt) will be -set to the correct value. Packets will then move on to [ConntrackStateTable]. +Flow 5 is for case 5, matching packets sent back from an application-aware engine through a specific port and forwarding +them to table [L3Forwarding] to decide the egress port. Like flow 4, the purpose of forwarding the packets to table +[L3Forwarding] is to load tunnel destination IP for packets destined for remote Nodes. `FromTCReturnRegMark` that will +be used in table [TrafficControl] is also loaded to mark the packet source. -Refer to [this -document](https://docs.openvswitch.org/en/latest/tutorials/ovs-conntrack/) for -more information on connection tracking in OVS. +Flows 6-8 are for case 6, matching packets from local Pods and forwarding them to table [SpoofGuard] to do legitimacy +verification. The following reg marks are loaded: -### ConntrackStateTable (31) +- `FromPodRegMark`, indicating that the packets are received on the ports connected to the local Pods, consumed in + tables [L3Forwarding] and [SNATMark]. +- `FromLocalRegMark`, indicating that the packets are from the local Pods, consumed in table [ServiceLB]. -This table handles "tracked" packets (packets which are moved to the tracked -state by the previous table [ConntrackTable]) and "untracked" packets (packets -is not in tracked state). +Flow 9 is the table-miss flow to drop packets that are not matched by flows 1-8. -This table serves the following purposes: +### SpoofGuard -* For tracked Service packets, bit 19 in NXM_NX_REG0 will be set to 0x1, then - the tracked packet will be forwarded to [EgressRuleTable] directly. -* Drop packets reported as invalid by conntrack. -* Non-Service tracked packets goes to [EgressRuleTable] directly. -* Untracked packets goes to [SessionAffinityTable] and [ServiceLBTable]. +This table is crafted to drop IP [spoofing](https://en.wikipedia.org/wiki/Spoofing_attack) from local Pods. It addresses +specific cases: -If you dump the flows for this table, you should see the following: +1. Allowing packets from the local Antrea gateway, where checks are not currently performed. +2. Ensuring that the source IP and MAC addresses are correct, i.e., matching the values configured on the interface when + Antrea sets up networking for a Pod. + +If you dump the flows of this table, you may see the following: ```text -1. table=31, priority=200,ct_state=-new+trk,ct_mark=0x21,ip actions=load:0x1->NXM_NX_REG0[19],resubmit(,50) -2. table=31, priority=190,ct_state=+inv+trk,ip actions=drop -3. table=31, priority=190,ct_state=-new+trk,ip actions=resubmit(,50) -4. table=31, priority=0 actions=resubmit(,40),resubmit(,41) +1. table=SpoofGuard, priority=200,ip,in_port="antrea-gw0" actions=goto_table:UnSNAT +2. table=SpoofGuard, priority=200,ip,in_port="client-6-3353ef",dl_src=5e:b5:e3:a6:90:b7,nw_src=10.10.0.26 actions=goto_table:UnSNAT +3. table=SpoofGuard, priority=200,ip,in_port="web-7975-274540",dl_src=fa:b7:53:74:21:a6,nw_src=10.10.0.24 actions=goto_table:UnSNAT +4. 
table=SpoofGuard, priority=200,ip,in_port="db-755c6-5080e3",dl_src=36:48:21:a2:9d:b4,nw_src=10.10.0.25 actions=goto_table:UnSNAT +5. table=SpoofGuard, priority=0 actions=drop ``` -Flow 1 is used to forward tracked Service packets to [EgressRuleTable] directly, -without passing [SessionAffinityTable], [ServiceLBTable] and [EndpointDNATTable]. -The flow also sets bit 19 in NXM_NX_REG0 to 0x1, which indicates that the destination -and source MACs of the matched packets should be rewritten in [l3ForwardingTable]. +Flow 1 is for case 1, matching packets received from the local Antrea gateway port without checking the source IP and MAC +address. There are some cases where the source IP of the packets through the local Antrea gateway port is not the local +Antrea gateway IP: + +- When Antrea is deployed with kube-proxy, and `AntreaProxy` is not enabled, packets from local Pods destined for Services + will first go through the gateway, get load-balanced by the kube-proxy data path (DNAT) then re-enter through the + gateway. Then the packets are received on the gateway port with a source IP belonging to a local Pod. +- When Antrea is deployed without kube-proxy, and both `AntreaProxy` and `proxyAll` are enabled, packets from the external + network destined for Services will be routed to OVS through the gateway without changing the source IP. +- When Antrea is deployed with kube-proxy, and `AntreaProxy` is enabled, packets from the external network destined for + Services will get load-balanced by the kube-proxy data path (DNAT) and then routed to OVS through the gateway without SNAT. -Flow 2 is used to drop packets which is reported as invalid by conntrack. +Flows 2-4 are for case 2, matching legitimate IP packets from local Pods. -Flow 3 is used to forward tracked non-Service packets to [EgressRuleTable] directly, -without passing [SessionAffinityTable], [ServiceLBTable] and [EndpointDNATTable]. +Flow 5 is the table-miss flow to drop IP spoofing packets. -Flow 4 is used to match the first packet of untracked connection and forward it to -[SessionAffinityTable] and [ServiceLBTable]. +### UnSNAT -### SessionAffinityTable (40) +This table is used to perform `de-SNAT` on reply packets by invoking action `ct` on them. The packets are from SNAT'd +Service connections that have been committed with `SNATCtZone` in table [SNAT]. After invoking action `ct`, the packets +will be in a "tracked" state, restoring all [connection tracking +fields](https://www.openvswitch.org/support/dist-docs/ovs-fields.7.txt) (such as `ct_state`, `ct_mark`, `ct_label`, etc.) +to their original values. The packets with a "tracked" state are then forwarded to table [ConntrackZone]. -If `service.spec.sessionAffinity` of a Service is `None`, this table will set the value -of bits [16..18] in NXM_NX_REG4 to 0b001, which indicates that the Service needs to do -Endpoint selection. If you dump the flow, you should see the flow: +If you dump the flows of this table, you may see the following: ```text -table=40, priority=0 actions=load:0x1->NXM_NX_REG4[16..18] +1. table=UnSNAT, priority=200,ip,nw_dst=169.254.0.253 actions=ct(table=ConntrackZone,zone=65521,nat) +2. table=UnSNAT, priority=200,ip,nw_dst=10.10.0.1 actions=ct(table=ConntrackZone,zone=65521,nat) +3. 
table=UnSNAT, priority=0 actions=goto_table:ConntrackZone ``` -If `service.spec.sessionAffinity` of a Service is `ClientIP`, when a client accesses -the Service for the first time, a learned flow with hard timeout which equals -`service.spec.sessionAffinityConfig.clientIP.timeoutSeconds` of the Service will be -generated in this table. This will be explained in detail in chapter [ServiceLBTable]. +Flow 1 matches reply packets for Service connections where they were SNAT'd with the *Virtual Service IP* `169.254.0.253` +and invokes action `ct` on them. For the packets, the destination IP of them is the *Virtual Service IP*. + +Flow 2 matches packets for Service connections where they were SNAT'd with the local Antrea gateway IP `10.10.0.1` and +invokes action `ct` on them. For the packets, the destination IP of them is the local Antrea gateway IP. This flow also +matches request packets destined for the local Antrea gateway IP from local Pods by accident. However, this is harmless +since such connections will never be committed with `SNATCtZone`, and therefore, connection tracking fields for the +packets are unset. + +Flow 3 is the table-miss flow. -### ServiceLBTable (41) +For reply packets from SNAT'd connections, whose destination IP is the translated SNAT IP, after invoking action `ct`, +the destination IP of the packets will be restored to the original IP before SNAT is stored in the connection tracking +field `ct_nw_dst`. -This table is used to implement Service Endpoint selection. Note that, currently, only -ClusterIP Service request from Pods is supported. NodePort, LoadBalancer and ClusterIP -whose client is from K8s Node will be supported in the future. +### ConntrackZone -When a ClusterIP Service is created with `service.spec.sessionAffinity` set to `None`, if you -dump the flows, you should see the following flow: +The main purpose of this table is to invoke action `ct` on packets from all connections. After invoking `ct` action, +packets will be in a "tracked" state, restoring all connection tracking fields to their appropriate values. When invoking +action `ct` with `CtZone` to the packets that have a "tracked" state associated with `SNATCtZone`, then the "tracked" +state associated with `SNATCtZone` will be inaccessible. This transition occurs because the "tracked" state shifts to +another state associated with `CtZone`. A ct zone is similar in spirit to the more generic Linux network namespaces, +uniquely containing a "tracked" state within each ct zone. + +If you dump the flows of this table, you may see the following: ```text -1. table=41, priority=200,tcp,reg4=0x10000/0x70000,nw_dst=10.107.100.231,tp_dst=443 actions=load:0x2->NXM_NX_REG4[16..18],load:0x1->NXM_NX_REG0[19],group:5 +1. table=ConntrackZone, priority=200,ip actions=ct(table=ConntrackState,zone=65520,nat) +2. table=ConntrackZone, priority=0 actions=goto_table:ConntrackState ``` -Among the match conditions of the above flow: +Flow 1 invokes `ct` action on packets from all connections, and the packets are then forwarded to table [ConntrackState] +with the "tracked" state associated with `CtZone`. Note that for packets in an established Service (DNATed) connection, +not the first packet of a Service connection, DNAT or un-DNAT is performed on them before they are forwarded. + +Flow 2 is the table-miss flow that should remain unused. -* `reg4=0x10000/0x70000`, value of bits [16..18] in NXM_NX_REG4 is 0b001, which is used - to match Service packet whose state is to do Endpoint selection. 
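+
+If you want to look at the two ct zones mentioned above directly, the conntrack table of the OVS datapath can be dumped
+per zone. This is only an illustrative sketch: it assumes the commands are run from the `antrea-ovs` container of the
+antrea-agent Pod on the Node. Each entry printed shows the original and reply tuples of a connection together with its
+zone, which makes it easy to observe un-DNAT and un-SNAT at work.
+
+```text
+# Connections committed with CtZone (65520), e.g. DNAT'd Service connections.
+$ ovs-appctl dpctl/dump-conntrack zone=65520
+
+# Connections committed with SNATCtZone (65521), i.e. SNAT'd Service connections.
+$ ovs-appctl dpctl/dump-conntrack zone=65521
+```
+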
The value of - bits [16..18] in NXM_NX_REG4 is set in [SessionAffinityTable] by flow `table=40, priority=0 actions=load:0x1->NXM_NX_REG4[16..18]`. +### ConntrackState -The actions of the above flow: +This table handles packets from the connections that have a "tracked" state associated with `CtZone`. It addresses +specific cases: -* `load:0x2->NXM_NX_REG4[16..18]` is used to set the value of bits [16..18] in NXM_NX_REG4 - to 0b002, which indicates that Endpoint selection "is performed". Note that, Endpoint - selection has not really been done yet - it will be done by group action. The current - action should have been done in target OVS group entry after Endpoint selection. However, - we set the bits here, for the purpose of supporting more Endpoints in an OVS group. - Please check PR [#2101](https://github.com/antrea-io/antrea/pull/2101) to learn more information. -* `load:0x1->NXM_NX_REG0[19]` is used to set the value of bit 19 in NXM_NX_REG0 to 0x1, - which means that the source and destination MACs need to be rewritten. -* `group:5` is used to set the target OVS group. Note that, the target group needs to be - created first before the flow is created. +1. Dropping invalid packets reported by conntrack. +2. Forwarding tracked sequencing packets from all connections to table [AntreaPolicyEgressRule] directly, bypassing the + tables like [PreRoutingClassifier], [NodePortMark], [SessionAffinity], [ServiceLB], and [EndpointDNAT] for Service + Endpoint selection. +3. Forwarding packets from new connections to table [PreRoutingClassifier] to start Service Endpoint selection since + Service connections are not identified at this stage. -Dump the group entry with command `ovs-ofctl dump-groups br-int 5`, you should see the -following: +If you dump the flows of this table, you may see the following: ```text -group_id=5,type=select,\ -bucket=bucket_id:0,weight:100,actions=load:0xa0a0002->NXM_NX_REG3[],load:0x23c1->NXM_NX_REG4[0..15],resubmit(,42),\ -bucket=bucket_id:1,weight:100,actions=load:0xa0a0003->NXM_NX_REG3[],load:0x23c1->NXM_NX_REG4[0..15],resubmit(,42),\ -bucket=bucket_id:2,weight:100,actions=load:0xa0a0004->NXM_NX_REG3[],load:0x23c1->NXM_NX_REG4[0..15],resubmit(,42) +1. table=ConntrackState, priority=200,ct_state=+inv+trk,ip actions=drop +2. table=ConntrackState, priority=190,ct_state=-new+trk,ct_mark=0/0x10,ip actions=goto_table:AntreaPolicyEgressRule +3. table=ConntrackState, priority=190,ct_state=-new+trk,ct_mark=0x10/0x10,ip actions=set_field:0x200/0x200->reg0,goto_table:AntreaPolicyEgressRule +4. table=ConntrackState, priority=0 actions=goto_table:PreRoutingClassifier ``` -For the above OVS group, there are three buckets which have the same weight. Every bucket -has the same chance to be selected since they have the same weight. The selected bucket -will load Endpoint IPv4 address to NXM_NX_REG3, Endpoint port number to bits [0..15] -in NXM_NX_REG4. Then the matched packet will be resubmitted to [EndpointDNATTable]. +Flow 1 is for case 1, dropping invalid packets. -When a ClusterIP Service is created with `service.spec.sessionAffinity` set to `ClientIP`, you may -see the following flows: +Flow 2 is for case 2, matching packets from non-Service connections with `NotServiceCTMark` and forwarding them to +table [AntreaPolicyEgressRule] directly, bypassing the tables for Service Endpoint selection. -```text -1. table=41, priority=200,tcp,reg4=0x10000/0x70000,nw_dst=10.107.100.231,tp_dst=443 actions=load:0x3->NXM_NX_REG4[16..18],load:0x1->NXM_NX_REG0[19],group:5 -2. 
table=41, priority=190,tcp,reg4=0x30000/0x70000,nw_dst=10.107.100.231,tp_dst=443 actions=\ - learn(table=40,hard_timeout=300,priority=200,delete_learned,cookie=0x2040000000008, \ - eth_type=0x800,nw_proto=6,NXM_OF_TCP_DST[],NXM_OF_IP_DST[],NXM_OF_IP_SRC[],\ - load:NXM_NX_REG3[]->NXM_NX_REG3[],load:NXM_NX_REG4[0..15]->NXM_NX_REG4[0..15],load:0x2->NXM_NX_REG4[16..18],load:0x1->NXM_NX_REG0[19]),\ - load:0x2->NXM_NX_REG4[16..18],\ - resubmit(,42) -``` +Flow 3 is also for case 2, matching packets from Service connections with `ServiceCTMark` loaded in table +[EndpointDNAT] and forwarding them to table [AntreaPolicyEgressRule], bypassing the tables for Service Endpoint +selection. `RewriteMACRegMark`, which is used in table [L3Forwarding], is loaded in this flow, indicating that the +source and destination MAC addresses of the packets should be rewritten. -When a client (assumed that the source IP is 10.10.0.2) accesses the ClusterIP for the first -time, the first packet of the connection will be matched by flow 1. Note that the action -`load:0x3->NXM_NX_REG4[16..18]` indicates that the Service Endpoint selection result needs -to be cached. +Flow 4 is the table-miss flow for case 3, matching packets from all new connections and forwarding them to table +[PreRoutingClassifier] to start the processing of Service Endpoint selection. -Dump the group entry with command `ovs-ofctl dump-groups br-int 5`, you should see the -following: +### PreRoutingClassifier + +This table handles the first packet from uncommitted Service connections before Service Endpoint selection. It +sequentially resubmits the packets to tables [NodePortMark] and [SessionAffinity] to do some pre-processing, including +the loading of specific reg marks. Subsequently, it forwards the packets to table [ServiceLB] to perform Service Endpoint +selection. + +If you dump the flows of this table, you may see the following: ```text -group_id=5,type=select,\ -bucket=bucket_id:0,weight:100,actions=load:0xa0a0002->NXM_NX_REG3[],load:0x23c1->NXM_NX_REG4[0..15],resubmit(,41),\ -bucket=bucket_id:1,weight:100,actions=load:0xa0a0003->NXM_NX_REG3[],load:0x23c1->NXM_NX_REG4[0..15],resubmit(,41),\ -bucket=bucket_id:2,weight:100,actions=load:0xa0a0004->NXM_NX_REG3[],load:0x23c1->NXM_NX_REG4[0..15],resubmit(,41) +1. table=PreRoutingClassifier, priority=200,ip actions=resubmit(,NodePortMark),resubmit(,SessionAffinity),resubmit(,ServiceLB) +2. table=PreRoutingClassifier, priority=0 actions=goto_table:NodePortMark ``` -Note the action `resubmit(,41)` resubmits the first packet of a ClusterIP Service connection -back to [ServiceLBTable], not resubmits the packet to [EndpointDNATTable]. Then the -packet will be matched by flow 2 since value of bits [16..18] in NXM_NX_REG4 is 0b011. One -action of the flow is to generate a learned flow in [SessionAffinityTable], the other -action is to resubmit the packet to [EndpointDNATTable]. +Flow 1 sequentially resubmits packets to tables [NodePortMark], [SessionAffinity], and [ServiceLB]. Note that packets +are forwarded to table [ServiceLB] finally. In tables [NodePortMark] and [SessionAffinity], only reg marks are loaded. + +Flow 2 is the table-miss flow that should remain unused. + +### NodePortMark -Now if you dump flows of table [SessionAffinityTable], you may see the following flows: +This table is designed to potentially mark packets destined for NodePort Services. It is only created when `proxyAll` is +enabled. + +If you dump the flows of this table, you may see the following: ```text -1. 
table=40, hard_timeout=300, priority=200,tcp,nw_src=10.10.0.2,nw_dst=10.107.100.231,tp_dst=443 \ - actions=load:0xa0a0002->NXM_NX_REG3[],load:0x23c1->NXM_NX_REG4[0..15],load:0x2->NXM_NX_REG4[16..18],load:0x1->NXM_NX_REG0[19] -2. table=40, priority=0 actions=load:0x1->NXM_NX_REG4[16..18] +1. table=NodePortMark, priority=200,ip,nw_dst=10.176.25.100 actions=set_field:0x80000/0x80000->reg4 +2. table=NodePortMark, priority=200,ip,nw_dst=192.168.77.102 actions=set_field:0x80000/0x80000->reg4 +3. table=NodePortMark, priority=200,ip,nw_dst=169.254.0.252 actions=set_field:0x80000/0x80000->reg4 +4. table=NodePortMark, priority=0 actions=goto_table:SessionAffinity ``` -Note that, flow 1 (the generated learned flow) has higher priority than flow 2 in table -[SessionAffinityTable]. When a particular client accesses the ClusterIP once again, the first -packet of the connection will be matched by flow 1 due to the match condition `nw_src=10.10.0.2`. +Flows 1-2 match packets destined for the local Node from local Pods. `NodePortRegMark` is loaded, indicating that the +packets are potentially destined for NodePort Services. -The actions of flow 1: +Flow 3 match packets destined for the *Virtual NodePort DNAT IP*. Packets destined for NodePort Services from the local +Node or the external network is DNAT'd to the *Virtual NodePort DNAT IP* by iptables before entering the pipeline. -* `load:0xa0a0004->NXM_NX_REG3[]` is used to load Endpoint IPv4 address to NXM_NX_REG3. -* `load:0x50->NXM_NX_REG4[0..15]` is used to load Endpoint port number to bits [0..15] in - NXM_NX_REG4. -* `load:0x2->NXM_NX_REG4[16..18]` is used to set the value of bits [16..18] in NXM_NX_REG4 to - 0b010, which indicates that the Service has done Endpoint selection. -* `load:0x1->NXM_NX_REG0[19]` is used to set the value of bit 19 in NXM_NX_REG0 to 0x1, which - indicates that the source and destination MACs need to be rewritten. +Flow 4 is the table-miss flow. -Note that, if the value of bits [16..18] in NXM_NX_REG4 is 0b010 (set by action `load:0x2->NXM_NX_REG4[16..18]` -in table [SessionAffinityTable]), then packet will not be matched by any flows in table -[ServiceLBTable] except the last one. The last one just forwards the packet to table -[EndpointDNATTable] without selecting target OVS group. Then connections from a particular -client will always access the same backend Pod within the session timeout setting by -`service.spec.sessionAffinityConfig.clientIP.timeoutSeconds`. +Note that packets of NodePort Services have not been identified in this table by matching destination IP address. The +identification of NodePort Services will be done finally in table [ServiceLB] by matching `NodePortRegMark` and the +the specific destination port of a NodePort. -### EndpointDNATTable (42) +### SessionAffinity -The table implements DNAT for Service traffic after Endpoint selection for the first -packet of a Service connection. +This table is designed to implement Service session affinity. The learned flows that cache the information of the +selected Endpoints are installed here. -If you dump the flows for this table, you should see flows like the following: +If you dump the flows of this table, you may see the following: ```text -1. table=42, priority=200,tcp,reg3=0xc0a84d64,reg4=0x2192b/0x7ffff actions=ct(commit,table=45,zone=65520,nat(dst=192.168.77.100:6443),exec(load:0x21->NXM_NX_CT_MARK[])) -2. 
table=42, priority=200,tcp,reg3=0xc0a84d65,reg4=0x2286d/0x7ffff actions=ct(commit,table=45,zone=65520,nat(dst=192.168.77.101:10349),exec(load:0x21->NXM_NX_CT_MARK[])) -3. table=42, priority=200,tcp,reg3=0xa0a0004,reg4=0x20050/0x7ffff actions=ct(commit,table=45,zone=65520,nat(dst=10.10.0.4:80),exec(load:0x21->NXM_NX_CT_MARK[])) -4. table=42, priority=200,tcp,reg3=0xa0a0102,reg4=0x20050/0x7ffff actions=ct(commit,table=45,zone=65520,nat(dst=10.10.1.2:80),exec(load:0x21->NXM_NX_CT_MARK[])) -5. table=42, priority=200,udp,reg3=0xa0a0002,reg4=0x20035/0x7ffff actions=ct(commit,table=45,zone=65520,nat(dst=10.10.0.2:53),exec(load:0x21->NXM_NX_CT_MARK[])) -6. table=42, priority=190,reg4=0x20000/0x70000 actions=load:0x1->NXM_NX_REG4[16..18],resubmit(,41) -7. table=42, priority=0 actions=resubmit(,45) +1. table=SessionAffinity, hard_timeout=300, priority=200,tcp,nw_src=10.10.0.1,nw_dst=10.96.76.15,tp_dst=80 \ + actions=set_field:0x50/0xffff->reg4,set_field:0/0x4000000->reg4,set_field:0xa0a0001->reg3,set_field:0x20000/0x70000->reg4,set_field:0x200/0x200->reg0 +2. table=SessionAffinity, priority=0 actions=set_field:0x10000/0x70000->reg4 ``` -For flow 1-5, DNAT is performed with the IPv4 address stored in NXM_NX_REG3 and port number stored in -bits[0..15] in NXM_NX_REG4 by `ct commit` action. Note that, the match condition `reg4=0x2192b/0x7ffff` -is a union value. The value of bits [0..15] is port number. The value of bits [16..18] is 0b010, -which indicates that Service has done Endpoint selection. Service ct_mark `0x21` is also marked. +Flow 1 is a learned flow generated by flow 3 in table [ServiceLB], designed for the sample Service [ClusterIP with +Session Affinity], to implement Service session affinity. Here are some details about the flow: + +- The hard timeout of the learned flow should be equal to the value of + `service.spec.sessionAffinityConfig.clientIP.timeoutSeconds` defined in the Service. This means that during the hard + timeout, this flow is present in the pipeline, and the session affinity of the Service takes effect during the timeout. +- Source IP address, destination IP address, destination port, and transparent protocol are used to match packets of + connections sourced from the same client and destined for the Service during the timeout. +- Endpoint IP address and Endpoint port are loaded into `EndpointIPField` and `EndpointPortField` respectively. +- `EpSelectedRegMark` is loaded, indicating that the Service Endpoint selection is done, and then the packets will + be only matched by the last flow in table [ServiceLB]. +- `RewriteMACRegMark`, which will be consumed in table [L3Forwarding], is loaded here, indicating that the source and + destination MAC addresses of the packets should be rewritten. -If none of the flows described above are hit, flow 6 is used to forward packet back to table [ServiceLBTable] -to select Endpoint again. +Flow 2 is the table-miss flow to match the first packet of connections destined for Services. The loading of +`EpToSelectRegMark`, to be consumed in table [ServiceLB], indicating that the packet needs to do Service Endpoint +selection. -Flow 7 is used to match non-Service packet. +### ServiceLB -### AntreaPolicyEgressRuleTable (45) +This table is used to implement Service Endpoint selection. It addresses specific cases: -For this table, you will need to keep in mind the ACNP -[specification](#antrea-native-policies-implementation) -that we are using. +1. ClusterIP, as demonstrated in the examples [ClusterIP without Endpoint] and [ClusterIP]. +2. 
NodePort, as demonstrated in the example [NodePort]. +3. LoadBalancer, as demonstrated in the example [LoadBalancer]. +4. Service configured with external IPs, as demonstrated in the example [Service with ExternalIP]. +5. Service configured with session affinity, as demonstrated in the example [Service with Session Affinity]. +6. Service configured with externalTrafficPolicy to `Local`, as demonstrated in the example [Service with + ExternalTrafficPolicy Local]. -This table is used to implement the egress rules across all Antrea-native policies, -except for policies that are created in the Baseline Tier. Antrea-native policies -created in the Baseline Tier will be enforced after K8s NetworkPolicies, and their -egress rules are installed in the [EgressDefaultTable] and [EgressRuleTable] -respectively, i.e. +If you dump the flows of this table, you may see the following: ```text -Baseline Tier -> EgressDefaultTable(60) -K8s NetworkPolicy -> EgressRuleTable(50) -All other Tiers -> AntreaPolicyEgressRuleTable(45) +1. table=ServiceLB, priority=200,tcp,reg4=0x10000/0x70000,nw_dst=10.101.255.29,tp_dst=80 actions=set_field:0x200/0x200->reg0,set_field:0x20000/0x70000->reg4,set_field:0x9->reg7,group:9 +2. table=ServiceLB, priority=200,tcp,reg4=0x10000/0x70000,nw_dst=10.105.31.235,tp_dst=80 actions=set_field:0x200/0x200->reg0,set_field:0x20000/0x70000->reg4,set_field:0xc->reg7,group:10 +3. table=ServiceLB, priority=200,tcp,reg4=0x90000/0xf0000,tp_dst=30004 actions=set_field:0x200/0x200->reg0,set_field:0x20000/0x70000->reg4,set_field:0x200000/0x200000->reg4,set_field:0xc->reg7,group:12 +4. table=ServiceLB, priority=200,tcp,reg4=0x10000/0x70000,nw_dst=192.168.77.150,tp_dst=80 actions=set_field:0x200/0x200->reg0,set_field:0x20000/0x70000->reg4,set_field:0xe->reg7,group:14 +5. table=ServiceLB, priority=200,tcp,reg4=0x10000/0x70000,nw_dst=192.168.77.200,tp_dst=80 actions=set_field:0x200/0x200->reg0,set_field:0x20000/0x70000->reg4,set_field:0x10->reg7,group:16 +6. table=ServiceLB, priority=200,tcp,reg4=0x10000/0x70000,nw_dst=10.96.76.15,tp_dst=80 actions=set_field:0x200/0x200->reg0,set_field:0x30000/0x70000->reg4,set_field:0xa->reg7,group:11 +7. table=ServiceLB, priority=190,tcp,reg4=0x30000/0x70000,nw_dst=10.96.76.15,tp_dst=80 actions=learn(table=SessionAffinity,hard_timeout=300,priority=200,delete_learned,cookie=0x203000000000a,\ + eth_type=0x800,nw_proto=6,NXM_OF_TCP_DST[],NXM_OF_IP_DST[],NXM_OF_IP_SRC[],load:NXM_NX_REG4[0..15]->NXM_NX_REG4[0..15],load:NXM_NX_REG4[26]->NXM_NX_REG4[26],load:NXM_NX_REG3[]->NXM_NX_REG3[],load:0x2->NXM_NX_REG4[16..18],load:0x1->NXM_NX_REG0[9]),\ + set_field:0x20000/0x70000->reg4,goto_table:EndpointDNAT +8. table=ServiceLB, priority=210,tcp,reg4=0x10010000/0x10070000,nw_dst=192.168.77.151,tp_dst=80 actions=set_field:0x200/0x200->reg0,set_field:0x20000/0x70000->reg4,set_field:0x11->reg7,group:17 +9. table=ServiceLB, priority=200,tcp,nw_dst=192.168.77.151,tp_dst=80 actions=set_field:0x200/0x200->reg0,set_field:0x20000/0x70000->reg4,set_field:0x12->reg7,group:18 +10. table=ServiceLB, priority=0 actions=goto_table:EndpointDNAT ``` -Since the example ACNP resides in the Application tier, if you dump the flows for -table 45, you should see something like this: +Flow 1 or flow 2 is designed for case 1, matching the first packet of connections destined for the sample [ClusterIP +without Endpoint] or [ClusterIP]. This is achieved by matching `EpToSelectRegMark` loaded in table [SessionAffinity], +clusterIP, and port. 
The target of the packet matched by the flow is an OVS group where the Endpoint will be selected. +Before forwarding the packet to the OVS group, `RewriteMACRegMark` that will be consumed in table [L3Forwarding] is +loaded, indicating that the source and destination MAC addresses of the packets should be rewritten. `EpSelectedRegMark` +that will be consumed in table [EndpointDNAT] is also loaded, indicating that the Endpoint is selected. Note that the +Service Endpoint selection is not completed yet, as it will be done in the target OVS group. The action is set here to +support more Endpoints in an OVS group. Refer to PR [#2101](https://github.com/antrea-io/antrea/pull/2101) for more +information. + +Flow 3 is for case 2, matching the first packet of connections destined for the sample [NodePort]. This is achieved by +matching `EpToSelectRegMark` loaded in table [SessionAffinity], `NodePortRegMark` loaded in table [NodePortMark], and +NodePort port. Similar to flows 1-2, `RewriteMACRegMark` and `EpSelectedRegMark` are also loaded. + +Flow 4 is for case 3, processing the first packet of connections destined for the ingress IP of the sample +[LoadBalancer], similar to flow 1. + +Flow 5 is for case 4, processing the first packet of connections destined for the external IP of the sample [Service +with ExternalIP], similar to flow 1. + +Flow 6 is the initial process for case 5, matching the first packet of connections destined for the sample [Service with +Session Affinity]. This is achieved by matching the conditions similar to flow 1. Like flow 1, the target of the flow is +also an OVS group, and `RewriteMACRegMark` is loaded. The difference is that `EpToLearnRegMark` is loaded, rather than +`EpSelectedRegMark`, indicating that the selected Endpoint needs to be cached. + +Flow 7 is the final process for case 5, matching the packet previously matched by flow 6, sent back from the target OVS +group after selecting an Endpoint. Then a learned flow will be generated in table [SessionAffinity] to match the packets +of the subsequent connections from the same client IP, ensuring that the packets are always forwarded to the same Endpoint +selected the first time. `EpSelectedRegMark` that will be consumed in table [EndpointDNAT] is loaded, indicating that +Service Endpoint selection has been done. + +Flow 8 and flow 9 are for case 6. Flow 8 has the higher priority than that of flow 9, prioritizing matching the first +packet of connection sourced from a local Pod or the local Node with `FromLocalRegMark` loaded in table [Classifier] +and destined for the sample [Service with ExternalTrafficPolicy Local]. The target of flow 8 is an OVS group that has +all the Endpoints across the cluster, ensuring accessibility for Service connections originating from local Pods or +Nodes, regardless that `externalTrafficPolicy` of the Service is `Local`. Due to the existence of flow 8, consequently, +flow 9 exclusively matches packets sourced from the external network, resembling the pattern of flow 1. The target of +flow 9 is an OVS group that has only the local Endpoints since `externalTrafficPolicy` of the Service is `Local`. + +Flow 10 is the table-miss flow. + +As mentioned above, the Service Endpoint selection is performed within OVS groups. 3 typical OVS groups are list below: ```text -1. table=45, priority=64990,ct_state=-new+est,ip actions=resubmit(,61) -2. table=45, priority=14000,conj_id=1,ip actions=load:0x1->NXM_NX_REG5[],ct(commit,table=61,zone=65520,exec(load:0x1->NXM_NX_CT_LABEL[32..63])) -3. 
table=45, priority=14000,ip,nw_src=10.10.1.6 actions=conjunction(1,1/3) -4. table=45, priority=14000,ip,nw_dst=10.10.1.8 actions=conjunction(1,2/3) -5. table=45, priority=14000,udp,tp_dst=53 actions=conjunction(1,3/3) -6. table=45, priority=0 actions=resubmit(,50) +1. group_id=9,type=select,\ + bucket=bucket_id:0,weight:100,actions=set_field:0x4000/0x4000->reg0,resubmit(,EndpointDNAT) +2. group_id=10,type=select,\ + bucket=bucket_id:0,weight:100,actions=set_field:0xa0a0018->reg3,set_field:0x50/0xffff->reg4,resubmit(,EndpointDNAT),\ + bucket=bucket_id:1,weight:100,actions=set_field:0x4000000/0x4000000->reg4,set_field:0xa0a0106->reg3,set_field:0x50/0xffff->reg4,resubmit(,EndpointDNAT) +3. group_id=11,type=select,\ + bucket=bucket_id:0,weight:100,actions=set_field:0xa0a0018->reg3,set_field:0x50/0xffff->reg4,resubmit(,ServiceLB),\ + bucket=bucket_id:1,weight:100,actions=set_field:0x4000000/0x4000000->reg4,set_field:0xa0a0106->reg3,set_field:0x50/0xffff->reg4,resubmit(,ServiceLB) ``` -Similar to [K8s NetworkPolicy implementation](#egressruletable-50), -AntreaPolicyEgressRuleTable also relies on the OVS built-in `conjunction` action to -implement policies efficiently. +The first group with `group_id` 9 is the destination of packets matched by flow 1, designed for a Service without +Endpoints. The group only has a single bucket where `SvcNoEpRegMark` which will be used in table [EndpointDNAT] is +loaded, indicating that the Service has no Endpoint, and then packets are forwarded to table [EndpointDNAT]. -The above example flows read as follow: if the source IP address is in set -{10.10.1.6}, and the destination IP address is in the set {10.10.1.8}, and the -destination TCP port is in the set {53}, then use the `conjunction` action with -id 1, which stores the `conj_id` 1 in `ct_label[32..63]` for egress metrics collection -purposes, and forwards the packet to EgressMetricsTable, then [L3ForwardingTable]. -Otherwise, go to [EgressRuleTable] if no conjunctive flow above priority 0 is matched. -This corresponds to the case where the packet is not matched by any of the Antrea-native -policy egress rules in any tier (except for the "baseline" tier). +The second group with `group_id` 10 is the destination of packets matched by flow 2, designed for a Service with +Endpoints. The group has 2 buckets, indicating the availability of 2 selectable Endpoints. Each bucket has an equal +chance of being chosen since they have the same weights. For every bucket, the Endpoint IP and Endpoint port are loaded +into `EndpointIPField` and `EndpointPortField`, respectively. These loaded values will be consumed in table +[EndpointDNAT] to which the packets are forwarded and in which DNAT will be performed. `RemoteEndpointRegMark` is loaded +for remote Endpoints, like bucket with `bucket_id` 1 in this group. -If the `conjunction` action is matched, packets are "allowed" or "dropped" -based on the `action` field of the policy rule. If allowed, they follow a similar -path as described in the following [EgressRuleTable] section. +The third group with `group_id` 11 is the destination of packets matched by flow 6, designed for a Service that has +Endpoints and is configured with session affinity. The group closely resembles the group with `group_id` 10, except that +the destination of the packets is table [ServiceLB], rather than table [EndpointDNAT]. After being sent back to table +[ServiceLB], they will be matched by flow 7. -Unlike the default of K8s NetworkPolicies, Antrea-native policies have no such -default rules. 
Hence, they are evaluated as-is, and there is no need for a -AntreaPolicyEgressDefaultTable. +### EndpointDNAT -### EgressRuleTable (50) +The table implements DNAT for Service connection after Endpoint selection is performed in table [ServiceLB]. -For this table, you will need to keep mind the Network Policy -[specification](#network-policy-implementation) that we are using. We have 2 -Pods running on the same Node, with IP addresses 10.10.1.2 to 10.10.1.3. They -are allowed to talk to each other using TCP on port 80, but nothing else. - -This table is used to implement the egress rules across all Network Policies. If -you dump the flows for this table, you should see something like this: +If you dump the flows of this table, you may see the following:: ```text -1. table=50, priority=210,ct_state=-new+est,ip actions=goto_table:70 -2. table=50, priority=200,ip,nw_src=10.10.1.2 actions=conjunction(2,1/3) -3. table=50, priority=200,ip,nw_src=10.10.1.3 actions=conjunction(2,1/3) -4. table=50, priority=200,ip,nw_dst=10.10.1.2 actions=conjunction(2,2/3) -5. table=50, priority=200,ip,nw_dst=10.10.1.3 actions=conjunction(2,2/3) -6. table=50, priority=200,tcp,tp_dst=80 actions=conjunction(2,3/3) -7. table=50, priority=190,conj_id=2,ip actions=load:0x2->NXM_NX_REG5[],ct(commit,table=61,zone=65520,exec(load:0x2->NXM_NX_CT_LABEL[32..63])) -8. table=50, priority=0 actions=goto_table:60 +1. table=EndpointDNAT, priority=200,reg0=0x4000/0x4000 actions=controller(reason=no_match,id=62373,userdata=04) +2. table=EndpointDNAT, priority=200,tcp,reg3=0xa0a0018,reg4=0x20050/0x7ffff actions=ct(commit,table=AntreaPolicyEgressRule,zone=65520,nat(dst=10.10.0.24:80),exec(set_field:0x10/0x10->ct_mark,move:NXM_NX_REG0[0..3]->NXM_NX_CT_MARK[0..3])) +3. table=EndpointDNAT, priority=200,tcp,reg3=0xa0a0106,reg4=0x20050/0x7ffff actions=ct(commit,table=AntreaPolicyEgressRule,zone=65520,nat(dst=10.10.1.6:80),exec(set_field:0x10/0x10->ct_mark,move:NXM_NX_REG0[0..3]->NXM_NX_CT_MARK[0..3])) +4. table=EndpointDNAT, priority=190,reg4=0x20000/0x70000 actions=set_field:0x10000/0x70000->reg4,resubmit(,ServiceLB) +5. table=EndpointDNAT, priority=0 actions=goto_table:AntreaPolicyEgressRule ``` -Notice how we use the OVS built-in `conjunction` action to implement policies -efficiently. This enables us to do a conjunctive match across multiple -dimensions (source IP, destination IP, port) efficiently without "exploding" the -number of flows. By definition of a conjunctive match, we have at least 2 -dimensions. For our use-case we have at most 3 dimensions. - -The only requirements on `conj_id` is for it to be a unique 32-bit integer -within the table. At the moment we use a single custom allocator, which is -common to all tables that can have NetworkPolicy flows installed (45, 50, -60, 85, 90 and 100). This is why `conj_id` is set to 2 in the above example -(1 was allocated for the egress rule of our Antrea-native NetworkPolicy example -in the previous section). - -The above example flows read as follow: if the source IP address is in set -{10.10.1.2, 10.10.1.3}, and the destination IP address is in the set {10.10.1.2, -10.10.1.3}, and the destination TCP port is in the set {80}, then use the -`conjunction` action with id 2, which goes to [EgressMetricsTable], and then -[L3ForwardingTable]. Otherwise, packet goes to [EgressDefaultTable]. 
- -If the Network Policy specification includes exceptions (`except` field), then -the table will include multiple flows with conjunctive match, corresponding to -each CIDR that is present in `from` or `to` fields, but not in `except` field. -Network Policy implementation details are not covered in this document. - -If the `conjunction` action is matched, packets are "allowed" and forwarded -directly to [L3ForwardingTable]. Other packets go to [EgressDefaultTable]. If a -connection is established - as a reminder all connections are committed in -[ConntrackCommitTable] - its packets go straight to [L3ForwardingTable], with no -other match required (see flow 1 above, which has the highest priority). In -particular, this ensures that reply traffic is never dropped because of a -Network Policy rule. However, this also means that ongoing connections are not -affected if the K8s Network Policies are updated. - -One thing to keep in mind is that for Service traffic, these rules are applied -after the packets have gone through the local gateway and through kube-proxy. At -this point the ingress port is no longer the Pod port, but the local gateway -port. Therefore we cannot use the port as the match condition to identify if the -Pod has been applied a Network Policy - which is what we do for the -[IngressRuleTable] -, but instead have to use the source IP address. - -### EgressDefaultTable (60) - -This table complements [EgressRuleTable] for Network Policy egress rule -implementation. In K8s, when a Network Policy is applied to a set of Pods, the -default behavior for these Pods become "deny" (it becomes an [isolated Pod]( -https://kubernetes.io/docs/concepts/services-networking/network-policies/#isolated-and-non-isolated-pods)). -This table is in charge of dropping traffic originating from Pods to which a Network -Policy (with an egress rule) is applied, and which did not match any of the -allowlist rules. - -Accordingly, based on our Network Policy example, we would expect to see flows -to drop traffic originating from our 2 Pods (10.10.1.2 and 10.10.1.3), which is -confirmed by dumping the flows: +Flow 1 is designed for Services without Endpoints. It identifies the first packet of connections destined for such Service +by matching `SvcNoEpRegMark`. Subsequently, the packet is forwarded to the OpenFlow controller (Antrea Agent). For TCP +Service traffic, the controller will send a TCP RST, and for all other cases the controller will an ICMP Destination +Unreachable message. + +Flows 2-3 are designed for Services that have selected an Endpoint. These flows identify the first packet of connections +destined for such Services by matching `EndpointPortField`, which stores the Endpoint IP, and `EpUnionField` (a combination +of `EndpointPortField` storing the Endpoint port and `EpSelectedRegMark`). Then `ct` action is invoked on the packet, +performing DNAT'd and forwarding it to table [ConntrackState] with the "tracked" state associated with `CtZone`. +Some bits of ct mark are persisted: + +- `ServiceCTMark`, to be consumed in tables [L3Forwarding] and [ConntrackCommit], indicating that the current packet and + subsequent packets of the connection are for a Service. +- The value of `PktSourceField` is persisted to `ConnSourceCTMarkField`, storing the source of the connection for the + current packet and subsequent packets of the connection. 
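+
+A convenient way to see Endpoint selection and DNAT happen together is to trace a synthetic Service packet through the
+pipeline with `ovs-appctl ofproto/trace`. The sketch below is only an example: it assumes the sample client Pod port
+`client-6-3353ef` and the sample ClusterIP `10.105.31.235` used in this document, and it must be run from the
+`antrea-ovs` container on the Node.
+
+```text
+$ ovs-appctl ofproto/trace br-int in_port=client-6-3353ef,tcp,nw_src=10.10.0.26,nw_dst=10.105.31.235,tp_dst=80
+```
+
+In the trace output, you should see the packet match the [ServiceLB] flow for the Service, enter the corresponding OVS
+group where a bucket (Endpoint) is picked, and then hit one of the flows above in this table, where the `ct` action
+commits the connection and rewrites the destination to the selected Endpoint (for example `10.10.0.24:80`).
+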
+ +### AntreaPolicyEgressRule + +This table is used to implement the egress rules across all Antrea-native NetworkPolicies, except for NetworkPolicies +that are created in the Baseline Tier. Antrea-native NetworkPolicies created in the Baseline Tier will be enforced after +K8s NetworkPolicies and their egress rules are installed in tables [EgressDefaultRule] and [EgressRule] respectively, i.e. ```text -1. table=60, priority=200,ip,nw_src=10.10.1.2 actions=drop -2. table=60, priority=200,ip,nw_src=10.10.1.3 actions=drop -3. table=60, priority=0 actions=goto_table:61 +K8s NetworkPolicy -> EgressRule +Antrea-native NetworkPolicy other Tiers -> AntreaPolicyEgressRule +Antrea-native NetworkPolicy Baseline Tier -> EgressDefaultRule ``` -This table is also used to implement Antrea-native policy egress rules that are -created in the Baseline Tier. Since the Baseline Tier is meant to be enforced -after K8s NetworkPolicies, the corresponding flows will be created at a lower -priority than K8s default drop flows. For example, a baseline rule to drop -egress traffic to 10.0.10.0/24 for a Namespace will look like the following: +Antrea-native NetworkPolicy relies on the OVS built-in `conjunction` action to implement policies efficiently. This +enables us to do a conjunctive match across multiple dimensions (source IP, destination IP, port, etc.) efficiently +without "exploding" the number of flows. For our use case, we have at most 3 dimensions. + +The only requirement of `conj_id` is to be a unique 32-bit integer within the table. At the moment we use a single +custom allocator, which is common to all tables that can have NetworkPolicy flows installed +([AntreaPolicyEgressRule], [EgressRule], [EgressDefaultRule], [AntreaPolicyIngressRule], [IngressRule], and +[IngressDefaultRule]). + +For this table, you will need to keep in mind the Antrea-native NetworkPolicy +[specification](#antrea-native-networkpolicy-implementation). Since the sample egress policy resides in the Application +Tier. If you dump the flows of this table, you may see the following: ```text -1. table=60, priority=80,ip,nw_src=10.10.1.11 actions=conjunction(5,1/2) -2. table=60, priority=80,ip,nw_src=10.10.1.10 actions=conjunction(5,1/2) -3. table=60, priority=80,ip,nw_dst=10.0.10.0/24 actions=conjunction(5,2) -4. table=60, priority=80,conj_id=5,ip actions=load:0x3->NXM_NX_REG5[],load:0x1->NXM_NX_REG0[20],resubmit(,61) +1. table=AntreaPolicyEgressRule, priority=64990,ct_state=-new+est,ip actions=goto_table:EgressMetric +2. table=AntreaPolicyEgressRule, priority=64990,ct_state=-new+rel,ip actions=goto_table:EgressMetric +3. table=AntreaPolicyEgressRule, priority=14500,ip,nw_src=10.10.0.24 actions=conjunction(7,1/3) +4. table=AntreaPolicyEgressRule, priority=14500,ip,nw_dst=10.10.0.25 actions=conjunction(7,2/3) +5. table=AntreaPolicyEgressRule, priority=14500,tcp,tp_dst=3306 actions=conjunction(7,3/3) +6. table=AntreaPolicyEgressRule, priority=14500,conj_id=7,ip actions=set_field:0x7->reg5,ct(commit,table=EgressMetric,zone=65520,exec(set_field:0x700000000/0xffffffff00000000->ct_label)) +7. table=AntreaPolicyEgressRule, priority=14499,ip,nw_src=10.10.0.24 actions=conjunction(5,1/2) +8. table=AntreaPolicyEgressRule, priority=14499,ip actions=conjunction(5,2/2) +9. table=AntreaPolicyEgressRule, priority=14499,conj_id=5 actions=set_field:0x5->reg3,set_field:0x400/0x400->reg0,goto_table:EgressMetric +10. 
table=AntreaPolicyEgressRule, priority=0 actions=goto_table:EgressRule ``` -The table-miss flow entry, which is used for non-isolated Pods, forwards -traffic to the next table EgressMetricsTable, then ([L3ForwardingTable]). +Flows 1-2, which are installed by default with the highest priority, matching non-new and "tracked" packets and +forwarding them to table [EgressMetric] to bypass the check from egress rules. This means that if a connection is +established, its packets go straight to table [EgressMetric], with no other match required. In particular, this ensures +that reply traffic is never dropped because of an Antrea-native NetworkPolicy or K8s NetworkPolicy rule. However, this +also means that ongoing connections are not affected if the Antrea-native NetworkPolicy or the K8s NetworkPolicy is +updated. -### L3ForwardingTable (70) +The priorities of flows 3-9 installed for the egress rules are decided by the following: -This is the L3 routing table. It implements the following functionality: +- The `spec.tier` value in an Antrea-native NetworkPolicy determines the primary level for flow priority. +- The `spec.priority` value in an Antrea-native NetworkPolicy determines the secondary level for flow priority within + the same `spec.tier`. A lower value in this field corresponds to a higher priority for the flow. +- The rule's position within an Antrea-native NetworkPolicy also influences flow priority. Rules positioned closer to + the beginning have higher priority for the flow. -* Tunnelled traffic coming-in from a peer Node and destined to a local Pod is - directly forwarded to the Pod. This requires setting the source MAC to the MAC - of the local gateway interface and setting the destination MAC to the Pod's - MAC address. Then the packets will go to [L3DecTTLTable] for decrementing - the IP TTL value. Such packets can be identified by bit 19 of the NXM_NX_REG0 - register (which was set to 1 in the [ClassifierTable]) and the destination IP - address (which should match the IP address of a local Pod). We therefore - install one flow for each Pod created locally on the Node. For example: +Flows 3-6, whose priorities are all 14500, are installed for the egress rule `AllowToDB` in the sample policy. These +flows are described as follows: -```text -table=70, priority=200,ip,reg0=0x80000/0x80000,nw_dst=10.10.0.2 actions=mod_dl_src:e2:e5:a4:9b:1c:b1,mod_dl_dst:12:9e:a6:47:d0:70,goto_table:72 -``` +- Flow 3 is used to match packets with the source IP address in set {10.10.0.24}, which has all IP addresses of the Pods + selected by the label `app: web`, constituting the first dimension for `conjunction` with `conj_id` 7. +- Flow 4 is used to match packets with the destination IP address in set {10.10.0.25}, which has all IP addresses of + the Pods selected by the label `app: db`, constituting the second dimension for `conjunction` with `conj_id` 7. +- Flow 5 is used to match packets with the destination TCP port in set {3306} specified in the rule, constituting the + third dimension for `conjunction` with `conj_id` 7. +- Flow 6 is used to match packets meeting all the three dimensions of `conjunction` with `conj_id` 7 and forward them + to table [EgressMetric], persisting `conj_id` to `EgressRuleCTLabel` that is consumed in table [EgressMetric]. -* All tunnelled traffic destined to the local gateway (i.e. 
for which the - destination IP matches the local gateway's IP) is forwarded to the gateway - port by rewriting the destination MAC (from the Global Virtual MAC to the - local gateway's MAC). +Flows 7-9, whose priorities are all 14499, are installed for the egress rule with a `Drop` action defined after the rule +`AllowToDB` in the sample policy, serves as a default rule. Unlike the default of K8s NetworkPolicy, Antrea-native +NetworkPolicy has no default rule, and all rules should be explicitly defined. Hence, they are evaluated as-is, and +there is no need for a table [AntreaPolicyEgressDefaultRule]. These flows are described as follows: -```text -table=70, priority=200,ip,reg0=0x80000/0x80000,nw_dst=10.10.0.1 actions=mod_dl_dst:e2:e5:a4:9b:1c:b1,goto_table:80 -``` +- Flow 7 is used to match packets with the source IP address in set {10.10.0.24}, which is from the Pods selected + by the label `app: web`, constituting the first dimension for `conjunction` with `conj_id` 5. +- Flow 8 is used to match any packets, constituting the second dimension for `conjunction` with `conj_id` 5. +- Flow 9 is used to match packets meeting both dimensions of `conjunction` with `conj_id` 5. `APDenyRegMark` is + loaded and will be consumed in table [EgressMetric] to which the packets are forwarded. -* All reply traffic of connections initiated through the gateway port, i.e. for - which the first packet of the connection (SYN packet for TCP) was received - through the gateway. Such packets can be identified by the packet's direction - in `ct_state` and the `ct_mark` value `0x20` which is committed in - [ConntrackCommitTable] when the first packet of the connection was handled. - A flow will overwrite the destination MAC to the local gateway MAC to ensure - that they get forwarded through the gateway port. This is required to handle - the following cases: - - reply traffic for connections from a local Pod to a ClusterIP Service, which - are handled by kube-proxy and go through DNAT. In this case the destination - IP address of the reply traffic is the Pod which initiated the connection to - the Service (no SNAT by kube-proxy). We need to make sure that these packets - are sent back through the gateway so that the source IP can be rewritten to - the ClusterIP ("undo" DNAT). If we do not use connection tracking and do not - rewrite the destination MAC, reply traffic from the backend will go directly - to the originating Pod without going first through the gateway and - kube-proxy. This means that the reply traffic will arrive at the originating - Pod with the incorrect source IP (it will be set to the backend's IP instead - of the Service IP). - - when hair-pinning is involved, i.e. connections between 2 local Pods, for - which NAT is performed. One example is a Pod accessing a NodePort Service - for which `externalTrafficPolicy` is set to `Local` using the local Node's - IP address, as there will be no SNAT for such traffic. Another example could - be `hostPort` support, depending on how the feature is implemented. +Flow 10 is the table-miss flow to forward packets not matched by other flows to table [EgressMetric]. -```text -table=70, priority=210,ct_state=+rpl+trk,ct_mark=0x20,ip actions=mod_dl_dst:e2:e5:a4:9b:1c:b1,goto_table:80 -``` +### EgressRule -* All traffic destined to a remote Pod is forwarded through the appropriate - tunnel. This means that we install one flow for each peer Node, each one - matching the destination IP address of the packet against the Pod subnet for - the Node. 
In case of a match the source MAC is set to the local gateway MAC, - the destination MAC is set to the Global Virtual MAC and we set the OF - `tun_dst` field to the appropriate value (i.e. the IP address of the remote - gateway). Traffic then goes to [L3DecTTLTable]. - For a given peer Node, the flow may look like this: +For this table, you will need to keep in mind the K8s NetworkPolicy +[specification](#kubernetes-networkpolicy-implementation) that we are using. + +This table is used to implement the egress rules across all K8s NetworkPolicies. If you dump the flows for this table, +you may see the following: ```text -table=70, priority=200,ip,nw_dst=10.10.1.0/24 actions=mod_dl_src:e2:e5:a4:9b:1c:b1,mod_dl_dst:aa:bb:cc:dd:ee:ff,load:0x1->NXM_NX_REG1[],set_field:0x10000/0x10000->reg0,load:0xc0a80102->NXM_NX_TUN_IPV4_DST[],goto_table:72 +1. table=EgressRule, priority=200,ip,nw_src=10.10.0.24 actions=conjunction(2,1/3) +2. table=EgressRule, priority=200,ip,nw_dst=10.10.0.25 actions=conjunction(2,2/3) +3. table=EgressRule, priority=200,tcp,tp_dst=3306 actions=conjunction(2,3/3) +4. table=EgressRule, priority=190,conj_id=2,ip actions=set_field:0x2->reg5,ct(commit,table=EgressMetric,zone=65520,exec(set_field:0x200000000/0xffffffff00000000->ct_label)) +5. table=EgressRule, priority=0 actions=goto_table:EgressDefaultRule ``` -If none of the flows described above are hit, traffic goes directly to -[L2ForwardingCalcTable]. This is the case for external traffic, whose -destination is outside the cluster (such traffic has already been -forwarded to the local gateway by the local source Pod, and only L2 switching -is required), as well as for local Pod-to-Pod traffic. +Flows 1-4 are installed for the egress rule in the sample K8s NetworkPolicy. These flows are described as follows: -```text -table=70, priority=0 actions=goto_table:80 -``` +- Flow 1 is to match packets with the source IP address in set {10.10.0.24}, which has all IP addresses of the Pods + selected by the label `app: web`, constituting the first dimension for `conjunction` with `conj_id` 2. +- Flow 2 is to match packets with the destination IP address in set {10.10.0.25}, which has all IP addresses of the Pods + selected by the label `app: db`, constituting the second dimension for `conjunction` with `conj_id` 2. +- Flow 3 is to match packets with the destination TCP port in set {3306} specified in the rule, constituting the third + dimension for `conjunction` with `conj_id` 2. +- Flow 4 is to match packets meeting all the three dimensions of `conjunction` with `conj_id` 2 and forward them to + table [EgressMetric], persisting `conj_id` to `EgressRuleCTLabel`. + +Flow 5 is the table-miss flow to forward packets not matched by other flows to table [EgressDefaultRule]. -When the Egress feature is enabled, extra flows will be added to -[L3ForwardingTable], which send the egress traffic from Pods to external network -to [SNATTable]. The following two flows match traffic to local Pods and traffic -to the local Node IP respectively, and keep them in the normal forwarding path -(to [L2ForwardingCalcTable]), so they will not be sent to [SNATTable]: +### EgressDefaultRule + +This table complements table [EgressRule] for K8s NetworkPolicy egress rule implementation. When a NetworkPolicy is +applied to a set of Pods, and the default behavior for these Pods becomes "deny" (they become [isolated +Pods](https://kubernetes.io/docs/concepts/services-networking/network-policies/#isolated-and-non-isolated-pods)). 
+This table is in charge of dropping traffic originating from Pods to which a NetworkPolicy (with an egress rule) is +applied, and which did not match any of the allowed list rules. + +If you dump the flows of this table, you may see the following: ```text -table=70, priority=200,ip,reg0=0/0x80000,nw_dst=10.10.1.0/24 actions=goto_table:80 -table=70, priority=200,ip,reg0=0x2/0xffff,nw_dst=192.168.1.1 actions=goto_table:80 +1. table=EgressDefaultRule, priority=200,ip,nw_src=10.10.0.24 actions=drop +2. table=EgressDefaultRule, priority=0 actions=goto_table:EgressMetric ``` -The following two flows send the traffic not matched by other flows to -[SNATTable]. One of the flows is for egress traffic from local Pods; another -one is for egress traffic from remote Pods, which is tunnelled to this Node to -be SNAT'd with a SNAT IP configured on the Node. In the latter case, the flow -also rewrites the destination MAC to the local gateway interface MAC. +Flow 1, based on our sample K8s NetworkPolicy, is to drop traffic originating from 10.10.0.24, an IP address associated +with a Pod selected by the label `app: web`. If there are multiple Pods being selected by the label `app: web`, you will +see multiple similar flows for each IP address. + +Flow 2 is the table-miss flow to forward packets to table [EgressMetric]. + +This table is also used to implement Antrea-native NetworkPolicy egress rules that are created in the Baseline Tier. +Since the Baseline Tier is meant to be enforced after K8s NetworkPolicies, the corresponding flows will be created at a +lower priority than K8s NetworkPolicy default drop flows. These flows are similar to flows 3-9 in table +[AntreaPolicyEgressRule]. + +### EgressMetric + +This table is used to collect egress metrics for Antrea-native NetworkPolicies and K8s NetworkPolicies. + +If you dump the flows of this table, you may see the following: ```text -table=70, priority=190,ip,reg0=0x2/0xf actions=goto_table:71 -table=70, priority=190,ip,reg0=0/0xf actions=mod_dl_dst:e2:e5:a4:9b:1c:b1,goto_table:71 +1. table=EgressMetric, priority=200,ct_state=+new,ct_label=0x200000000/0xffffffff00000000,ip actions=goto_table:L3Forwarding +2. table=EgressMetric, priority=200,ct_state=-new,ct_label=0x200000000/0xffffffff00000000,ip actions=goto_table:L3Forwarding +3. table=EgressMetric, priority=200,ct_state=+new,ct_label=0x700000000/0xffffffff00000000,ip actions=goto_table:L3Forwarding +4. table=EgressMetric, priority=200,ct_state=-new,ct_label=0x700000000/0xffffffff00000000,ip actions=goto_table:L3Forwarding +5. table=EgressMetric, priority=200,reg0=0x400/0x400,reg3=0x5 actions=drop +6. table=EgressMetric, priority=0 actions=goto_table:L3Forwarding ``` -### SNATTable (71) +Flows 1-2, matching packets with `EgressRuleCTLabel` set to 2, the `conj_id` allocated for the sample K8s NetworkPolicy +egress rule and loaded in table [EgressRule] flow 4, are used to collect metrics for the egress rule. + +Flows 3-4, matching packets with `EgressRuleCTLabel` set to 7, the `conj_id` allocated for the sample Antrea-native +NetworkPolicy egress rule and loaded in table [AntreaPolicyEgressRule] flow 6, are used to collect metrics for the +egress rule. + +Flow 5 serves as the drop rule for the sample Antrea-native NetworkPolicy egress rule. It drops the packets by matching +`APDenyRegMark` loaded in table [AntreaPolicyEgressRule] flow 9 and `APConjIDField` set to 5 which is the `conj_id` +allocated the egress rule and loaded in table [AntreaPolicyEgressRule] flow 9. 
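To make the ct label values in flows 1-4 concrete, below is a minimal Go sketch (illustrative only, not Antrea or OVS code; the helper names are invented) of the bit arithmetic behind those matches: the rule's 32-bit `conj_id` is stored in bits 32..63 of the ct label (mask `0xffffffff00000000`), so `conj_id` 2 and 7 appear as `0x200000000` and `0x700000000` in the dump above.

```go
package main

import "fmt"

// Illustrative only: the egress rule's conj_id occupies bits 32..63 of the
// ct label (mask 0xffffffff00000000), as seen in the EgressMetric matches.
const egressRuleCTLabelShift = 32

// encodeEgressRuleLabel returns the ct_label value corresponding to a conj_id.
func encodeEgressRuleLabel(conjID uint32) uint64 {
	return uint64(conjID) << egressRuleCTLabelShift
}

// decodeEgressRuleLabel extracts the conj_id back from a ct_label value.
func decodeEgressRuleLabel(label uint64) uint32 {
	return uint32(label >> egressRuleCTLabelShift)
}

func main() {
	fmt.Printf("conj_id 2 -> ct_label 0x%x\n", encodeEgressRuleLabel(2)) // 0x200000000, matched by flows 1-2
	fmt.Printf("conj_id 7 -> ct_label 0x%x\n", encodeEgressRuleLabel(7)) // 0x700000000, matched by flows 3-4
	fmt.Println("decoded conj_id:", decodeEgressRuleLabel(0x700000000))  // 7
}
```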
+ +Ct label is used in flows 1-4, while reg is used in flow 5. The distinction lies in the fact that the value persisted in +the ct label can be read throughout the entire lifecycle of a connection, but the reg mark is only valid for the current +packet. For a connection permitted by a rule, all its packets should be collected for metrics, thus a ct label is used. +For a connection denied or dropped by a rule, the first packet and the subsequent retry packets will be blocked, +therefore a reg is enough. + +Flow 6 is the table-miss flow. + +### L3Forwarding -This table is created only when the Egress feature is enabled. It includes flows -to implement Egresses and select the right SNAT IPs for egress traffic from Pods -to external network. +This table, designated as the L3 routing table, serves to assign suitable source and destination MAC addresses to +packets based on their destination IP addresses, as well as their reg marks or ct marks. -When no Egress applies to Pods on the Node, and no SNAT IP is configured on the -Node, [SNATTable] just has two flows. One drops egress traffic tunnelled from -remote Nodes that does not match any SNAT IP configured on this Node, and the -default flow that sends egress traffic from local Pods, which do not have any -Egress applied, to [L2ForwardingCalcTable]. Such traffic will be SNAT'd with -the default SNAT IP (by an iptables masquerade rule). +If you dump the flows of this table, you may see the following: ```text -table=71, priority=190,ct_state=+new+trk,ip,reg0=0/0xf actions=drop -table=71, priority=0 actions=goto_table:80 +1. table=L3Forwarding, priority=210,ip,nw_dst=10.10.0.1 actions=set_field:ba:5e:d1:55:aa:c0->eth_dst,set_field:0x20/0xf0->reg0,goto_table:L3DecTTL +2. table=L3Forwarding, priority=210,ct_state=+rpl+trk,ct_mark=0x2/0xf,ip actions=set_field:ba:5e:d1:55:aa:c0->eth_dst,set_field:0x20/0xf0->reg0,goto_table:L3DecTTL +3. table=L3Forwarding, priority=200,ip,reg0=0/0x200,nw_dst=10.10.0.0/24 actions=goto_table:L2ForwardingCalc +4. table=L3Forwarding, priority=200,ip,nw_dst=10.10.1.0/24 actions=set_field:ba:5e:d1:55:aa:c0->eth_src,set_field:aa:bb:cc:dd:ee:ff->eth_dst,set_field:192.168.77.103->tun_dst,set_field:0x10/0xf0->reg0,goto_table:L3DecTTL +5. table=L3Forwarding, priority=200,ip,reg0=0x200/0x200,nw_dst=10.10.0.24 actions=set_field:ba:5e:d1:55:aa:c0->eth_src,set_field:fa:b7:53:74:21:a6->eth_dst,goto_table:L3DecTTL +6. table=L3Forwarding, priority=200,ip,reg0=0x200/0x200,nw_dst=10.10.0.25 actions=set_field:ba:5e:d1:55:aa:c0->eth_src,set_field:36:48:21:a2:9d:b4->eth_dst,goto_table:L3DecTTL +7. table=L3Forwarding, priority=200,ip,reg0=0x200/0x200,nw_dst=10.10.0.26 actions=set_field:ba:5e:d1:55:aa:c0->eth_src,set_field:5e:b5:e3:a6:90:b7->eth_dst,goto_table:L3DecTTL +8. table=L3Forwarding, priority=190,ct_state=-rpl+trk,ip,reg0=0x3/0xf,reg4=0/0x100000 actions=goto_table:EgressMark +9. table=L3Forwarding, priority=190,ct_state=-rpl+trk,ip,reg0=0x1/0xf actions=set_field:ba:5e:d1:55:aa:c0->eth_dst,goto_table:EgressMark +10. table=L3Forwarding, priority=190,ct_mark=0x10/0x10,reg0=0x202/0x20f actions=set_field:ba:5e:d1:55:aa:c0->eth_dst,set_field:0x20/0xf0->reg0,goto_table:L3DecTTL +11. table=L3Forwarding, priority=0 actions=set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc ``` -When there is an Egress applied to a Pod on the Node, a flow will be added for -the Pod's egress traffic. If the SNAT IP of the Egress is configured on the -local Node, the flow sets an 8 bits ID allocated for the SNAT IP to pkt_mark. 
-The ID is for iptables SNAT rules to match the packets and perfrom SNAT with -the right SNAT IP (Antrea Agent adds an iptables SNAT rule for each local SNAT -IP that matches the ID). +Flow 1 matches packets destined for the local Antrea gateway IP, rewrites their destination MAC address to that of the +local Antrea gateway, loads `ToGatewayRegMark`, and forwards them to table [L3DecTTL] to decrease TTL value. The action +of rewriting the destination MAC address is not necessary but not harmful for Pod-to-gateway request packets because the +destination MAC address is already the local gateway MAC address. In short, the action is only necessary for +`AntreaIPAM` Pods, not required by the sample NodeIPAM Pods in this document. + +Flow 2 matches reply packets with corresponding ct "tracked" states and `FromGatewayCTMark` from connections initiated +through the local Antrea gateway. In other words, these are connections for which the first packet of the connection +(SYN packet for TCP) was received through the local Antrea gateway. It rewrites the destination MAC address to +that of the local Antrea gateway, loads `ToGatewayRegMark`, and forwards them to table [L3DecTTL]. This ensures that +reply packets can be forwarded back to the local Antrea gateway in subsequent tables, guaranteeing the availability +of the connection. This flow is required to handle the following cases when AntreaProxy is not enabled: + +- Reply traffic for connections from a local Pod to a ClusterIP Service, which are handled by kube-proxy and go through + DNAT. In this case, the destination IP address of the reply traffic is the Pod which initiated the connection to the + Service (no SNAT by kube-proxy). These packets should sent back to the local Antrea gateway to the third-party module + to complete the DNAT processes, e.g., kube-proxy. The destination MAC of the packets are rewritten in the table to + avoid it is forwarded to the original client Pod by mistake. +- When hairpin is involved, i.e. connections between 2 local Pods, for which NAT is performed. One example is a + Pod accessing a NodePort Service for which externalTrafficPolicy is set to `Local` using the local Node's IP address, + as there will be no SNAT for such traffic. Another example could be hostPort support, depending on how the feature + is implemented. + +Flow 3 matches packets from intra-Node connections (excluding Service connections) and marked with +`NotRewriteMACRegMark`, indicating that the destination and source MACs of packets should not be overwritten, and +forwards them to table [L2ForwardingCalc] instead of table [L3DecTTL]. The deviation is due to local Pods connections +not traversing any router device or undergoing NAT process. For packets from Service or inter-Node connections, +`RewriteMACRegMark`, mutually exclusive with `NotRewriteMACRegMark`, is loaded. Therefore, the packets will not be +matched by the flow. + +Flow 4 is designed to match packets destined for remote Pod CIDR. This involves installing a separate flow for each remote +Node, with each flow matching the destination IP address of the packets against the Pod subnet for the respective Node. +For the matched packets, the source MAC address is set to that of the local Antrea gateway MAC, and the destination +MAC address is set to the *Global Virtual MAC*. The Openflow `tun_dst` field is set to the appropriate value (i.e. +the IP address of the remote Node IP). 
Additionally, `ToTunnelRegMark` is loaded, signifying that the packets will be +forwarded to remote Nodes through a tunnel. The matched packets are then forwarded to table [L3DecTTL] to decrease the TTL +value. + +Flow 5-7 matches packets destined for local Pods and marked by `RewriteMACRegMark` that signifies that the packets may +originate from Service or inter-Node connections. For the matched packets, the source MAC address is set to that of the +local Antrea gateway MAC, and the destination MAC address is set to the associated local Pod MAC address. The matched +packets are then forwarded to table [L3DecTTL] to decrease the TTL value. + +Flow 8 matches request packets originating from local Pods and destined for the external network, and then forwards them +to table [EgressMark] dedicated to feature `Egress`. In table [EgressMark], SNAT IPs for Egress are looked up for the packets. +To match the expected packets, `FromPodRegMark` is used to exclude packets that are not from local Pods. +Additionally, `NotAntreaFlexibleIPAMRegMark`, mutually exclusive with `AntreaFlexibleIPAMRegMark` that is used to mark +packets from Antrea IPAM Pods, is used since Egress can only be applied to Node IPAM Pods. + +Flow 9 matches request packets originating from remote Pods and destined for the external network, and then forwards them +to table [EgressMark] dedicated to feature `Egress`. To match the expected packets, `FromTunnelRegMark` is used to +include packets that are from remote Pods through a tunnel. Considering that the packets from remote Pods traverse a +tunnel, the destination MAC address of the packets, represented by the *Global Virtual MAC*, needs to be rewritten to +MAC address of the local Antrea gateway. + +Flow 10 matches packets from Service connections that are originating from the local Antrea gateway and destined for the +external network. This is accomplished by matching `RewriteMACRegMark`, `FromGatewayRegMark`, and `ServiceCTMark`. The +destination MAC address is then set to that of the local Antrea gateway. Additionally, `ToGatewayRegMark` that will be +used with `FromGatewayRegMark` together to identify hairpin connections in table [SNATMark] is loaded. Finally, +the packets are forwarded to table [L3DecTTL]. + +Flow 11 is the table-miss flow, matching packets originating from local Pods and destined for the external network, and +then forwarding them to table [L2ForwardingCalc]. `ToGatewayRegMark` is loaded as the matched packets traverse the +local Antrea gateway. + +### EgressMark + +This table is dedicated to feature `Egress`. It includes flows to select the right SNAT IPs for egress traffic +originating from Pods and destined for the external network. + +If you dump the flows of this table, you may see the following: ```text -table=71, priority=200,ct_state=+new+trk,ip,in_port="pod1-7e503a" actions=set_field:0x1/0xff->pkt_mark,goto_table:80 +1. table=EgressMark, priority=210,ip,nw_dst=192.168.77.102 actions=set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc +2. table=EgressMark, priority=210,ip,nw_dst=192.168.77.103 actions=set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc +3. table=EgressMark, priority=210,ip,nw_dst=10.96.0.0/12 actions=set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc +4. table=EgressMark, priority=200,ip,in_port="client-6-3353ef" actions=set_field:ba:5e:d1:55:aa:c0->eth_src,set_field:aa:bb:cc:dd:ee:ff->eth_dst,set_field:192.168.77.113->tun_dst,set_field:0x10/0xf0->reg0,set_field:0x80000/0x80000->reg0,goto_table:L2ForwardingCalc +5. 
table=EgressMark, priority=200,ct_state=+new+trk,ip,tun_dst=192.168.77.112 actions=set_field:0x1/0xff->pkt_mark,set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc +6. table=EgressMark, priority=200,ct_state=+new+trk,ip,in_port="web-7975-274540" actions=set_field:0x1/0xff->pkt_mark,set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc +7. table=EgressMark, priority=190,ct_state=+new+trk,ip,reg0=0x1/0xf actions=drop +8. table=EgressMark, priority=0 actions=set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc ``` -When the SNAT IP of the Egress is on a remote Node, the flow will tunnel the -packets to the remote Node with the tunnel's destination IP to be the SNAT IP. -The packets will be SNAT'd on the remote Node. The same as a normal tunnel flow -in [L3ForwardingTable], the flow will rewrite the packets' source and -destination MAC addresses, load the SNAT IP to NXM_NX_TUN_IPV4_DST, and send the -packets to [L3DecTTLTable]. +Flows 1-2 match packets originating from local Pods and destined for the transport IP of remote Nodes, and then forward +them to table [L2ForwardingCalc] to skip Egress SNAT. `ToGatewayRegMark` is loaded, indicating that the output port of +the packets is the local Antrea gateway. + +Flow 3 matches packets originating from local Pods and destined for the Service CIDR, and then forwards them to table +[L2ForwardingCalc] to skip Egress SNAT. Similar to flows 1-2, `ToGatewayRegMark` is also loaded. + +Flow 4 match packets originating from local Pods selected by the sample Egress `egress-client`, whose SNAT IP is configured +on a remote Node, which means that the matched packets should be forwarded to the remote Node through a tunnel. Before +sending the packets to the tunnel, the source and destination MAC addresses are set to the local Antrea gateway MAC +and the *Global Virtual MAC* respectively. Additionally, `ToTunnelRegMark`, indicating that the output port is a tunnel, +and `EgressSNATRegMark`, indicating that packets should undergo SNAT on a remote Node, are loaded. Finally, the packets +are forwarded to table [L2ForwardingCalc]. + +Flow 5 matches the first packet of connections originating from remote Pods selected by the sample Egress `egress-web` +whose SNAT IP is configured on the local Node, and then loads an 8-bit ID allocated for the associated SNAT IP defined +in the sample Egress to the `pkt_mark`, which will be identified by iptables on the local Node to perform SNAT with the +SNAT IP. Subsequently, `ToGatewayRegMark`, indicating that the output port is the local Antrea gateway, is loaded. +Finally, the packets are forwarded to table [L2ForwardingCalc]. + +Flow 6 matches the first packet of connections originating from local Pods selected by the sample Egress `egress-web`, +whose SNAT IP is configured on the local Node. Similar to flow 4, the 8-bit ID allocated for the SNAT IP is loaded to +`pkt_mark`, `ToGatewayRegMark` is loaded, and the packets are forwarded to table [L2ForwardingCalc] finally. + +Flow 7 drops packets tunneled from remote Nodes (identified with `FromTunnelRegMark`, indicating that the packets are +from remote Pods through a tunnel). The packets are not matched by any flows 1-6, which means that they are here +unexpected and should be dropped. + +Flow 8 is the table-miss flow, which matches "tracked" and non-new packets from Egress connections and forwards +them to table [L2ForwardingCalc]. `ToGatewayRegMark` is also loaded for these packets. + +### L3DecTTL + +This is the table to decrement TTL for IP packets. 
+ +If you dump the flows of this table, you may see the following: ```text -table=71, priority=200,ct_state=+new+trk,ip,in_port="pod2-357c21" actions=mod_dl_src:e2:e5:a4:9b:1c:b1,mod_dl_dst:aa:bb:cc:dd:ee:ff,load:0x1->NXM_NX_REG1[],set_field:0x10000/0x10000->reg0,load:0xc0a80a66->NXM_NX_TUN_IPV4_DST[],goto_table:72 +1. table=L3DecTTL, priority=210,ip,reg0=0x2/0xf actions=goto_table:SNATMark +2. table=L3DecTTL, priority=200,ip actions=dec_ttl,goto_table:SNATMark +3. table=L3DecTTL, priority=0 actions=goto_table:SNATMark ``` -Last, when a SNAT IP configured for Egresses is on the local Node, an additional -flow is added in [SNATTable] for egress traffic from remote Node that should -use the SNAT IP. The flow matches the tunnel destination IP (which should be -equal to the SNAT IP), and sets the 8 bits ID of the SNAT IP to pkt_mark. +Flow 1 matches packets with `FromGatewayRegMark`, which means that these packets enter the OVS pipeline from the local +Antrea gateway, as the host IP stack should have decremented the TTL already for such packets, TTL should not be +decremented again. + +Flow 2 is to decrement TTL for packets which are not matched by flow 1. + +Flow 3 is the table-miss flow that should remain unused. + +### SNATMark + +This table marks connections requiring SNAT within the OVS pipeline, distinct from Egress SNAT handled by iptables. + +If you dump the flows of this table, you may see the following: ```text -table=71, priority=200,ct_state=+new+trk,ip,tun_dst="192.168.10.101" actions=set_field:0x1/0xff->pkt_mark,goto_table:80 +1. table=SNATMark, priority=200,ct_state=+new+trk,ip,reg0=0x22/0xff actions=ct(commit,table=SNAT,zone=65520,exec(set_field:0x20/0x20->ct_mark,set_field:0x40/0x40->ct_mark)) +2. table=SNATMark, priority=200,ct_state=+new+trk,ip,reg0=0x12/0xff,reg4=0x200000/0x2200000 actions=ct(commit,table=SNAT,zone=65520,exec(set_field:0x20/0x20->ct_mark)) +3. table=SNATMark, priority=190,ct_state=+new+trk,ip,nw_src=10.10.0.23,nw_dst=10.10.0.23 actions=ct(commit,table=SNAT,zone=65520,exec(set_field:0x20/0x20->ct_mark,set_field:0x40/0x40->ct_mark)) +4. table=SNATMark, priority=190,ct_state=+new+trk,ip,nw_src=10.10.0.24,nw_dst=10.10.0.24 actions=ct(commit,table=SNAT,zone=65520,exec(set_field:0x20/0x20->ct_mark,set_field:0x40/0x40->ct_mark)) +5. table=SNATMark, priority=0 actions=goto_table:SNAT ``` -### L3DecTTLTable (72) +Flow 1 matches the first packet of hairpin Service connections, identified by `FromGatewayRegMark` and `ToGatewayRegMark`, +indicating that both the input and output ports of the connections are the local Antrea gateway port. Such hairpin +connections will undergo SNAT with the *Virtual Service IP* in table [SNAT]. Before forwarding the packets to table +[SNAT], `ConnSNATCTMark`, indicating that the connection requires SNAT, and `HairpinCTMark`, indicating that this is +a hairpin connection, are persisted to mark the connections. These two ct marks will be consumed in table [SNAT]. + +Flow 2 matches the first packet of Service connections requiring SNAT, identified by `FromGatewayRegMark` and +`ToTunnelRegMark`, indicating that the input port is the local Antrea gateway and the output port is a tunnel. Such +connections will undergo SNAT with the IP address of the local Antrea gateway in table [SNAT]. Before forwarding the +packets to table [SNAT], `ToExternalAddressRegMark` and `NotDSRServiceRegMark` are loaded, indicating that the packets +are destined for a Service's external IP, like NodePort, LoadBalancerIP or ExternalIP, but it is not DSR mode. 
+Additionally, `ConnSNATCTMark`, indicating that the connection requires SNAT, is persisted to mark the connections. -This is the table to decrement TTL for the IP packets destined to remote Nodes -through a tunnel, or the IP packets received from a tunnel. But for the packets -that enter the OVS pipeline from the local gateway and are destined to a remote -Node, TTL should not be decremented in OVS on the source Node, because the host -IP stack should have already decremented TTL if that is needed. +Flow 3-4 match the first packet of hairpin Service connections, identified by the same source and destination IP +addresses. Such hairpin connections will undergo with the IP address of the local Antrea gateway in table [SNAT]. +Similar to flow 1, `ConnSNATCTMark` and `HairpinCTMark` are persisted to mark the connections. -If you dump the flows for this table, you should see flows like the following: +Flow 5 is the table-miss flow. + +### SNAT + +This table performs SNAT for connections requiring SNAT within the pipeline. + +If you dump the flows of this table, you may see the following: ```text -1. table=72, priority=210,ip,reg0=0x1/0xf, actions=goto_table:80 -2. table=72, priority=200,ip, actions=dec_ttl,goto_table:80 -3. table=72, priority=0, actions=goto_table:80 +1. table=SNAT, priority=200,ct_state=+new+trk,ct_mark=0x40/0x40,ip,reg0=0x2/0xf actions=ct(commit,table=L2ForwardingCalc,zone=65521,nat(src=169.254.0.253),exec(set_field:0x10/0x10->ct_mark,set_field:0x40/0x40->ct_mark)) +2. table=SNAT, priority=200,ct_state=+new+trk,ct_mark=0x40/0x40,ip,reg0=0x3/0xf actions=ct(commit,table=L2ForwardingCalc,zone=65521,nat(src=10.10.0.1),exec(set_field:0x10/0x10->ct_mark,set_field:0x40/0x40->ct_mark)) +3. table=SNAT, priority=200,ct_state=-new-rpl+trk,ct_mark=0x20/0x20,ip actions=ct(table=L2ForwardingCalc,zone=65521,nat) +4. table=SNAT, priority=190,ct_state=+new+trk,ct_mark=0x20/0x20,ip,reg0=0x2/0xf actions=ct(commit,table=L2ForwardingCalc,zone=65521,nat(src=10.10.0.1),exec(set_field:0x10/0x10->ct_mark)) +5. table=SNAT, priority=0 actions=goto_table:L2ForwardingCalc ``` -The first flow is to bypass the TTL decrement for the packets from the gateway -port. +Flow 1 matches the first packet of hairpin Service connections through the local Antrea gateway, identified by +`HairpinCTMark` and `FromGatewayRegMark`. It performs SNAT with the *Virtual Service IP* `169.254.0.253` and forwards +the SNAT'd packets to table [L2ForwardingCalc]. Before SNAT, the "tracked" state of packets is associated with `CtZone`. +After SNAT, their "track" state is associated with `SNATCtZone`, and then `ServiceCTMark` and `HairpinCTMark` persisted +in `CtZone` are not accessible anymore. As a result, `ServiceCTMark` and `HairpinCTMark` need to be persisted once +again, but this time they are persisted in `SNATCtZone` for subsequent tables to consume. + +Flow 2 matches the first packet of hairpin Service connection originating from local Pods, identified by `HairpinCTMark` +and `FromPodRegMark`. It performs SNAT with the IP address of the local Antrea gateway and forwards the SNAT'd packets +to table [L2ForwardingCalc]. Similar to flow 1, `ServiceCTMark` and `HairpinCTMark` are persisted in `SNATCtZone`. + +Flow 3 matches the subsequent request packets of connection whose first request packet has been performed SNAT and then +invoke `ct` action on the packets again to restore the "tracked" state in `SNATCtZone`. The packets with the appropriate +"tracked" state are forwarded to table [L2ForwardingCalc]. 
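To illustrate why flows 1-2 have to persist `ServiceCTMark` and `HairpinCTMark` a second time after SNAT, here is a toy Go model (not OVS or Antrea code; the types and connection string are invented for illustration) of per-zone conntrack state: marks committed in `CtZone` are simply not visible once the packet is re-tracked in `SNATCtZone`.

```go
package main

import "fmt"

// Toy model of why ct marks are per-zone: conntrack state is keyed by zone,
// so a mark committed in CtZone is invisible when the same connection is
// looked up in SNATCtZone.
type connKey struct {
	zone  uint16
	tuple string // 5-tuple, simplified to a string here
}

type conntrack map[connKey]uint32 // value: ct_mark bits

const (
	ctZone     = 65520 // CtZone in this document
	snatCtZone = 65521 // SNATCtZone in this document

	serviceCTMark = 0x10
	hairpinCTMark = 0x40
)

func main() {
	ct := conntrack{}
	conn := "10.10.0.24:45678->169.254.0.253:80/tcp" // hypothetical hairpin connection

	// Marks committed earlier in CtZone by upstream tables.
	ct[connKey{ctZone, conn}] |= serviceCTMark | hairpinCTMark

	// After nat() in SNATCtZone, lookups use the other zone: the marks are not
	// there, which is why table SNAT sets them again when committing in that zone.
	fmt.Printf("marks in CtZone:     0x%x\n", ct[connKey{ctZone, conn}])     // 0x50
	fmt.Printf("marks in SNATCtZone: 0x%x\n", ct[connKey{snatCtZone, conn}]) // 0x0 until re-committed
}
```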
+ +Flow 4 matches the first packet of Service connections requiring SNAT, identified by `ConnSNATCTMark` and +`FromGatewayRegMark`, indicating the connection is destined for an external Service IP initiated through the +Antrea gateway and the Endpoint is a remote Pod. It performs SNAT with the IP address of the local Antrea gateway and +forwards the SNAT'd packets to table [L2ForwardingCalc]. Similar to other flow 1 or 2, `ServiceCTMark` is persisted in +`SNATCtZone`. + +Flow 5 is the table-miss flow. + +### L2ForwardingCalc -### L2ForwardingCalcTable (80) +This is essentially the "dmac" table of the switch. We program one flow for each port (tunnel port, the local Antrea +gateway port, and local Pod ports). -This is essentially the "dmac" table of the switch. We program one flow for each -port (tunnel port, gateway port, and local Pod ports), as you can see if you -dump the flows: +If you dump the flows of this table, you may see the following: ```text -1. table=80, priority=200,dl_dst=aa:bb:cc:dd:ee:ff actions=set_field:0x8000->reg1,set_field:0x10000/0x10000->reg0,goto_table:105 -2. table=80, priority=200,dl_dst=e2:e5:a4:9b:1c:b1 actions=set_field:0x8001->reg1,set_field:0x10000/0x10000->reg0,goto_table:105 -3. table=80, priority=200,dl_dst=12:9e:a6:47:d0:70 actions=set_field:0x3->reg1,set_field:0x10000/0x10000->reg0,goto_table:90 -4. table=80, priority=200,dl_dst=ba:a8:13:ca:ed:cf actions=set_field:0x8002->reg1,set_field:0x10000/0x10000->reg0,goto_table:90 -5. table=80, priority=0 actions=goto_table:105 +1. table=L2ForwardingCalc, priority=200,dl_dst=ba:5e:d1:55:aa:c0 actions=set_field:0x2->reg1,set_field:0x200000/0x600000->reg0,goto_table:TrafficControl +2. table=L2ForwardingCalc, priority=200,dl_dst=aa:bb:cc:dd:ee:ff actions=set_field:0x1->reg1,set_field:0x200000/0x600000->reg0,goto_table:TrafficControl +3. table=L2ForwardingCalc, priority=200,dl_dst=5e:b5:e3:a6:90:b7 actions=set_field:0x24->reg1,set_field:0x200000/0x600000->reg0,goto_table:TrafficControl +4. table=L2ForwardingCalc, priority=200,dl_dst=fa:b7:53:74:21:a6 actions=set_field:0x25->reg1,set_field:0x200000/0x600000->reg0,goto_table:TrafficControl +5. table=L2ForwardingCalc, priority=200,dl_dst=36:48:21:a2:9d:b4 actions=set_field:0x26->reg1,set_field:0x200000/0x600000->reg0,goto_table:TrafficControl +6. table=L2ForwardingCalc, priority=0 actions=goto_table:TrafficControl ``` -For each port flow (1 through 5 in the example above), we set bit 16 of the -NXM_NX_REG0 register to indicate that there was a matching entry for the -destination MAC address and that the packet must be forwarded. In the last table -of the pipeline ([L2ForwardingOutTable]), we will drop all packets for which -this bit is not set. We also use the NXM_NX_REG1 register to store the egress -port for the packet, which will be used as a parameter to the `output` OpenFlow -action in [L2ForwardingOutTable]. - -The packets that match local Pods' MAC entries will go to the first table -([AntreaPolicyIngressRuleTable] when AntreaPolicy is enabled, or -[IngressRuleTable] when AntreaPolicy is not enabled) for NetworkPolicy ingress -rules. Other packets will go to [ConntrackCommitTable]. Specifically, packets -to the gateway port or the tunnel port will also go to [ConntrackCommitTable] -and bypass the NetworkPolicy ingress rule tables, as NetworkPolicy ingress rules -are not enforced for these packets on the source Node. - -What about L2 multicast / broadcast traffic? 
ARP requests will never reach this -table, as they will be handled by the OpenFlow `normal` action in the -[ArpResponderTable]. As for the rest, if it is IP traffic, it will hit the -"last" flow in this table and go to [ConntrackCommitTable]; and finally the last -table of the pipeline ([L2ForwardingOutTable]), and get dropped there since bit -16 of the NXM_NX_REG0 will not be set. Traffic which is non-ARP and non-IP -(assuming any can be received by the switch) is actually dropped much earlier in -the pipeline ([SpoofGuardTable]). In the future, we may need to support more -cases for L2 multicast / broadcast traffic. - -### AntreaPolicyIngressRuleTable (85) - -This table is very similar to [AntreaPolicyEgressRuleTable], but implements -the ingress rules of Antrea-native Policies. Depending on the tier to which the policy -belongs to, the rules will be installed in a table corresponding to that tier. -The ingress table to tier mappings is as follows: +Flow 1 matches packets destined for the local Antrea gateway, identified by the destination MAC address being that of +the local Antrea gateway. It loads `OutputToOFPortRegMark`, indicating that the packets should output to an OVS port, +and also loads port number of the local Antrea gateway to `TargetOFPortField`. Both of these two values will be consumed +in table [Output]. + +Flow 2 matches packets destined for a tunnel, identified by the destination MAC address being that of the *Global Virtual +MAC*. Similar to flow 1, `OutputToOFPortRegMark` is loaded, and the port number of the tunnel is loaded to +`TargetOFPortField`. + +Flows 3-5 match packets destined for local Pods, identified by the destination MAC address being that of the local +Pods. Similar to flow 1, `OutputToOFPortRegMark` is loaded, and the port number of the local Pods is loaded to +`TargetOFPortField`. + +Flow 6 is the table-miss flow. + +### TrafficControl + +This table is dedicated to `TrafficControl`. + +If you dump the flows of this table, you may see the following: ```text -Baseline Tier -> IngressDefaultTable(100) -K8s NetworkPolicy -> IngressRuleTable(90) -All other Tiers -> AntreaPolicyIngressRuleTable(85) +1. table=TrafficControl, priority=210,reg0=0x200006/0x60000f actions=goto_table:Output +2. table=TrafficControl, priority=200,reg1=0x25 actions=set_field:0x22->reg9,set_field:0x800000/0xc00000->reg4,goto_table:IngressSecurityClassifier +3. table=TrafficControl, priority=200,in_port="web-7975-274540" actions=set_field:0x22->reg9,set_field:0x800000/0xc00000->reg4,goto_table:IngressSecurityClassifier +4. table=TrafficControl, priority=200,reg1=0x26 actions=set_field:0x27->reg9,set_field:0x400000/0xc00000->reg4,goto_table:IngressSecurityClassifier +5. table=TrafficControl, priority=200,in_port="db-755c6-5080e3" actions=set_field:0x27->reg9,set_field:0x400000/0xc00000->reg4,goto_table:IngressSecurityClassifier +6. table=TrafficControl, priority=0 actions=goto_table:IngressSecurityClassifier ``` -Again for this table, you will need to keep in mind the ACNP -[specification](#antrea-native-policies-implementation) that we are using. -Since the example ACNP resides in the Application tier, if you dump the flows -for table 85, you should see something like this: +Flow 1 matches packets returned from TrafficControl return ports and forwards them to table [Output], where the packets +are output to the port to which they are destined. 
To identify such packets, `OutputToOFPortRegMark`, indicating that +the packets should be output to an OVS port, and `FromTCReturnRegMark` loaded in table [Classifier], indicating that +the packets are from a TrafficControl return port, are utilized. + +Flow 2 is installed for the sample TrafficControl `redirect-web-to-local`, which marks the packets destined for the Pods +labeled by `app: web` with `TrafficControlRedirectRegMark`, indicating the packets should be redirected to a +TrafficControl target port whose number is loaded to `TrafficControlTargetOFPortField`. + +Flow 3 is also installed for the sample TrafficControl `redirect-web-to-local`. Similar to flow 2, +`TrafficControlRedirectRegMark` is loaded and the TrafficControl target port whose number is loaded to +`TrafficControlTargetOFPortField`. + +Flow 4 is installed for the sample TrafficControl `mirror-db-to-local`, which marks the packets destined for the Pods +labeled by `app: db` with `TrafficControlMirrorRegMark`, indicating the packets should be mirrored to a +TrafficControl target port whose number is loaded to `TrafficControlTargetOFPortField`. + +Flow 5 is also installed for the sample TrafficControl `redirect-web-to-local`. Similar to flow 2, +`TrafficControlRedirectRegMark` is loaded and the TrafficControl target port whose number is loaded to +`TrafficControlTargetOFPortField`. + +Flow 6 is the table-miss flow. + +### IngressSecurityClassifier + +This table is to classify packets before they enter the tables for ingress security. + +If you dump the flows of this table, you may see the following: ```text -1. table=85, priority=64990,ct_state=-new+est,ip actions=resubmit(,105) -2. table=85, priority=14000,conj_id=4,ip actions=load:0x4->NXM_NX_REG3[],load:0x1->NXM_NX_REG0[20],resubmit(,101) -3. table=85, priority=14000,ip,nw_src=10.10.1.7 actions=conjunction(4,1/3) -4. table=85, priority=14000,ip,reg1=0x19c actions=conjunction(4,2/3) -5. table=85, priority=14000,tcp,tp_dst=80 actions=conjunction(4,3/3) -6. table=85, priority=0 actions=resubmit(,90) +1. table=IngressSecurityClassifier, priority=210,pkt_mark=0x80000000/0x80000000,ct_state=-rpl+trk,ip actions=goto_table:ConntrackCommit +2. table=IngressSecurityClassifier, priority=201,reg4=0x80000/0x80000 actions=goto_table:AntreaPolicyIngressRule +3. table=IngressSecurityClassifier, priority=200,reg0=0x20/0xf0 actions=goto_table:IngressMetric +4. table=IngressSecurityClassifier, priority=200,reg0=0x10/0xf0 actions=goto_table:IngressMetric +5. table=IngressSecurityClassifier, priority=200,reg0=0x40/0xf0 actions=goto_table:IngressMetric +6. table=IngressSecurityClassifier, priority=200,ct_mark=0x40/0x40 actions=goto_table:ConntrackCommit +7. table=IngressSecurityClassifier, priority=0 actions=goto_table:AntreaPolicyIngressRule ``` -As for [AntreaPolicyEgressRuleTable], flow 1 (highest priority) ensures that for -established connections packets go straight to IngressMetricsTable, -then [L2ForwardingOutTable], with no other match required. +Flow 1 matches locally generated request packets, identified by `pkt_mark` which is set by iptables in the host network +namespace. It forwards the packets to table [ConntrackCommit] directly to bypass all tables for ingress security. 
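The `pkt_mark=0x80000000/0x80000000` match in flow 1 only tests the highest bit of the 32-bit packet mark set by iptables. As a small illustration of the value/mask semantics used throughout these dumps, here is a Go sketch (not Antrea code; the helper is invented):

```go
package main

import "fmt"

// Illustrative only: flow 1 matches pkt_mark 0x80000000/0x80000000, i.e. it
// only tests bit 31 of the packet mark that iptables sets on locally
// generated packets in the host network namespace.
const (
	localSourceMarkValue uint32 = 0x80000000
	localSourceMarkMask  uint32 = 0x80000000
)

// matches reports whether a packet mark satisfies a value/mask match, the
// same semantics OVS uses for pkt_mark=VALUE/MASK.
func matches(pktMark, value, mask uint32) bool {
	return pktMark&mask == value&mask
}

func main() {
	fmt.Println(matches(0x80000000, localSourceMarkValue, localSourceMarkMask)) // true: host mark bit set
	fmt.Println(matches(0x00000001, localSourceMarkValue, localSourceMarkMask)) // false: host mark bit not set
}
```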
-The rest of the flows read as follows: if the source IP address is in set -{10.10.1.7}, and the destination OF port is in the set {412} (which -correspond to IP addresses {10.10.1.6}), and the destination TCP port -is in the set {80}, then use `conjunction` action with id 4, which loads -the `conj_id` 4 into NXM_NX_REG3, a register used by Antrea internally to -indicate the disposition of the packet is Drop, and forward the packet to -IngressMetricsTable for it to be dropped. +Flow 2 matches packets destined for NodePort Services and forwards them to table [AntreaPolicyIngressRule] to enforce +Antrea-native NetworkPolicies applied to NodePort Services. Without this flow, if the selected Endpoint is not a local +Pod, the packets might be matched by one of the flows 3-5, skipping table [AntreaPolicyIngressRule]. -Otherwise, go to [IngressRuleTable] if no conjunctive flow above priority 0 is matched. -This corresponds to the case where the packet is not matched by any of the Antrea-native -policy ingress rules in any tier (except for the "baseline" tier). -One notable difference is how we use OF ports to identify the destination of -the traffic, while we use IP addresses in [AntreaPolicyEgressRuleTable] to -identify the source of the traffic. More details regarding this can be found -in the following [IngressRuleTable] section. +Flows 3-5 matches packets destined for the local Antrea gateway, tunnel, uplink port with `ToGatewayRegMark`, +`ToTunnelRegMark` or `ToUplinkRegMark`, respectively, and forwards them to table [IngressMetric] directly to bypass +all tables for ingress security. -As seen in [AntreaPolicyEgressRuleTable], the default action is to evaluate K8s -Network Policy [IngressRuleTable] and a AntreaPolicyIngressDefaultTable does not exist. +Flow 5 matches packets from hairpin connections with `HairpinCTMark` and forwards them to table [ConntrackCommit] +directly to bypass all tables for ingress security. Refer to this PR +[#5687](https://github.com/antrea-io/antrea/pull/5687) for more information. -### IngressRuleTable (90) +Flow 6 is the table-miss flow. -This table is very similar to [EgressRuleTable], but implements ingress rules -for Network Policies. Once again, you will need to keep mind the Network Policy -[specification](#network-policy-implementation) that we are using. We have 2 -Pods running on the same Node, with IP addresses 10.10.1.2 to 10.10.1.3. They -are allowed to talk to each other using TCP on port 80, but nothing else. +### AntreaPolicyIngressRule -If you dump the flows for this table, you should see something like this: +This table is very similar to table [AntreaPolicyEgressRule] but implements the ingress rules of Antrea-native +NetworkPolicies. Depending on the tier to which the policy belongs, the rules will be installed in a table corresponding +to that tier. The ingress table to tier mappings is as follows: ```text -1. table=90, priority=210,ct_state=-new+est,ip actions=goto_table:101 -2. table=90, priority=210,pkt_mark=0x1/0x1 actions=goto_table:105 -3. table=90, priority=200,ip,nw_src=10.10.1.2 actions=conjunction(3,1/3) -4. table=90, priority=200,ip,nw_src=10.10.1.3 actions=conjunction(3,1/3) -5. table=90, priority=200,ip,reg1=0x3 actions=conjunction(3,2/3) -6. table=90, priority=200,ip,reg1=0x8002 actions=conjunction(3,2/3) -7. table=90, priority=200,tcp,tp_dst=80 actions=conjunction(3,3/3) -8. table=90, priority=190,conj_id=3,ip actions=load:0x3->NXM_NX_REG6[],ct(commit,table=101,zone=65520,exec(load:0x3->NXM_NX_CT_LABEL[0..31])) -9. 
table=90, priority=0 actions=goto_table:100 +K8s NetworkPolicy -> IngressRule +Antrea-native NetworkPolicy other Tiers -> AntreaPolicyIngressRule +Antrea-native NetworkPolicy Baseline Tier -> IngressDefaultRule ``` -As for [EgressRuleTable], flow 1 (highest priority) ensures that for established -connections - as a reminder all connections are committed in -[ConntrackCommitTable] - packets go straight to IngressMetricsTable, -then [L2ForwardingOutTable], with no other match required. - -Flow 2 ensures that the traffic initiated from the host network namespace cannot -be dropped because of Network Policies. This ensures that K8s [liveness -probes](https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/) -can go through. An iptables rule in the mangle table of the host network -namespace is responsible for marking the locally-generated packets with the -`0x1/0x1` mark. Note that the flow will be different for Windows worker Node or -when OVS userspace (netdev) datapath is used. This is because either there is no -way to add mark for particular traffic (i.e. Windows) or matching the mark in -OVS is not properly supported (i.e. netdev datapath). As a result, the flow will -match source IP instead, however, NodePort Service access by external clients -will be masqueraded as a local gateway IP to bypass Network Policies. This may -be fixed after AntreaProxy can serve NodePort traffic. - -The rest of the flows read as follows: if the source IP address is in set -{10.10.1.2, 10.10.1.3}, and the destination OF port is in the set {3, 4} (which -correspond to IP addresses {10.10.1.2, 10.10.1.3}, and the destination TCP port -is in the set {80}, then use `conjunction` action with id 3, which stores the -`conj_id` 3 in `ct_label[0..31]` for egress metrics collection purposes, and forwards -the packet to IngressMetricsTable, then [L2ForwardingOutTable]. Otherwise, go to -[IngressDefaultTable]. One notable difference is how we use OF ports to identify -the destination of the traffic, while we use IP addresses in [EgressRuleTable] -to identify the source of the traffic. We do this as an increased security measure -in case a local Pod is misbehaving and trying to access another local Pod using -the correct destination MAC address but a different destination IP address to bypass -an egress Network Policy rule. This is also why the Network Policy ingress rules -are enforced after the egress port has been determined. - -### IngressDefaultTable (100) - -This table is similar in its purpose to [EgressDefaultTable], and it complements -[IngressRuleTable] for Network Policy ingress rule implementation. In K8s, when -a Network Policy is applied to a set of Pods, the default behavior for these -Pods become "deny" (it becomes an [isolated -Pod](https://kubernetes.io/docs/concepts/services-networking/network-policies/#isolated-and-non-isolated-pods)). This -table is in charge of dropping traffic destined to Pods to which a Network -Policy (with an ingress rule) is applied, and which did not match any of the -allowlist rules. 
- -Accordingly, based on our Network Policy example, we would expect to see flows -to drop traffic destined to our 2 Pods (3 and 4), which is confirmed by dumping -the flows: +Again for this table, you will need to keep in mind the Antrea-native NetworkPolicy +[specification](#antrea-native-networkpolicy-implementation) and Antrea-native L7 NetworkPolicy +[specification](#antrea-native-l7-networkpolicy-implementation) that we are using that we are using. Since these sample +ingress policies reside in the Application Tier, if you dump the flows for this table, you may see the following: ```text -1. table=100, priority=200,ip,reg1=0x3 actions=drop -2. table=100, priority=200,ip,reg1=0x8002 actions=drop -3. table=100, priority=0 actions=goto_table:105 +1. table=AntreaPolicyIngressRule, priority=64990,ct_state=-new+est,ip actions=goto_table:IngressMetric +2. table=AntreaPolicyIngressRule, priority=64990,ct_state=-new+rel,ip actions=goto_table:IngressMetric +3. table=AntreaPolicyIngressRule, priority=14500,reg1=0x7 actions=conjunction(14,2/3) +4. table=AntreaPolicyIngressRule, priority=14500,ip,nw_src=10.10.0.26 actions=conjunction(14,1/3) +5. table=AntreaPolicyIngressRule, priority=14500,tcp,tp_dst=8080 actions=conjunction(14,3/3) +6. table=AntreaPolicyIngressRule, priority=14500,conj_id=14,ip actions=set_field:0xd->reg6,ct(commit,table=IngressMetric,zone=65520,exec(set_field:0xd/0xffffffff->ct_label,set_field:0x80/0x80->ct_mark,set_field:0x20000000000000000/0xfff0000000000000000->ct_label)) +7. table=AntreaPolicyIngressRule, priority=14600,ip,nw_src=10.10.0.26 actions=conjunction(6,1/3) +8. table=AntreaPolicyIngressRule, priority=14600,reg1=0x25 actions=conjunction(6,2/3) +9. table=AntreaPolicyIngressRule, priority=14600,tcp,tp_dst=80 actions=conjunction(6,3/3) +10. table=AntreaPolicyIngressRule, priority=14600,conj_id=6,ip actions=set_field:0x6->reg6,ct(commit,table=IngressMetric,zone=65520,exec(set_field:0x6/0xffffffff->ct_label)) +11. table=AntreaPolicyIngressRule, priority=14600,ip actions=conjunction(4,1/2) +12. table=AntreaPolicyIngressRule, priority=14599,reg1=0x25 actions=conjunction(4,2/2) +13. table=AntreaPolicyIngressRule, priority=14599,conj_id=4 actions=set_field:0x4->reg3,set_field:0x400/0x400->reg0,goto_table:IngressMetric +14. table=AntreaPolicyIngressRule, priority=0 actions=goto_table:IngressRule ``` -Similar to the [EgressDefaultTable], this table is also used to implement -Antrea-native policy ingress rules that are created in the Baseline Tier. -Since the Baseline Tier is meant to be enforced after K8s NetworkPolicies, the -corresponding flows will be created at a lower priority than K8s default drop flows. -For example, a baseline rule to isolate ingress traffic for a Namespace will look -like the following: +Flows 1-2, which are installed by default with the highest priority, matching non-new and "tracked" packets and +forwarding them to table [IngressMetric] to bypass the check from egress rules. This means that if a connection is +established, its packets go straight to table [IngressMetric], with no other match required. In particular, this ensures +that reply traffic is never dropped because of an Antrea-native NetworkPolicy or K8s NetworkPolicy rule. However, this +also means that ongoing connections are not affected if the Antrea-native NetworkPolicy or the K8s NetworkPolicy is +updated. 
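Before the per-flow walkthrough below, a toy Go sketch (illustrative only, not OVS code; the types are invented) of how the `conjunction` flows in this table combine independent dimensions: each dimension is matched by its own flows, and the `conj_id` flow fires only when every dimension has a match. The values correspond to `conj_id` 6 (rule `AllowFromClient`) in the dump above.

```go
package main

import "fmt"

// Toy illustration of conjunctive matching: a packet hits the conj_id flow
// only if every dimension of that conjunction has at least one matching flow.
type packet struct {
	srcIP   string
	dstPort uint16
	outPort uint32
}

// The three dimensions of conj_id 6, taken from the flow dump above.
var (
	srcIPs   = map[string]bool{"10.10.0.26": true} // dimension 1: Pods labeled app: client
	outPorts = map[uint32]bool{0x25: true}         // dimension 2: OVS ports of Pods labeled app: web
	dstPorts = map[uint16]bool{80: true}           // dimension 3: destination TCP port 80
)

func matchesConjunction6(p packet) bool {
	return srcIPs[p.srcIP] && outPorts[p.outPort] && dstPorts[p.dstPort]
}

func main() {
	fmt.Println(matchesConjunction6(packet{"10.10.0.26", 80, 0x25}))  // true: all three dimensions match
	fmt.Println(matchesConjunction6(packet{"10.10.0.26", 443, 0x25})) // false: port 443 is not in dimension 3
}
```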
+ +Similar to table [AntreaPolicyEgressRule], the priorities of flows 3-13 installed for the ingress rules are decided by +the following: + +- The `spec.tier` value in an Antrea-native NetworkPolicy determines the primary level for flow priority. +- The `spec.priority` value in an Antrea-native NetworkPolicy determines the secondary level for flow priority within + the same `spec.tier`. A lower value in this field corresponds to a higher priority for the flow. +- The rule's position within an Antrea-native NetworkPolicy also influences flow priority. Rules positioned closer to + the beginning have higher priority for the flow. + +Flows 3-6, whose priories are all 14500, are installed for the egress rule `AllowFromClientL7` in the sample policy. +These flows are described as follows: + +- Flow 3 is used to match packets with the source IP address in set {10.10.0.26}, which has all IP addresses of the + Pods selected by the label `app: client`, constituting the first dimension for `cojunction` with `conj_id` 14. +- Flow 4 is used to match packets with the output OVS port in set {0x25}, which has all the ports of the Pods selected + by the label `app: web`, constituting the second dimension for `conjunction` with `conj_id` 14. +- Flow 5 is used to match packets with the destination TCP port in set {8080} specified in the rule, constituting the + third dimension for `conjunction` with `conj_id` 14. +- Flow 6 is used to match packets meeting all the three dimensions of `conjunction` with `conj_id` 14 and forward them + to table [IngressMetric], persisting `conj_id` to `IngressRuleCTLabel` consumed in table [IngressMetric]. + Additionally, for the L7 protocol: + - `L7NPRedirectCTMark` is persisted, indicating the packets should be redirected to an application-aware engine to + be filtered according to L7 rules, such as method `GET` and path `/api/v2/*` in the sample policy. + - A VLAN ID allocated for the Antrea-native L7 NetworkPolicy is persisted in `L7NPRuleVlanIDCTLabel`, which will be + consumed in table [Output]. + +Flows 7-11, whose priorities are 14600, are installed for the egress rule `AllowFromClient` in the sample policy. +These flows are described as follows: + +- Flow 7 is used to match packets with the source IP address in set {10.10.0.26}, which has all IP addresses of the Pods + selected by the label `app: client`, constituting the first dimension for `cojunction` with `conj_id` 6. +- Flow 8 is used to match packets with the output OVS port in set {0x25}, which has all the ports of the Pods selected + by the label `app: web`, constituting the second dimension for `conjunction` with `conj_id` 6. +- Flow 9 is used to match packets with the destination TCP port in set {80} specified in the rule, constituting the + third dimension for `conjunction` with `conj_id` 6. +- Flow 10 is used to match packets meeting all the three dimensions of `conjunction` with `conj_id` 6 and forward + them to table [IngressMetric], persisting `conj_id` to `IngressRuleCTLabel` consumed in table [IngressMetric]. + +Flows 11-13, whose priorities are all 14599, are installed for the egress rule with a `Drop` action defined after the +rule `AllowFromClient` in the sample policy, serves as a default rule. Unlike the default of K8s NetworkPolicy, +Antrea-native NetworkPolicy has no default rule, and all rules should be explicitly defined. Hence, they are evaluated +as-is, and there is no need for a table [AntreaPolicyIngressDefaultRule]. 
These flows are described as follows: + +- Flow 11 is used to match any packets, constituting the second dimension for `conjunction` with `conj_id` 4. +- Flow 12 is used to match packets with the output OVS port in set {0x25}, which has all the ports of the Pods + selected by the label `app: web`, constituting the first dimension for `conjunction` with `conj_id` 4. +- Flow 13 is used to match packets meeting both dimensions of `conjunction` with `conj_id` 4. `APDenyRegMark` that + will be consumed in table [IngressMetric] to which the packets are forwarded is loaded. + +Flow 14 is the table-miss flow to forward packets not matched by other flows to table [IngressMetric]. + +### IngressRule + +This table is very similar to table [EgressRule] but implements ingress rules for K8s NetworkPolicies. Once again, you +will need to keep in mind the K8s NetworkPolicy [specification](#kubernetes-networkpolicy-implementation) that we are +using. + +If you dump the flows of this table, you should see something like this: ```text -table=100, priority=80,ip,reg1=0xb actions=conjunction(6,2/3) -table=100, priority=80,ip,reg1=0xc actions=conjunction(6,2/3) -table=100, priority=80,ip,nw_src=10.10.1.9 actions=conjunction(6,1/3) -table=100, priority=80,ip,nw_src=10.10.1.7 actions=conjunction(6,1/3) -table=100, priority=80,tcp,tp_dst=8080 actions=conjunction(6,3/3) -table=100, priority=80,conj_id=6,ip actions=load:0x6->NXM_NX_REG3[],load:0x1->NXM_NX_REG0[20],resubmit(,101) +1. table=IngressRule, priority=200,ip,nw_src=10.10.0.26 actions=conjunction(3,1/3) +2. table=IngressRule, priority=200,reg1=0x25 actions=conjunction(3,2/3) +3. table=IngressRule, priority=200,tcp,tp_dst=80 actions=conjunction(3,3/3) +4. table=IngressRule, priority=190,conj_id=3,ip actions=set_field:0x3->reg6,ct(commit,table=IngressMetric,zone=65520,exec(set_field:0x3/0xffffffff->ct_label)) +5. table=IngressRule, priority=0 actions=goto_table:IngressDefaultRule ``` -The table-miss flow entry, which is used for non-isolated Pods, forwards -traffic to the next table ([ConntrackCommitTable]). +Flows 1-4 are installed for the ingress rule in the sample K8s NetworkPolicy. These flows are described as follows: + +- Flow 1 is used to match packets with the source IP address in set {10.10.0.26}, which is from the Pods selected + by the label `app: client`, constituting the first dimension for `conjunction` with `conj_id` 3. +- Flow 2 is used to match packets with the output port OVS in set {0x25}, which has all ports of the Pods selected + by the label `app: web`, constituting the second dimension for `conjunction` with `conj_id` 3. +- Flow 3 is used to match packets with the destination TCP port in set {80} specified in the rule, constituting + the third dimension for `conjunction` with `conj_id` 3. +- Flow 4 is used to match packets meeting all the three dimensions of `conjunction` with `conj_id` 3 and forward + them to table [IngressMetric], persisting `conj_id` to `IngressRuleCTLabel`. -### ConntrackCommitTable (105) +Flow 5 is the table-miss flow to forward packets not matched by other flows to table [IngressDefaultRule]. -As mentioned before, this table is in charge of committing all new connections -which are not dropped because of Network Policies. If you dump the flows for this -table, you should see something like this: +### IngressDefaultRule + +This table is similar in its purpose to table [EgressDefaultRule], and it complements table [IngressRule] for K8s +NetworkPolicy ingress rule implementation. 
In Kubernetes, when a NetworkPolicy is applied to a set of Pods, the default +behavior for these Pods becomes "deny" (they become [isolated +Pods](https://kubernetes.io/docs/concepts/services-networking/network-policies/#isolated-and-non-isolated-pods)). This +table is in charge of dropping traffic destined for Pods to which a NetworkPolicy (with an ingress rule) is applied, +and which did not match any of the allow list rules. + +If you dump the flows of this table, you may see the following: ```text -1. table=105, priority=200,ct_state=+new+trk,ip,reg0=0x1/0xf actions=ct(commit,table=108,zone=65520,exec(load:0x20->NXM_NX_CT_MARK[])) -2. table=105, priority=190,ct_state=+new+trk,ip actions=ct(commit,table=108,zone=65520) -3. table=105, priority=0 actions=goto_table:108 +1. table=IngressDefaultRule, priority=200,reg1=0x25 actions=drop +2. table=IngressDefaultRule, priority=0 actions=goto_table:IngressMetric ``` -Flow 1 ensures that we commit connections initiated through the gateway -interface and mark them with a `ct_mark` of `0x20`. This ensures that -[ConntrackStateTable] can perform its functions correctly and rewrite the -destination MAC address to the gateway's MAC address for connections which -require it. Such connections include Pod-to-ClusterIP traffic. Note that the -`0x20` mark is applied to *all* connections initiated through the gateway -(i.e. for which the first packet of the connection was received through the -gateway) and that [ConntrackStateTable] will perform the destination MAC address -for the reply traffic of *all* such connections. In some cases (the ones -described for [ConntrackStateTable]), this rewrite is necessary. For others -(e.g. a connection from the host to a local Pod), this rewrite is not necessary -but is also harmless, as the destination MAC is already correct. +Flow 1, based on our sample K8s NetworkPolicy, is to drop traffic destined for OVS port 0x25, the port number associated +with a Pod selected by the label `app: web`. -Flow 2 commits all other new connections. +Flow 2 is the table-miss flow to forward packets to table [IngressMetric]. -All traffic then goes to [HairpinSNATTable]. +This table is also used to implement Antrea-native NetworkPolicy ingress rules created in the Baseline Tier. +Since the Baseline Tier is meant to be enforced after K8s NetworkPolicies, the corresponding flows will be created at a +lower priority than K8s NetworkPolicy default drop flows. These flows are similar to flows 3-9 in table +[AntreaPolicyIngressRule]. -### HairpinSNATTable (108) +### IngressMetric -The table is used to handle Service hairpin case, which indicates that the -packet should be output to the port on which it was received. +This table is very similar to table [EgressMetric], but used to collect ingress metrics for Antrea-native NetworkPolicies. -If you dump the flows for this table, you should see the flows: +If you dump the flows of this table, you may see the following: ```text -1. table=108, priority=200,ip,nw_src=10.10.0.4,nw_dst=10.10.0.4 actions=mod_nw_src:169.254.169.252,load:0x1->NXM_NX_REG0[18],resubmit(,110) -2. table=108, priority=200,ip,nw_src=10.10.0.2,nw_dst=10.10.0.2 actions=mod_nw_src:169.254.169.252,load:0x1->NXM_NX_REG0[18],resubmit(,110) -3. table=108, priority=200,ip,nw_src=10.10.0.3,nw_dst=10.10.0.3 actions=mod_nw_src:169.254.169.252,load:0x1->NXM_NX_REG0[18],resubmit(,110) -4. table=108, priority=0 actions=resubmit(,110) +1. 
table=IngressMetric, priority=200,ct_state=+new,ct_label=0x3/0xffffffff,ip actions=goto_table:ConntrackCommit +2. table=IngressMetric, priority=200,ct_state=-new,ct_label=0x3/0xffffffff,ip actions=goto_table:ConntrackCommit +3. table=IngressMetric, priority=200,ct_state=+new,ct_label=0x6/0xffffffff,ip actions=goto_table:ConntrackCommit +4. table=IngressMetric, priority=200,ct_state=-new,ct_label=0x6/0xffffffff,ip actions=goto_table:ConntrackCommit +5. table=IngressMetric, priority=200,reg0=0x400/0x400,reg3=0x4 actions=drop +6. table=IngressMetric, priority=0 actions=goto_table:ConntrackCommit ``` -Flow 1-3 are used to match Service packets from Pods. The source IP of the matched -packets by flow 1-3 should be SNAT'd with a virtual hairpin IP since the source and -destination IP addresses should not be the same. Without SNAT, response packets from -a Pod will not be forwarded back to OVS pipeline as the destination IP is the Pod's -own IP, then the connection is interrupted because the conntrack state is only stored -in OVS ct zone, not in the Pod. With SNAT, the destination IP will be the virtual -hairpin IP and forwarded back to OVS pipeline. Note that, bit 18 in NXM_NX_REG0 is -set to 0x1, and it is consumed in [L2ForwardingOutTable] to output the packet -to the port on which it was received with action `IN_PORT`. +Flows 1-2, matching packets with `IngressRuleCTLabel` set to 3 (the `conj_id` allocated for the sample K8s NetworkPolicy +ingress rule and loaded in table [IngressRule] flow 4), are used to collect metrics for the ingress rule. + +Flows 3-4, matching packets with `IngressRuleCTLabel` set to 6 (the `conj_id` allocated for the sample Antrea-native +NetworkPolicy ingress rule and loaded in table [AntreaPolicyIngressRule] flow 10), are used to collect metrics for the +ingress rule. + +Flow 5 is the drop rule for the sample Antrea-native NetworkPolicy ingress rule. It drops the packets by matching +`APDenyRegMark` loaded in table [AntreaPolicyIngressRule] flow 13 and `APConjIDField` set to 4 which is the `conj_id` +allocated for the ingress rule and loaded in table [AntreaPolicyIngressRule] flow 13. -### L2ForwardingOutTable (110) +Flow 6 is the table-miss flow. -It is a simple table and if you dump the flows for this table, you should only -see 2 flows: +### ConntrackCommit + +This table is in charge of committing non-Service connections in `CtZone`. + +If you dump the flows of this table, you may see the following: ```text -1. table=110, priority=200,ip,reg0=0x10000/0x10000 actions=output:NXM_NX_REG1[] -2. table=110, priority=0, actions=drop +1. table=ConntrackCommit, priority=200,ct_state=+new+trk-snat,ct_mark=0/0x10,ip actions=ct(commit,table=Output,zone=65520,exec(move:NXM_NX_REG0[0..3]->NXM_NX_CT_MARK[0..3])) +2. table=ConntrackCommit, priority=0 actions=goto_table:Output ``` -The first flow outputs all unicast packets to the correct port (the port was -resolved by the "dmac" table, [L2ForwardingCalcTable]). IP packets for which -[L2ForwardingCalcTable] did not set bit 16 of NXM_NX_REG0 will be dropped. +Flow 1 is designed to match the first packet of non-Service connections with the "tracked" state and `NotServiceCTMark`. +Then it commits the relevant connections in `CtZone`, persisting the value of `PktSourceField` to +`ConnSourceCTMarkField`, and forwards the packets to table [Output]. -## Tables (AntreaProxy is disabled) +Flow 2 is the table-miss flow. 
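The `move:NXM_NX_REG0[0..3]->NXM_NX_CT_MARK[0..3]` action in flow 1 is plain bit copying. A minimal Go sketch (illustrative only, not Antrea code) of the same operation, copying `PktSourceField` into `ConnSourceCTMarkField`:

```go
package main

import "fmt"

// Illustrative only: the ConntrackCommit flow copies reg0[0..3]
// (PktSourceField, e.g. 1 for tunnel, 2 for gateway, 3 for local Pods as used
// elsewhere in this document) into ct_mark[0..3] (ConnSourceCTMarkField).
func commitSourceMark(reg0, ctMark uint32) uint32 {
	const fieldMask = 0xf // bits 0..3
	return (ctMark &^ fieldMask) | (reg0 & fieldMask)
}

func main() {
	reg0 := uint32(0x200002) // e.g. OutputToOFPortRegMark (bit 21) set, packet source field = 2 (gateway)
	fmt.Printf("ct_mark after commit: 0x%x\n", commitSourceMark(reg0, 0)) // 0x2
}
```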
-![OVS pipeline](../assets/ovs-pipeline.svg) +### Output -### DNATTable (40) +This is the final table in the pipeline, responsible for handling the output of packets from OVS. It addresses the +following cases: -This table is created only when AntreaProxy is disabled. Its only job is to -send traffic destined to Services through the local gateway interface, without any -modifications. kube-proxy will then take care of load-balancing the connections -across the different backends for each Service. +1. Output packets to an application-aware engine for further L7 protocol processing. +2. Output packets to a target port and a mirroring port defined in a TrafficControl CR with `Mirror` action. +3. Output packets to a port defined in a TrafficControl CR with `Redirect` action. +4. Output packets from hairpin connections to the ingress port where they were received. +5. Output packets to a target port. +6. Output packets to the OpenFlow controller (Antrea Agent). +7. Drop packets. -If you dump the flows for this table, you should see something like this: +If you dump the flows of this table, you may see the following: ```text -1. table=40, priority=200,ip,nw_dst=10.96.0.0/12 actions=set_field:0x8001->reg1,load:0x1->NXM_NX_REG0[16],goto_table:105 -2. table=40, priority=0 actions=goto_table:45 +1. table=Output, priority=212,ct_mark=0x80/0x80,reg0=0x200000/0x600000 actions=push_vlan:0x8100,move:NXM_NX_CT_LABEL[64..75]->OXM_OF_VLAN_VID[],output:"antrea-l7-tap0" +2. table=Output, priority=211,reg0=0x200000/0x600000,reg4=0x400000/0xc00000 actions=output:NXM_NX_REG1[],output:NXM_NX_REG9[] +3. table=Output, priority=211,reg0=0x200000/0x600000,reg4=0x800000/0xc00000 actions=output:NXM_NX_REG9[] +4. table=Output, priority=210,ct_mark=0x40/0x40 actions=IN_PORT +5. table=Output, priority=200,reg0=0x200000/0x600000 actions=output:NXM_NX_REG1[] +6. table=Output, priority=200,reg0=0x2400000/0xfe600000 actions=meter:256,controller(reason=no_match,id=62373,userdata=01.01) +7. table=Output, priority=200,reg0=0x4400000/0xfe600000 actions=meter:256,controller(reason=no_match,id=62373,userdata=01.02) +8. table=Output, priority=0 actions=drop ``` -In the example above, 10.96.0.0/12 is the Service CIDR (this is the default -value used by `kubeadm init`). This flow is not actually required for -forwarding, but to bypass [EgressRuleTable] and [EgressDefaultTable] for Service -traffic on its way to kube-proxy through the gateway. If we omitted this flow, -such traffic would be unconditionally dropped if a Network Policy is applied on -the originating Pod. For such traffic, we instead enforce Network Policy egress -rules when packets come back through the gateway and the destination IP has been -rewritten by kube-proxy (DNAT to a backend for the Service). We cannot output -the Service traffic to the gateway port directly as we haven't committed the -connection yet; instead we store the port in NXM_NX_REG1 - similarly to how we -process non-Service traffic in [L2ForwardingCalcTable] - and forward it to -[ConntrackCommitTable]. By committing the connection we ensure that reply -traffic (traffic from the Service backend which has already gone through -kube-proxy for source IP rewrite) will not be dropped because of Network -Policies. - -The table-miss flow entry (flow 2) for this table forwards all non-Service -traffic to the next table, [AntreaPolicyEgressRuleTable]. 
- -[ClassifierTable]: #classifiertable-0 -[SpoofGuardTable]: #spoofguardtable-10 -[ARPResponderTable]: #arprespondertable-20 -[ServiceHairpinTable]: #servicehairpintable-23 -[ConntrackTable]: #conntracktable-30 -[ConntrackStateTable]: #conntrackstatetable-31 -[DNATTable]: #dnattable-40 -[SessionAffinityTable]: #sessionaffinitytable-40 -[ServiceLBTable]: #servicelbtable-41 -[EndpointDNATTable]: #endpointdnattable-42 -[AntreaPolicyEgressRuleTable]: #antreapolicyegressruletable-45 -[EgressRuleTable]: #egressruletable-50 -[EgressDefaultTable]: #egressdefaulttable-60 -[L3ForwardingTable]: #l3forwardingtable-70 -[SNATTable]: #snattable-71 -[L3DecTTLTable]: #l3decttltable-72 -[L2ForwardingCalcTable]: #l2forwardingcalctable-80 -[AntreaPolicyIngressRuleTable]: #antreapolicyingressruletable-85 -[IngressRuleTable]: #ingressruletable-90 -[IngressDefaultTable]: #ingressdefaulttable-100 -[ConntrackCommitTable]: #conntrackcommittable-105 -[HairpinSNATTable]: #hairpinsnattable-108 -[L2ForwardingOutTable]: #l2forwardingouttable-110 +Flow 1 is for case 1. It matches packets with `L7NPRedirectCTMark` and `OutputToOFPortRegMark`, and then outputs them to +the port `antrea-l7-tap0` specifically created for connecting to an application-aware engine. Notably, these packets are pushed +with an 802.1Q header and loaded with the VLAN ID value persisted in `L7NPRuleVlanIDCTLabel` before being output, due to +the implementation of `L7NetworkPolicy`. + +Flow 2 is for case 2. It matches packets with `TrafficControlMirrorRegMark` and `OutputToOFPortRegMark`, and then +outputs them to the port specified in `TargetOFPortField` and the port specified in `TrafficControlTargetOFPortField`. + +Flow 3 is for case 3. It matches packets with `TrafficControlRedirectRegMark` and `OutputToOFPortRegMark`, and then +outputs them to the port specified in `TrafficControlTargetOFPortField`. + +Flow 4 is for case 4. It matches packets from hairpin connections by matching `HairpinCTMark` and outputs them back to the +port where they were received. + +Flow 5 is for case 5. It matches packets by matching `OutputToOFPortRegMark` and outputs them to the OVS port specified by +the value stored in `TargetOFPortField`. + +Flows 6-7 are for case 6. They match packets by matching `OutputToControllerRegMark` and the value stored in +`PacketInOperationField`, then output them to the OpenFlow controller (Antrea Agent) with corresponding user data. + +Flow 8 is the table-miss flow for case 7. It drops packets that do not match any of the flows in this table. 
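If you want to check which of these output decisions applies to a given packet, `ovs-appctl ofproto/trace` can replay a synthetic packet through the whole pipeline and print the flow matched in each table, including this one. A minimal sketch is shown below; the antrea-agent Pod name, the OVS input port number, and the addresses are hypothetical and must be replaced with values from your own cluster (the IPs reuse the sample client and web Pods from this document):

```text
# run inside the antrea-ovs container of the antrea-agent Pod on the Node of interest
kubectl exec -n kube-system antrea-agent-xxxxx -c antrea-ovs -- \
    ovs-appctl ofproto/trace br-int \
    'in_port=37,tcp,nw_src=10.10.0.26,nw_dst=10.10.0.24,tp_dst=80'
```

Depending on which tables you want to exercise, you may also need to provide realistic `dl_src` / `dl_dst` values (for example, to get past table [SpoofGuard], which matches the input port, MAC address, and IP address of local Pods).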
+ +[ARPSpoofGuard]: #arpspoofguard +[AntreaPolicyEgressRule]: #antreapolicyegressrule +[AntreaPolicyIngressRule]: #antreapolicyingressrule +[Classifier]: #classifier +[ClusterIP without Endpoint]: #clusterip-without-endpoint +[ClusterIP]: #clusterip +[ConntrackState]: #conntrackstate +[ConntrackZone]: #conntrackzone +[Ct Labels]: #ovs-ct-label +[Ct Marks]: #ovs-ct-mark +[Ct Zones]: #ovs-ct-zone +[EgressDefaultRule]: #egressdefaultrule +[EgressMark]: #egressmark +[EgressMetric]: #egressmetric +[EgressRule]: #egressrule +[EndpointDNAT]: #endpointdnat +[IngressDefaultRule]: #ingressdefaultrule +[IngressMetric]: #ingressmetric +[IngressRule]: #ingressrule +[L2ForwardingCalc]: #l2forwardingcalc +[L3DecTTL]: #l3decttl +[L3Forwarding]: #l3forwarding +[LoadBalancer]: #loadbalancer +[NodePort]: #nodeport +[NodePortMark]: #nodeportmark +[OVS Registers]: #ovs-registers +[Output]: #output +[PreRoutingClassifier]: #preroutingclassifier +[SNATMark]: #snatmark +[SNAT]: #snat +[Service with ExternalIP]: #service-with-externalip +[Service with ExternalTrafficPolicy Local]: #service-with-externaltrafficpolicy-local +[Service with Session Affinity]: #service-with-session-affinity +[ServiceLB]: #servicelb +[SessionAffinity]: #sessionaffinity +[SpoofGuard]: #spoofguard +[TrafficControl]: #trafficcontrol +[UnSNAT]: #unsnat diff --git a/pkg/agent/openflow/fields.go b/pkg/agent/openflow/fields.go index 47381d5ad0b..78f0845a143 100644 --- a/pkg/agent/openflow/fields.go +++ b/pkg/agent/openflow/fields.go @@ -109,12 +109,12 @@ var ( APConjIDField = binding.NewRegField(3, 0, 31) // reg4(NXM_NX_REG4) - // reg4[0..15]: Field to store the selected Service Endpoint port. + // reg4[0..15]: Field to store the selected Service Endpoint port number. EndpointPortField = binding.NewRegField(4, 0, 15) // reg4[16..18]: Field to store the state of a packet accessing a Service. Marks in this field include: - // - 0b001: packet need to do service selection. - // - 0b010: packet has done service selection. - // - 0b011: packet has done service selection and the selection result needs to be cached. + // - 0b001: packet needs to do Endpoint selection. + // - 0b010: packet has done Endpoint selection. + // - 0b011: packet has done Endpoint selection and the selection result needs to be cached. ServiceEPStateField = binding.NewRegField(4, 16, 18) EpToSelectRegMark = binding.NewRegMark(ServiceEPStateField, 0b001) EpSelectedRegMark = binding.NewRegMark(ServiceEPStateField, 0b010) From e2ef40f808e7d5f1254a6db076024a9ed230a513 Mon Sep 17 00:00:00 2001 From: Hongliang Liu <75655411+hongliangl@users.noreply.github.com> Date: Wed, 24 Apr 2024 17:11:19 +0800 Subject: [PATCH 2/5] For review comments Signed-off-by: Hongliang Liu --- docs/design/ovs-pipeline.md | 327 ++++++++++++++++++++---------------- 1 file changed, 181 insertions(+), 146 deletions(-) diff --git a/docs/design/ovs-pipeline.md b/docs/design/ovs-pipeline.md index 356531f56cf..b3c514ecd9b 100644 --- a/docs/design/ovs-pipeline.md +++ b/docs/design/ovs-pipeline.md @@ -473,7 +473,7 @@ document. This ACNP is applied to all Pods with the label `app: web` in all Namespaces. It allows only HTTP ingress traffic on port 8080 from Pods with the label `app: client`, limited to the `GET` method and `/api/v2/*` path. Any other HTTP -ingress traffic on port 8080 from Pods the label `app: client` will be dropped. +ingress traffic on port 8080 from Pods with the label `app: client` will be dropped. 
```yaml apiVersion: crd.antrea.io/v1beta1 @@ -509,8 +509,10 @@ spec: Antrea creates a dedicated table [TrafficControl] to implement feature `TrafficControl`. We will use the following TrafficControls as examples for the remainder of this document. +### TrafficControl for Packet Redirecting + This is a TrafficControl applied to Pods with the label `app: web`. For these Pods, both ingress and egress traffic will -be redirected to port `antrea-tc-tap0`, and returned back through port `antrea-tc-tap1`. +be redirected to port `antrea-tc-tap0`, and returned through port `antrea-tc-tap1`. ```yaml apiVersion: crd.antrea.io/v1alpha2 @@ -532,6 +534,8 @@ spec: name: antrea-tc-tap1 ``` +### TrafficControl for Packet Mirroring + This is a TrafficControl applied to Pods with the label `app: db`. For these Pods, both ingress and egress will be mirrored (duplicated) to port `antrea-tc-tap2`. @@ -558,8 +562,14 @@ Table [EgressMark] is dedicated to the implementation of feature `Egress`. Consider the following Egresses as examples for the remainder of this document. -This is an Egress applied to Pods with the label `app: web`. For these Pods, all egress traffic will be SNAT'd on the -Node `k8s-node-control-plane` from which we dumped flows in the document with the Egress IP `192.168.77.112`. +### Egress Applied to Web Pods + +This is an Egress applied to Pods with the label `app: web`. For these Pods, all egress traffic (traffic leaving the +cluster) will be SNAT'd on the Node `k8s-node-control-plane` using Egress IP `192.168.77.112`. In this context, +`k8s-node-control-plane` is known as the "Egress Node" for this Egress resource. Note that the flows presented in the +rest of this document were dumped on Node `k8s-node-control-plane`. Egress flows are different on the "source Node" +(Node running a workload Pod to which the Egress resource is applied) and on the "Egress Node" (Node enforcing the +SNAT policy). ```yaml apiVersion: crd.antrea.io/v1beta1 @@ -576,8 +586,10 @@ status: egressNode: k8s-node-control-plane ``` +### Egress Applied to Client Pods + This is an Egress applied to Pods with the label `app: client`. For these Pods, all egress traffic will be SNAT'd on the -Node `k8s-node-worker-1` with the Egress IP `192.168.77.113`. +Node `k8s-node-worker-1` using Egress IP `192.168.77.113`. ```yaml apiVersion: crd.antrea.io/v1beta1 @@ -657,7 +669,7 @@ If you dump the flows of this table, you may see the following: Flow 1 is designed for case 1, matching ARP request packets for the MAC address of a remote Antrea gateway with IP address `10.10.1.1`. It programs an ARP reply packet and sends it back to the port where the request packet was received. Note -that both the source hardware address and the source MAC address in the ARP reply packet are set with the *Global Virtual +that both the source hardware address and the source MAC address in the ARP reply packet are set to the *Global Virtual MAC* `aa:bb:cc:dd:ee:ff`, not the actual MAC address of the remote Antrea gateway. This ensures that once the traffic is received by the remote OVS bridge, it can be directly forwarded to the appropriate Pod without actually going through the local Antrea gateway. 
The *Global Virtual MAC* is used as the destination MAC address for all the traffic being @@ -672,7 +684,7 @@ at the routing table for the local Node, we would find the following "onlink" ro A similar route is installed on the local Antrea gateway (antrea-gw0) interface every time the Antrea *Node Route Controller* is notified that a new Node has joined the cluster. The route must be marked as "onlink" since the kernel does not have -a route to the peer gateway `10.10.1.1`. we trick the kernel into believing that `10.10.1.1` is directly connected to +a route to the peer gateway `10.10.1.1`. We "trick" the kernel into believing that `10.10.1.1` is directly connected to the local Node, even though it is on the other side of the tunnel. Flow 2 is designed for case 2, ensuring that OVS handles the remainder of ARP traffic as a regular L2 learning switch @@ -682,15 +694,15 @@ Flow 3 is the table-miss flow, which should never be used since ARP packets will ### Classifier -This table is designed to determine the "category" of packets by matching the ingress port of the packets. It -addresses specific cases: +This table is designed to determine the "category" of IP packets by matching on their ingress port. It addresses +specific cases: 1. Packets originating from the local Node through the local Antrea gateway port, requiring IP spoof legitimacy verification. 2. Packets originating from the external network through the Antrea gateway port. 3. Packets received through an overlay tunnel. 4. Packets received through a return port defined in a user-provided TrafficControl CR (for feature `TrafficControl`). -5. Packets returned back from an application-aware engine through a specific port (for feature `L7NetworkPolicy`). +5. Packets returned from an application-aware engine through a specific port (for feature `L7NetworkPolicy`). 6. Packets originating from local Pods, requiring IP spoof legitimacy verification. If you dump the flows of this table, you may see the following: @@ -731,16 +743,17 @@ packets from the tunnel should be seamlessly forwarded to table [UnSNAT]. The fo and consumed in table [L3Forwarding]. Flow 4 is for case 4, matching packets from a TrafficControl return port and forwarding them to table [L3Forwarding] -to decide the egress port. It's important to note that both the source and destination MAC addresses of the packets have -been set to the expected state before redirecting the packets to the TrafficControl target port in table [Output]. The -only purpose of forwarding the packets to table [L3Forwarding] is to load tunnel destination IP for packets destined for -remote Nodes. This ensures that the returned packets destined for remote Nodes are forwarded through the tunnel. -`FromTCReturnRegMark` that will be used in table [TrafficControl] is loaded to mark the packet source. - -Flow 5 is for case 5, matching packets sent back from an application-aware engine through a specific port and forwarding -them to table [L3Forwarding] to decide the egress port. Like flow 4, the purpose of forwarding the packets to table -[L3Forwarding] is to load tunnel destination IP for packets destined for remote Nodes. `FromTCReturnRegMark` that will -be used in table [TrafficControl] is also loaded to mark the packet source. +to decide the egress port. 
It's important to note that a forwarding decision for these packets was already made before +redirecting them to the TrafficControl target port in table [Output], and at this point, the source and destination MAC +addresses of these packets have already been set to the correct values. The only purpose of forwarding the packets to +table [L3Forwarding] is to load the tunnel destination IP for packets destined for remote Nodes. This ensures that the +returned packets destined for remote Nodes are forwarded through the tunnel. `FromTCReturnRegMark`, which will be used +in table [TrafficControl], is loaded to mark the packet source. + +Flow 5 is for case 5, matching packets returned back from an application-aware engine through a specific port, stripping +the VLAN ID used by the application-aware engine, and forwarding them to table [L3Forwarding] to decide the egress port. +Like flow 4, the purpose of forwarding the packets to table [L3Forwarding] is to load the tunnel destination IP for +packets destined for remote Nodes, and `FromTCReturnRegMark` is also loaded. Flows 6-8 are for case 6, matching packets from local Pods and forwarding them to table [SpoofGuard] to do legitimacy verification. The following reg marks are loaded: @@ -753,10 +766,11 @@ Flow 9 is the table-miss flow to drop packets that are not matched by flows 1-8. ### SpoofGuard -This table is crafted to drop IP [spoofing](https://en.wikipedia.org/wiki/Spoofing_attack) from local Pods. It addresses -specific cases: +This table is crafted to prevent IP [spoofing](https://en.wikipedia.org/wiki/Spoofing_attack) from local Pods. It +addresses specific cases: -1. Allowing packets from the local Antrea gateway, where checks are not currently performed. +1. Allowing all packets from the local Antrea gateway. We do not perform checks for this interface as we need to accept + external traffic with a source IP address that does not match the gateway IP. 2. Ensuring that the source IP and MAC addresses are correct, i.e., matching the values configured on the interface when Antrea sets up networking for a Pod. @@ -770,17 +784,19 @@ If you dump the flows of this table, you may see the following: 5. table=SpoofGuard, priority=0 actions=drop ``` -Flow 1 is for case 1, matching packets received from the local Antrea gateway port without checking the source IP and MAC -address. There are some cases where the source IP of the packets through the local Antrea gateway port is not the local -Antrea gateway IP: +Flow 1 is for case 1, matching packets received on the local Antrea gateway port without checking the source IP and MAC +addresses. There are some cases where the source IP of the packets through the local Antrea gateway port is not the local +Antrea gateway IP address: - When Antrea is deployed with kube-proxy, and `AntreaProxy` is not enabled, packets from local Pods destined for Services - will first go through the gateway, get load-balanced by the kube-proxy data path (DNAT) then re-enter through the - gateway. Then the packets are received on the gateway port with a source IP belonging to a local Pod. + will first go through the gateway port, get load-balanced by the kube-proxy data path (undergo DNAT with a local Endpoint + selected by the kube-proxy) then re-enter through the gateway port. Then the packets are received on the gateway port + with a source IP belonging to a local Pod. 
- When Antrea is deployed without kube-proxy, and both `AntreaProxy` and `proxyAll` are enabled, packets from the external - network destined for Services will be routed to OVS through the gateway without changing the source IP. -- When Antrea is deployed with kube-proxy, and `AntreaProxy` is enabled, packets from the external network destined for - Services will get load-balanced by the kube-proxy data path (DNAT) and then routed to OVS through the gateway without SNAT. + network destined for Services will be routed to OVS through the gateway port without changing the source IP. +- When Antrea is deployed with kube-proxy, packets from the external network destined for Services whose + `externalTrafficPolicy` is set to `Local` will get load-balanced by the kube-proxy data path (undergo DNAT with a + local Endpoint selected by the kube-proxy) and then routed to OVS through the gateway without SNAT. Flows 2-4 are for case 2, matching legitimate IP packets from local Pods. @@ -788,9 +804,9 @@ Flow 5 is the table-miss flow to drop IP spoofing packets. ### UnSNAT -This table is used to perform `de-SNAT` on reply packets by invoking action `ct` on them. The packets are from SNAT'd -Service connections that have been committed with `SNATCtZone` in table [SNAT]. After invoking action `ct`, the packets -will be in a "tracked" state, restoring all [connection tracking +This table is used to undo SNAT on reply packets by invoking action `ct` on them. The packets are from SNAT'd Service +connections that have been committed to `SNATCtZone` in table [SNAT]. After invoking action `ct`, the packets will be +in a "tracked" state, restoring all [connection tracking fields](https://www.openvswitch.org/support/dist-docs/ovs-fields.7.txt) (such as `ct_state`, `ct_mark`, `ct_label`, etc.) to their original values. The packets with a "tracked" state are then forwarded to table [ConntrackZone]. @@ -802,20 +818,19 @@ If you dump the flows of this table, you may see the following: 3. table=UnSNAT, priority=0 actions=goto_table:ConntrackZone ``` -Flow 1 matches reply packets for Service connections where they were SNAT'd with the *Virtual Service IP* `169.254.0.253` -and invokes action `ct` on them. For the packets, the destination IP of them is the *Virtual Service IP*. +Flow 1 matches reply packets for Service connections which were SNAT'd with the *Virtual Service IP* `169.254.0.253` +and invokes action `ct` on them. -Flow 2 matches packets for Service connections where they were SNAT'd with the local Antrea gateway IP `10.10.0.1` and -invokes action `ct` on them. For the packets, the destination IP of them is the local Antrea gateway IP. This flow also -matches request packets destined for the local Antrea gateway IP from local Pods by accident. However, this is harmless -since such connections will never be committed with `SNATCtZone`, and therefore, connection tracking fields for the -packets are unset. +Flow 2 matches packets for Service connections which were SNAT'd with the local Antrea gateway IP `10.10.0.1` and +invokes action `ct` on them. This flow also matches request packets destined for the local Antrea gateway IP from +local Pods by accident. However, this is harmless since such connections will never be committed to `SNATCtZone`, and +therefore, connection tracking fields for the packets are unset. Flow 3 is the table-miss flow. 
For reply packets from SNAT'd connections, whose destination IP is the translated SNAT IP, after invoking action `ct`, -the destination IP of the packets will be restored to the original IP before SNAT is stored in the connection tracking -field `ct_nw_dst`. +the destination IP of the packets will be restored to the original IP stored in the connection tracking field `ct_nw_dst` +before SNAT. ### ConntrackZone @@ -845,9 +860,9 @@ This table handles packets from the connections that have a "tracked" state asso specific cases: 1. Dropping invalid packets reported by conntrack. -2. Forwarding tracked sequencing packets from all connections to table [AntreaPolicyEgressRule] directly, bypassing the - tables like [PreRoutingClassifier], [NodePortMark], [SessionAffinity], [ServiceLB], and [EndpointDNAT] for Service - Endpoint selection. +2. Forwarding tracked packets from all connections to table [AntreaPolicyEgressRule] directly, bypassing the tables + like [PreRoutingClassifier], [NodePortMark], [SessionAffinity], [ServiceLB], and [EndpointDNAT] for Service Endpoint + selection. 3. Forwarding packets from new connections to table [PreRoutingClassifier] to start Service Endpoint selection since Service connections are not identified at this stage. @@ -888,7 +903,7 @@ If you dump the flows of this table, you may see the following: ``` Flow 1 sequentially resubmits packets to tables [NodePortMark], [SessionAffinity], and [ServiceLB]. Note that packets -are forwarded to table [ServiceLB] finally. In tables [NodePortMark] and [SessionAffinity], only reg marks are loaded. +are ultimately forwarded to table [ServiceLB]. In tables [NodePortMark] and [SessionAffinity], only reg marks are loaded. Flow 2 is the table-miss flow that should remain unused. @@ -900,19 +915,20 @@ enabled. If you dump the flows of this table, you may see the following: ```text -1. table=NodePortMark, priority=200,ip,nw_dst=10.176.25.100 actions=set_field:0x80000/0x80000->reg4 -2. table=NodePortMark, priority=200,ip,nw_dst=192.168.77.102 actions=set_field:0x80000/0x80000->reg4 -3. table=NodePortMark, priority=200,ip,nw_dst=169.254.0.252 actions=set_field:0x80000/0x80000->reg4 -4. table=NodePortMark, priority=0 actions=goto_table:SessionAffinity +1. table=NodePortMark, priority=200,ip,nw_dst=192.168.77.102 actions=set_field:0x80000/0x80000->reg4 +2. table=NodePortMark, priority=200,ip,nw_dst=169.254.0.252 actions=set_field:0x80000/0x80000->reg4 +3. table=NodePortMark, priority=0 actions=goto_table:SessionAffinity ``` -Flows 1-2 match packets destined for the local Node from local Pods. `NodePortRegMark` is loaded, indicating that the -packets are potentially destined for NodePort Services. +Flow 1 matches packets destined for the local Node from local Pods. `NodePortRegMark` is loaded, indicating that the +packets are potentially destined for NodePort Services. We assume only one valid IP address, `192.168.77.102`, can serve +as the host IP address for NodePort based on the option `antreaProxy.nodePortAddresses`. If there are multiple valid IP +addresses specified in the option, a flow similar to flow 1 will be installed for each IP address. -Flow 3 match packets destined for the *Virtual NodePort DNAT IP*. Packets destined for NodePort Services from the local +Flow 2 match packets destined for the *Virtual NodePort DNAT IP*. Packets destined for NodePort Services from the local Node or the external network is DNAT'd to the *Virtual NodePort DNAT IP* by iptables before entering the pipeline. -Flow 4 is the table-miss flow. 
+Flow 3 is the table-miss flow. Note that packets of NodePort Services have not been identified in this table by matching destination IP address. The identification of NodePort Services will be done finally in table [ServiceLB] by matching `NodePortRegMark` and the @@ -934,14 +950,15 @@ If you dump the flows of this table, you may see the following: Flow 1 is a learned flow generated by flow 3 in table [ServiceLB], designed for the sample Service [ClusterIP with Session Affinity], to implement Service session affinity. Here are some details about the flow: -- The hard timeout of the learned flow should be equal to the value of - `service.spec.sessionAffinityConfig.clientIP.timeoutSeconds` defined in the Service. This means that during the hard - timeout, this flow is present in the pipeline, and the session affinity of the Service takes effect during the timeout. -- Source IP address, destination IP address, destination port, and transparent protocol are used to match packets of - connections sourced from the same client and destined for the Service during the timeout. +- The "hard timeout" of the learned flow should be equal to the value of + `service.spec.sessionAffinityConfig.clientIP.timeoutSeconds` defined in the Service. This means that until the hard + timeout expires, this flow is present in the pipeline, and the session affinity of the Service takes effect. Unlike an + "idle timeout", the "hard timeout" does not reset whenever the flow is matched. +- Source IP address, destination IP address, destination port, and transport protocol are used to match packets of + connections sourced from the same client and destined for the Service during the affinity time window. - Endpoint IP address and Endpoint port are loaded into `EndpointIPField` and `EndpointPortField` respectively. -- `EpSelectedRegMark` is loaded, indicating that the Service Endpoint selection is done, and then the packets will - be only matched by the last flow in table [ServiceLB]. +- `EpSelectedRegMark` is loaded, indicating that the Service Endpoint selection is done, and ensuring that the packets + will only match the last flow in table [ServiceLB]. - `RewriteMACRegMark`, which will be consumed in table [L3Forwarding], is loaded here, indicating that the source and destination MAC addresses of the packets should be rewritten. @@ -957,7 +974,7 @@ This table is used to implement Service Endpoint selection. It addresses specifi 2. NodePort, as demonstrated in the example [NodePort]. 3. LoadBalancer, as demonstrated in the example [LoadBalancer]. 4. Service configured with external IPs, as demonstrated in the example [Service with ExternalIP]. -5. Service configured with session affinity, as demonstrated in the example [Service with Session Affinity]. +5. Service configured with session affinity, as demonstrated in the example [Service with session affinity]. 6. Service configured with externalTrafficPolicy to `Local`, as demonstrated in the example [Service with ExternalTrafficPolicy Local]. @@ -978,15 +995,13 @@ If you dump the flows of this table, you may see the following: 10. table=ServiceLB, priority=0 actions=goto_table:EndpointDNAT ``` -Flow 1 or flow 2 is designed for case 1, matching the first packet of connections destined for the sample [ClusterIP +Flow 1 and flow 2 are designed for case 1, matching the first packet of connections destined for the sample [ClusterIP without Endpoint] or [ClusterIP]. This is achieved by matching `EpToSelectRegMark` loaded in table [SessionAffinity], clusterIP, and port. 
The target of the packet matched by the flow is an OVS group where the Endpoint will be selected. -Before forwarding the packet to the OVS group, `RewriteMACRegMark` that will be consumed in table [L3Forwarding] is +Before forwarding the packet to the OVS group, `RewriteMACRegMark`, which will be consumed in table [L3Forwarding], is loaded, indicating that the source and destination MAC addresses of the packets should be rewritten. `EpSelectedRegMark` -that will be consumed in table [EndpointDNAT] is also loaded, indicating that the Endpoint is selected. Note that the -Service Endpoint selection is not completed yet, as it will be done in the target OVS group. The action is set here to -support more Endpoints in an OVS group. Refer to PR [#2101](https://github.com/antrea-io/antrea/pull/2101) for more -information. +, which will be consumed in table [EndpointDNAT], is also loaded, indicating that the Endpoint is selected. Note that the +Service Endpoint selection is not completed yet, as it will be done in the target OVS group. Flow 3 is for case 2, matching the first packet of connections destined for the sample [NodePort]. This is achieved by matching `EpToSelectRegMark` loaded in table [SessionAffinity], `NodePortRegMark` loaded in table [NodePortMark], and @@ -1003,23 +1018,23 @@ Session Affinity]. This is achieved by matching the conditions similar to flow 1 also an OVS group, and `RewriteMACRegMark` is loaded. The difference is that `EpToLearnRegMark` is loaded, rather than `EpSelectedRegMark`, indicating that the selected Endpoint needs to be cached. -Flow 7 is the final process for case 5, matching the packet previously matched by flow 6, sent back from the target OVS +Flow 7 is the final process for case 5, matching the packet previously matched by flow 6, resubmitted back from the target OVS group after selecting an Endpoint. Then a learned flow will be generated in table [SessionAffinity] to match the packets of the subsequent connections from the same client IP, ensuring that the packets are always forwarded to the same Endpoint -selected the first time. `EpSelectedRegMark` that will be consumed in table [EndpointDNAT] is loaded, indicating that +selected the first time. `EpSelectedRegMark`, which will be consumed in table [EndpointDNAT], is loaded, indicating that Service Endpoint selection has been done. -Flow 8 and flow 9 are for case 6. Flow 8 has the higher priority than that of flow 9, prioritizing matching the first -packet of connection sourced from a local Pod or the local Node with `FromLocalRegMark` loaded in table [Classifier] +Flow 8 and flow 9 are for case 6. Flow 8 has higher priority than flow 9, prioritizing matching the first +packet of connections sourced from a local Pod or the local Node with `FromLocalRegMark` loaded in table [Classifier] and destined for the sample [Service with ExternalTrafficPolicy Local]. The target of flow 8 is an OVS group that has all the Endpoints across the cluster, ensuring accessibility for Service connections originating from local Pods or -Nodes, regardless that `externalTrafficPolicy` of the Service is `Local`. Due to the existence of flow 8, consequently, +Nodes, even though `externalTrafficPolicy` is set to `Local` for the Service. Due to the existence of flow 8, consequently, flow 9 exclusively matches packets sourced from the external network, resembling the pattern of flow 1. The target of flow 9 is an OVS group that has only the local Endpoints since `externalTrafficPolicy` of the Service is `Local`. 
Flow 10 is the table-miss flow. -As mentioned above, the Service Endpoint selection is performed within OVS groups. 3 typical OVS groups are list below: +As mentioned above, the Service Endpoint selection is performed within OVS groups. 3 typical OVS groups are listed below: ```text 1. group_id=9,type=select,\ @@ -1041,16 +1056,16 @@ Endpoints. The group has 2 buckets, indicating the availability of 2 selectable chance of being chosen since they have the same weights. For every bucket, the Endpoint IP and Endpoint port are loaded into `EndpointIPField` and `EndpointPortField`, respectively. These loaded values will be consumed in table [EndpointDNAT] to which the packets are forwarded and in which DNAT will be performed. `RemoteEndpointRegMark` is loaded -for remote Endpoints, like bucket with `bucket_id` 1 in this group. +for remote Endpoints, like the bucket with `bucket_id` 1 in this group. The third group with `group_id` 11 is the destination of packets matched by flow 6, designed for a Service that has Endpoints and is configured with session affinity. The group closely resembles the group with `group_id` 10, except that -the destination of the packets is table [ServiceLB], rather than table [EndpointDNAT]. After being sent back to table +the destination of the packets is table [ServiceLB], rather than table [EndpointDNAT]. After being resubmitted back to table [ServiceLB], they will be matched by flow 7. ### EndpointDNAT -The table implements DNAT for Service connection after Endpoint selection is performed in table [ServiceLB]. +The table implements DNAT for Service connections after Endpoint selection is performed in table [ServiceLB]. If you dump the flows of this table, you may see the following:: @@ -1064,7 +1079,7 @@ If you dump the flows of this table, you may see the following:: Flow 1 is designed for Services without Endpoints. It identifies the first packet of connections destined for such Service by matching `SvcNoEpRegMark`. Subsequently, the packet is forwarded to the OpenFlow controller (Antrea Agent). For TCP -Service traffic, the controller will send a TCP RST, and for all other cases the controller will an ICMP Destination +Service traffic, the controller will send a TCP RST, and for all other cases the controller will send an ICMP Destination Unreachable message. Flows 2-3 are designed for Services that have selected an Endpoint. These flows identify the first packet of connections @@ -1078,6 +1093,10 @@ Some bits of ct mark are persisted: - The value of `PktSourceField` is persisted to `ConnSourceCTMarkField`, storing the source of the connection for the current packet and subsequent packets of the connection. +Flow 4 is to resubmit the packets which are not matched by flows 1-3 back to table [ServiceLB] to select Endpoint again. + +Flow 5 is the table-miss flow to match non-Service packets. + ### AntreaPolicyEgressRule This table is used to implement the egress rules across all Antrea-native NetworkPolicies, except for NetworkPolicies @@ -1085,8 +1104,8 @@ that are created in the Baseline Tier. Antrea-native NetworkPolicies created in K8s NetworkPolicies and their egress rules are installed in tables [EgressDefaultRule] and [EgressRule] respectively, i.e. 
```text -K8s NetworkPolicy -> EgressRule Antrea-native NetworkPolicy other Tiers -> AntreaPolicyEgressRule +K8s NetworkPolicy -> EgressRule Antrea-native NetworkPolicy Baseline Tier -> EgressDefaultRule ``` @@ -1101,7 +1120,7 @@ custom allocator, which is common to all tables that can have NetworkPolicy flow For this table, you will need to keep in mind the Antrea-native NetworkPolicy [specification](#antrea-native-networkpolicy-implementation). Since the sample egress policy resides in the Application -Tier. If you dump the flows of this table, you may see the following: +Tie, if you dump the flows of this table, you may see the following: ```text 1. table=AntreaPolicyEgressRule, priority=64990,ct_state=-new+est,ip actions=goto_table:EgressMetric @@ -1116,8 +1135,8 @@ Tier. If you dump the flows of this table, you may see the following: 10. table=AntreaPolicyEgressRule, priority=0 actions=goto_table:EgressRule ``` -Flows 1-2, which are installed by default with the highest priority, matching non-new and "tracked" packets and -forwarding them to table [EgressMetric] to bypass the check from egress rules. This means that if a connection is +Flows 1-2, which are installed by default with the highest priority, match non-new and "tracked" packets and +forward them to table [EgressMetric] to bypass the check from egress rules. This means that if a connection is established, its packets go straight to table [EgressMetric], with no other match required. In particular, this ensures that reply traffic is never dropped because of an Antrea-native NetworkPolicy or K8s NetworkPolicy rule. However, this also means that ongoing connections are not affected if the Antrea-native NetworkPolicy or the K8s NetworkPolicy is @@ -1141,16 +1160,18 @@ flows are described as follows: - Flow 5 is used to match packets with the destination TCP port in set {3306} specified in the rule, constituting the third dimension for `conjunction` with `conj_id` 7. - Flow 6 is used to match packets meeting all the three dimensions of `conjunction` with `conj_id` 7 and forward them - to table [EgressMetric], persisting `conj_id` to `EgressRuleCTLabel` that is consumed in table [EgressMetric]. + to table [EgressMetric], persisting `conj_id` to `EgressRuleCTLabel` that will be consumed in table [EgressMetric]. Flows 7-9, whose priorities are all 14499, are installed for the egress rule with a `Drop` action defined after the rule -`AllowToDB` in the sample policy, serves as a default rule. Unlike the default of K8s NetworkPolicy, Antrea-native -NetworkPolicy has no default rule, and all rules should be explicitly defined. Hence, they are evaluated as-is, and -there is no need for a table [AntreaPolicyEgressDefaultRule]. These flows are described as follows: +`AllowToDB` in the sample policy, and serves as a default rule. Antrea-native NetworkPolicy does not have the same +default isolated behavior as K8s NetworkPolicy (implemented in the [EgressDefaultRule] table). As soon as a rule is +matched, we apply the corresponding action. If no rule is matched, there is no implicit drop for Pods to which an +Antrea-native NetworkPolicy applies. These flows are described as follows: - Flow 7 is used to match packets with the source IP address in set {10.10.0.24}, which is from the Pods selected by the label `app: web`, constituting the first dimension for `conjunction` with `conj_id` 5. -- Flow 8 is used to match any packets, constituting the second dimension for `conjunction` with `conj_id` 5. 
+- Flow 8 is used to match any IP packets, constituting the second dimension for `conjunction` with `conj_id` 5. This + flow, which matches all IP packets, exists because we need at least 2 dimensions for a conjunctive match. - Flow 9 is used to match packets meeting both dimensions of `conjunction` with `conj_id` 5. `APDenyRegMark` is loaded and will be consumed in table [EgressMetric] to which the packets are forwarded. @@ -1175,9 +1196,9 @@ you may see the following: Flows 1-4 are installed for the egress rule in the sample K8s NetworkPolicy. These flows are described as follows: - Flow 1 is to match packets with the source IP address in set {10.10.0.24}, which has all IP addresses of the Pods - selected by the label `app: web`, constituting the first dimension for `conjunction` with `conj_id` 2. + selected by the label `app: web` in the `default` Namespace, constituting the first dimension for `conjunction` with `conj_id` 2. - Flow 2 is to match packets with the destination IP address in set {10.10.0.25}, which has all IP addresses of the Pods - selected by the label `app: db`, constituting the second dimension for `conjunction` with `conj_id` 2. + selected by the label `app: db` in the `default` Namespace, constituting the second dimension for `conjunction` with `conj_id` 2. - Flow 3 is to match packets with the destination TCP port in set {3306} specified in the rule, constituting the third dimension for `conjunction` with `conj_id` 2. - Flow 4 is to match packets meeting all the three dimensions of `conjunction` with `conj_id` 2 and forward them to @@ -1188,10 +1209,10 @@ Flow 5 is the table-miss flow to forward packets not matched by other flows to t ### EgressDefaultRule This table complements table [EgressRule] for K8s NetworkPolicy egress rule implementation. When a NetworkPolicy is -applied to a set of Pods, and the default behavior for these Pods becomes "deny" (they become [isolated +applied to a set of Pods, then the default behavior for egress connections for these Pods becomes "deny" (they become [isolated Pods](https://kubernetes.io/docs/concepts/services-networking/network-policies/#isolated-and-non-isolated-pods)). This table is in charge of dropping traffic originating from Pods to which a NetworkPolicy (with an egress rule) is -applied, and which did not match any of the allowed list rules. +applied, and which did not match any of the "allowed" list rules. If you dump the flows of this table, you may see the following: @@ -1209,7 +1230,7 @@ Flow 2 is the table-miss flow to forward packets to table [EgressMetric]. This table is also used to implement Antrea-native NetworkPolicy egress rules that are created in the Baseline Tier. Since the Baseline Tier is meant to be enforced after K8s NetworkPolicies, the corresponding flows will be created at a lower priority than K8s NetworkPolicy default drop flows. These flows are similar to flows 3-9 in table -[AntreaPolicyEgressRule]. +[AntreaPolicyEgressRule]. For the sake of simplicity, we have not defined any example Baseline policies in this document. ### EgressMetric @@ -1237,6 +1258,9 @@ Flow 5 serves as the drop rule for the sample Antrea-native NetworkPolicy egress `APDenyRegMark` loaded in table [AntreaPolicyEgressRule] flow 9 and `APConjIDField` set to 5 which is the `conj_id` allocated the egress rule and loaded in table [AntreaPolicyEgressRule] flow 9. +These flows have no explicit action besides the `goto_table` action. 
This is because we rely on the "implicit" flow +counters to keep track of connection / packet statistics. + Ct label is used in flows 1-4, while reg is used in flow 5. The distinction lies in the fact that the value persisted in the ct label can be read throughout the entire lifecycle of a connection, but the reg mark is only valid for the current packet. For a connection permitted by a rule, all its packets should be collected for metrics, thus a ct label is used. @@ -1276,13 +1300,13 @@ Flow 2 matches reply packets with corresponding ct "tracked" states and `FromGat through the local Antrea gateway. In other words, these are connections for which the first packet of the connection (SYN packet for TCP) was received through the local Antrea gateway. It rewrites the destination MAC address to that of the local Antrea gateway, loads `ToGatewayRegMark`, and forwards them to table [L3DecTTL]. This ensures that -reply packets can be forwarded back to the local Antrea gateway in subsequent tables, guaranteeing the availability -of the connection. This flow is required to handle the following cases when AntreaProxy is not enabled: +reply packets can be forwarded back to the local Antrea gateway in subsequent tables. This flow is required to handle +the following cases when AntreaProxy is not enabled: - Reply traffic for connections from a local Pod to a ClusterIP Service, which are handled by kube-proxy and go through DNAT. In this case, the destination IP address of the reply traffic is the Pod which initiated the connection to the - Service (no SNAT by kube-proxy). These packets should sent back to the local Antrea gateway to the third-party module - to complete the DNAT processes, e.g., kube-proxy. The destination MAC of the packets are rewritten in the table to + Service (no SNAT by kube-proxy). These packets should be forwarded back to the local Antrea gateway to the third-party module + to complete the DNAT processes, e.g., kube-proxy. The destination MAC of the packets is rewritten in the table to avoid it is forwarded to the original client Pod by mistake. - When hairpin is involved, i.e. connections between 2 local Pods, for which NAT is performed. One example is a Pod accessing a NodePort Service for which externalTrafficPolicy is set to `Local` using the local Node's IP address, @@ -1296,15 +1320,15 @@ not traversing any router device or undergoing NAT process. For packets from Ser `RewriteMACRegMark`, mutually exclusive with `NotRewriteMACRegMark`, is loaded. Therefore, the packets will not be matched by the flow. -Flow 4 is designed to match packets destined for remote Pod CIDR. This involves installing a separate flow for each remote +Flow 4 is designed to match packets destined for a remote Pod CIDR. This involves installing a separate flow for each remote Node, with each flow matching the destination IP address of the packets against the Pod subnet for the respective Node. For the matched packets, the source MAC address is set to that of the local Antrea gateway MAC, and the destination MAC address is set to the *Global Virtual MAC*. The Openflow `tun_dst` field is set to the appropriate value (i.e. -the IP address of the remote Node IP). Additionally, `ToTunnelRegMark` is loaded, signifying that the packets will be +the IP address of the remote Node). Additionally, `ToTunnelRegMark` is loaded, signifying that the packets will be forwarded to remote Nodes through a tunnel. The matched packets are then forwarded to table [L3DecTTL] to decrease the TTL value. 
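The exact per-Node flows are not reproduced in this document, but their shape is roughly the following. This is a simplified sketch rather than an actual flow dump: the gateway MAC, the remote Node Pod CIDR, the remote Node transport IP, and the reg mark encoding are placeholders, and `aa:bb:cc:dd:ee:ff` is the *Global Virtual MAC* introduced earlier:

```text
table=L3Forwarding, ip,nw_dst=<remote Node Pod CIDR>
    actions=set_field:<local Antrea gateway MAC>->eth_src,
            set_field:aa:bb:cc:dd:ee:ff->eth_dst,
            set_field:<transport IP of the remote Node>->tun_dst,
            <load ToTunnelRegMark>,
            goto_table:L3DecTTL
```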
-Flow 5-7 matches packets destined for local Pods and marked by `RewriteMACRegMark` that signifies that the packets may +Flow 5-7 matches packets destined for local Pods and marked by `RewriteMACRegMark`, which signifies that the packets may originate from Service or inter-Node connections. For the matched packets, the source MAC address is set to that of the local Antrea gateway MAC, and the destination MAC address is set to the associated local Pod MAC address. The matched packets are then forwarded to table [L3DecTTL] to decrease the TTL value. @@ -1312,7 +1336,7 @@ packets are then forwarded to table [L3DecTTL] to decrease the TTL value. Flow 8 matches request packets originating from local Pods and destined for the external network, and then forwards them to table [EgressMark] dedicated to feature `Egress`. In table [EgressMark], SNAT IPs for Egress are looked up for the packets. To match the expected packets, `FromPodRegMark` is used to exclude packets that are not from local Pods. -Additionally, `NotAntreaFlexibleIPAMRegMark`, mutually exclusive with `AntreaFlexibleIPAMRegMark` that is used to mark +Additionally, `NotAntreaFlexibleIPAMRegMark`, mutually exclusive with `AntreaFlexibleIPAMRegMark` which is used to mark packets from Antrea IPAM Pods, is used since Egress can only be applied to Node IPAM Pods. Flow 9 matches request packets originating from remote Pods and destined for the external network, and then forwards them @@ -1323,11 +1347,11 @@ MAC address of the local Antrea gateway. Flow 10 matches packets from Service connections that are originating from the local Antrea gateway and destined for the external network. This is accomplished by matching `RewriteMACRegMark`, `FromGatewayRegMark`, and `ServiceCTMark`. The -destination MAC address is then set to that of the local Antrea gateway. Additionally, `ToGatewayRegMark` that will be -used with `FromGatewayRegMark` together to identify hairpin connections in table [SNATMark] is loaded. Finally, +destination MAC address is then set to that of the local Antrea gateway. Additionally, `ToGatewayRegMark`, which will be +used with `FromGatewayRegMark` together to identify hairpin connections in table [SNATMark], is loaded. Finally, the packets are forwarded to table [L3DecTTL]. -Flow 11 is the table-miss flow, matching packets originating from local Pods and destined for the external network, and +Flow 11 is the table-miss flow, and is used for packets originating from local Pods and destined for the external network, and then forwarding them to table [L2ForwardingCalc]. `ToGatewayRegMark` is loaded as the matched packets traverse the local Antrea gateway. @@ -1350,30 +1374,31 @@ If you dump the flows of this table, you may see the following: ``` Flows 1-2 match packets originating from local Pods and destined for the transport IP of remote Nodes, and then forward -them to table [L2ForwardingCalc] to skip Egress SNAT. `ToGatewayRegMark` is loaded, indicating that the output port of -the packets is the local Antrea gateway. +them to table [L2ForwardingCalc] to bypass the Pod-to-Node traffic from Egress SNAT. `ToGatewayRegMark` is loaded, +indicating that the output port of the packets is the local Antrea gateway. Flow 3 matches packets originating from local Pods and destined for the Service CIDR, and then forwards them to table -[L2ForwardingCalc] to skip Egress SNAT. Similar to flows 1-2, `ToGatewayRegMark` is also loaded. +[L2ForwardingCalc] to bypass the Pod-to-Service traffic from Egress SNAT. 
Similar to flows 1-2, `ToGatewayRegMark` is +also loaded. -Flow 4 match packets originating from local Pods selected by the sample Egress `egress-client`, whose SNAT IP is configured +Flow 4 match packets originating from local Pods selected by the sample [Egress egress-client], whose SNAT IP is configured on a remote Node, which means that the matched packets should be forwarded to the remote Node through a tunnel. Before sending the packets to the tunnel, the source and destination MAC addresses are set to the local Antrea gateway MAC and the *Global Virtual MAC* respectively. Additionally, `ToTunnelRegMark`, indicating that the output port is a tunnel, and `EgressSNATRegMark`, indicating that packets should undergo SNAT on a remote Node, are loaded. Finally, the packets are forwarded to table [L2ForwardingCalc]. -Flow 5 matches the first packet of connections originating from remote Pods selected by the sample Egress `egress-web` +Flow 5 matches the first packet of connections originating from remote Pods selected by the sample [Egress egress-web] whose SNAT IP is configured on the local Node, and then loads an 8-bit ID allocated for the associated SNAT IP defined -in the sample Egress to the `pkt_mark`, which will be identified by iptables on the local Node to perform SNAT with the +in the sample Egress to the `pkt_mark`, which will be consumed by iptables on the local Node to perform SNAT with the SNAT IP. Subsequently, `ToGatewayRegMark`, indicating that the output port is the local Antrea gateway, is loaded. Finally, the packets are forwarded to table [L2ForwardingCalc]. -Flow 6 matches the first packet of connections originating from local Pods selected by the sample Egress `egress-web`, +Flow 6 matches the first packet of connections originating from local Pods selected by the sample [Egress egress-web], whose SNAT IP is configured on the local Node. Similar to flow 4, the 8-bit ID allocated for the SNAT IP is loaded to `pkt_mark`, `ToGatewayRegMark` is loaded, and the packets are forwarded to table [L2ForwardingCalc] finally. -Flow 7 drops packets tunneled from remote Nodes (identified with `FromTunnelRegMark`, indicating that the packets are +Flow 7 drops all other packets tunneled from remote Nodes (identified with `FromTunnelRegMark`, indicating that the packets are from remote Pods through a tunnel). The packets are not matched by any flows 1-6, which means that they are here unexpected and should be dropped. @@ -1427,8 +1452,11 @@ packets to table [SNAT], `ToExternalAddressRegMark` and `NotDSRServiceRegMark` a are destined for a Service's external IP, like NodePort, LoadBalancerIP or ExternalIP, but it is not DSR mode. Additionally, `ConnSNATCTMark`, indicating that the connection requires SNAT, is persisted to mark the connections. -Flow 3-4 match the first packet of hairpin Service connections, identified by the same source and destination IP -addresses. Such hairpin connections will undergo with the IP address of the local Antrea gateway in table [SNAT]. +It's worthy to note that flows 1-2 are specific to `proxyAll`, but it is harmless when `proxyAll` is disabled since +these flows should be never matched by in-cluster Service traffic. + +Flow 3-4 match the first packet of hairpin Service connections, identified by the same source and destination Pod IP +addresses. Such hairpin connections will undergo SNAT with the IP address of the local Antrea gateway in table [SNAT]. Similar to flow 1, `ConnSNATCTMark` and `HairpinCTMark` are persisted to mark the connections. 
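To make the hairpin case handled by flows 3-4 more concrete, here is a simplified walkthrough of a connection from the sample web Pod (IP `10.10.0.24`) to a ClusterIP Service for which the selected Endpoint happens to be that same Pod. The Service IP and port are placeholders, and only the relevant tables are listed:

```text
10.10.0.24 -> <ClusterIP>:<port>
  ServiceLB / EndpointDNAT : DNAT to Endpoint 10.10.0.24 (the Endpoint is the client Pod itself)
  SNATMark (flows 3-4)     : nw_src == nw_dst, commit with ConnSNATCTMark + HairpinCTMark
  SNAT                     : SNAT to the local Antrea gateway IP 10.10.0.1 in SNATCtZone
  Output                   : HairpinCTMark matched, action IN_PORT (output back to the Pod's own port)
```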
Flow 5 is the table-miss flow. @@ -1458,8 +1486,8 @@ Flow 2 matches the first packet of hairpin Service connection originating from l and `FromPodRegMark`. It performs SNAT with the IP address of the local Antrea gateway and forwards the SNAT'd packets to table [L2ForwardingCalc]. Similar to flow 1, `ServiceCTMark` and `HairpinCTMark` are persisted in `SNATCtZone`. -Flow 3 matches the subsequent request packets of connection whose first request packet has been performed SNAT and then -invoke `ct` action on the packets again to restore the "tracked" state in `SNATCtZone`. The packets with the appropriate +Flow 3 matches the subsequent request packets of connections for which SNAT was performed for the first packet, and then +invokes `ct` action on the packets again to restore the "tracked" state in `SNATCtZone`. The packets with the appropriate "tracked" state are forwarded to table [L2ForwardingCalc]. Flow 4 matches the first packet of Service connections requiring SNAT, identified by `ConnSNATCTMark` and @@ -1488,14 +1516,14 @@ If you dump the flows of this table, you may see the following: Flow 1 matches packets destined for the local Antrea gateway, identified by the destination MAC address being that of the local Antrea gateway. It loads `OutputToOFPortRegMark`, indicating that the packets should output to an OVS port, -and also loads port number of the local Antrea gateway to `TargetOFPortField`. Both of these two values will be consumed +and also loads the port number of the local Antrea gateway to `TargetOFPortField`. Both of these two values will be consumed in table [Output]. Flow 2 matches packets destined for a tunnel, identified by the destination MAC address being that of the *Global Virtual MAC*. Similar to flow 1, `OutputToOFPortRegMark` is loaded, and the port number of the tunnel is loaded to `TargetOFPortField`. -Flows 3-5 match packets destined for local Pods, identified by the destination MAC address being that of the local +Flows 3-5 match packets destined for local Pods, identified by the destination MAC address being that of one of the local Pods. Similar to flow 1, `OutputToOFPortRegMark` is loaded, and the port number of the local Pods is loaded to `TargetOFPortField`. @@ -1519,23 +1547,17 @@ If you dump the flows of this table, you may see the following: Flow 1 matches packets returned from TrafficControl return ports and forwards them to table [Output], where the packets are output to the port to which they are destined. To identify such packets, `OutputToOFPortRegMark`, indicating that the packets should be output to an OVS port, and `FromTCReturnRegMark` loaded in table [Classifier], indicating that -the packets are from a TrafficControl return port, are utilized. +the packets are from a TrafficControl return port, are used. -Flow 2 is installed for the sample TrafficControl `redirect-web-to-local`, which marks the packets destined for the Pods -labeled by `app: web` with `TrafficControlRedirectRegMark`, indicating the packets should be redirected to a -TrafficControl target port whose number is loaded to `TrafficControlTargetOFPortField`. +Flows 2-3 are installed for the sample [TrafficControl redirect-web-to-local] to mark the packets associated with the +Pods labeled by `app: web` using `TrafficControlRedirectRegMark`. Flow 2 handles the ingress direction, while flow 3 +handles the egress direction. 
In table [Output], these packets will be redirected to a TrafficControl target port +specified in `TrafficControlTargetOFPortField`, of which value is loaded in these 2 flows. -Flow 3 is also installed for the sample TrafficControl `redirect-web-to-local`. Similar to flow 2, -`TrafficControlRedirectRegMark` is loaded and the TrafficControl target port whose number is loaded to -`TrafficControlTargetOFPortField`. - -Flow 4 is installed for the sample TrafficControl `mirror-db-to-local`, which marks the packets destined for the Pods -labeled by `app: db` with `TrafficControlMirrorRegMark`, indicating the packets should be mirrored to a -TrafficControl target port whose number is loaded to `TrafficControlTargetOFPortField`. - -Flow 5 is also installed for the sample TrafficControl `redirect-web-to-local`. Similar to flow 2, -`TrafficControlRedirectRegMark` is loaded and the TrafficControl target port whose number is loaded to -`TrafficControlTargetOFPortField`. +Flows 4-5 are installed for the sample [TrafficControl mirror-db-to-local] to mark the packets associated with the Pods +labeled by `app: db` using `TrafficControlMirrorRegMark`. Similar to flows 2-3, flows 4-5 also handles the two directions. +In table [Output], these packets will be mirrored (duplicated) to a TrafficControl target port specified in +`TrafficControlTargetOFPortField`, of which value is loaded in these 2 flows. Flow 6 is the table-miss flow. @@ -1555,8 +1577,9 @@ If you dump the flows of this table, you may see the following: 7. table=IngressSecurityClassifier, priority=0 actions=goto_table:AntreaPolicyIngressRule ``` -Flow 1 matches locally generated request packets, identified by `pkt_mark` which is set by iptables in the host network -namespace. It forwards the packets to table [ConntrackCommit] directly to bypass all tables for ingress security. +Flow 1 matches locally generated request packets for liveness/readiness probes from kubelet, identified by `pkt_mark` +which is set by iptables in the host network namespace. It forwards the packets to table [ConntrackCommit] directly to +bypass all tables for ingress security. Flow 2 matches packets destined for NodePort Services and forwards them to table [AntreaPolicyIngressRule] to enforce Antrea-native NetworkPolicies applied to NodePort Services. Without this flow, if the selected Endpoint is not a local @@ -1579,8 +1602,8 @@ NetworkPolicies. Depending on the tier to which the policy belongs, the rules wi to that tier. The ingress table to tier mappings is as follows: ```text -K8s NetworkPolicy -> IngressRule Antrea-native NetworkPolicy other Tiers -> AntreaPolicyIngressRule +K8s NetworkPolicy -> IngressRule Antrea-native NetworkPolicy Baseline Tier -> IngressDefaultRule ``` @@ -1606,8 +1629,8 @@ ingress policies reside in the Application Tier, if you dump the flows for this 14. table=AntreaPolicyIngressRule, priority=0 actions=goto_table:IngressRule ``` -Flows 1-2, which are installed by default with the highest priority, matching non-new and "tracked" packets and -forwarding them to table [IngressMetric] to bypass the check from egress rules. This means that if a connection is +Flows 1-2, which are installed by default with the highest priority, match non-new and "tracked" packets and +forward them to table [IngressMetric] to bypass the check from egress rules. This means that if a connection is established, its packets go straight to table [IngressMetric], with no other match required. 
In particular, this ensures that reply traffic is never dropped because of an Antrea-native NetworkPolicy or K8s NetworkPolicy rule. However, this also means that ongoing connections are not affected if the Antrea-native NetworkPolicy or the K8s NetworkPolicy is @@ -1656,7 +1679,8 @@ rule `AllowFromClient` in the sample policy, serves as a default rule. Unlike th Antrea-native NetworkPolicy has no default rule, and all rules should be explicitly defined. Hence, they are evaluated as-is, and there is no need for a table [AntreaPolicyIngressDefaultRule]. These flows are described as follows: -- Flow 11 is used to match any packets, constituting the second dimension for `conjunction` with `conj_id` 4. +- Flow 11 is used to match any IP packets, constituting the second dimension for `conjunction` with `conj_id` 4. This + flow, which matches all IP packets, exists because we need at least 2 dimensions for a conjunctive match. - Flow 12 is used to match packets with the output OVS port in set {0x25}, which has all the ports of the Pods selected by the label `app: web`, constituting the first dimension for `conjunction` with `conj_id` 4. - Flow 13 is used to match packets meeting both dimensions of `conjunction` with `conj_id` 4. `APDenyRegMark` that @@ -1683,9 +1707,9 @@ If you dump the flows of this table, you should see something like this: Flows 1-4 are installed for the ingress rule in the sample K8s NetworkPolicy. These flows are described as follows: - Flow 1 is used to match packets with the source IP address in set {10.10.0.26}, which is from the Pods selected - by the label `app: client`, constituting the first dimension for `conjunction` with `conj_id` 3. + by the label `app: client` in the `default` Namespace, constituting the first dimension for `conjunction` with `conj_id` 3. - Flow 2 is used to match packets with the output port OVS in set {0x25}, which has all ports of the Pods selected - by the label `app: web`, constituting the second dimension for `conjunction` with `conj_id` 3. + by the label `app: web` in the `default` Namespace, constituting the second dimension for `conjunction` with `conj_id` 3. - Flow 3 is used to match packets with the destination TCP port in set {80} specified in the rule, constituting the third dimension for `conjunction` with `conj_id` 3. - Flow 4 is used to match packets meeting all the three dimensions of `conjunction` with `conj_id` 3 and forward @@ -1696,11 +1720,11 @@ Flow 5 is the table-miss flow to forward packets not matched by other flows to t ### IngressDefaultRule This table is similar in its purpose to table [EgressDefaultRule], and it complements table [IngressRule] for K8s -NetworkPolicy ingress rule implementation. In Kubernetes, when a NetworkPolicy is applied to a set of Pods, the default -behavior for these Pods becomes "deny" (they become [isolated +NetworkPolicy ingress rule implementation. In Kubernetes, when a NetworkPolicy is applied to a set of Pods, then the default +behavior for ingress connections for these Pods becomes "deny" (they become [isolated Pods](https://kubernetes.io/docs/concepts/services-networking/network-policies/#isolated-and-non-isolated-pods)). This table is in charge of dropping traffic destined for Pods to which a NetworkPolicy (with an ingress rule) is applied, -and which did not match any of the allow list rules. +and which did not match any of the "allow" list rules. 
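The flow dumps shown throughout this document can be reproduced against the OVS bridge managed by the antrea-agent.
The invocation below is only a sketch: it assumes the default bridge name `br-int`, a `kube-system` Namespace with an
`antrea-ovs` container, and a bridge configured with OpenFlow 1.5; it filters the full dump by table name rather than
relying on a specific table number.

```text
# Dump all flows (without statistics) and keep only the ones belonging to the
# table of interest; adjust the -O option if your bridge speaks another version.
kubectl exec -n kube-system <antrea-agent-pod> -c antrea-ovs -- \
    ovs-ofctl -O OpenFlow15 --no-stats dump-flows br-int | grep table=IngressDefaultRule
```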
If you dump the flows of this table, you may see the following: @@ -1793,10 +1817,12 @@ If you dump the flows of this table, you may see the following: Flow 1 is for case 1. It matches packets with `L7NPRedirectCTMark` and `OutputToOFPortRegMark`, and then outputs them to the port `antrea-l7-tap0` specifically created for connecting to an application-aware engine. Notably, these packets are pushed with an 802.1Q header and loaded with the VLAN ID value persisted in `L7NPRuleVlanIDCTLabel` before being output, due to -the implementation of `L7NetworkPolicy`. +the implementation of Antrea-native L7 NetworkPolicy. The application-aware engine enforcing L7 policies (e.g., Suricata) +can leverage the VLAN ID to determine which set of rules to apply to the packet. Flow 2 is for case 2. It matches packets with `TrafficControlMirrorRegMark` and `OutputToOFPortRegMark`, and then outputs them to the port specified in `TargetOFPortField` and the port specified in `TrafficControlTargetOFPortField`. +Unlike the `Redirect` action, the `Mirror` action creates an additional copy of the packet. Flow 3 is for case 3. It matches packets with `TrafficControlRedirectRegMark` and `OutputToOFPortRegMark`, and then outputs them to the port specified in `TrafficControlTargetOFPortField`. @@ -1810,6 +1836,10 @@ the value stored in `TargetOFPortField`. Flows 6-7 are for case 6. They match packets by matching `OutputToControllerRegMark` and the value stored in `PacketInOperationField`, then output them to the OpenFlow controller (Antrea Agent) with corresponding user data. +In practice, you will see additional flows similar to these ones to accommodate different scenarios (different +PacketInOperationField values). Note that packets sent to controller are metered to avoid overrunning the antrea-agent +and using too many resources. + Flow 8 is the table-miss flow for case 7. It drops packets that do not match any of the flows in this table. [ARPSpoofGuard]: #arpspoofguard @@ -1818,6 +1848,7 @@ Flow 8 is the table-miss flow for case 7. It drops packets that do not match any [Classifier]: #classifier [ClusterIP without Endpoint]: #clusterip-without-endpoint [ClusterIP]: #clusterip +[ConntrackCommit]: #conntrackcommit [ConntrackState]: #conntrackstate [ConntrackZone]: #conntrackzone [Ct Labels]: #ovs-ct-label @@ -1827,6 +1858,8 @@ Flow 8 is the table-miss flow for case 7. It drops packets that do not match any [EgressMark]: #egressmark [EgressMetric]: #egressmetric [EgressRule]: #egressrule +[Egress egress-client]: #egress-applied-to-client-pods +[Egress egress-web]: #egress-applied-to-web-pods [EndpointDNAT]: #endpointdnat [IngressDefaultRule]: #ingressdefaultrule [IngressMetric]: #ingressmetric @@ -1844,9 +1877,11 @@ Flow 8 is the table-miss flow for case 7. 
It drops packets that do not match any [SNAT]: #snat [Service with ExternalIP]: #service-with-externalip [Service with ExternalTrafficPolicy Local]: #service-with-externaltrafficpolicy-local -[Service with Session Affinity]: #service-with-session-affinity +[Service with session affinity]: #service-with-session-affinity [ServiceLB]: #servicelb [SessionAffinity]: #sessionaffinity [SpoofGuard]: #spoofguard [TrafficControl]: #trafficcontrol +[TrafficControl mirror-db-to-local]: #trafficcontrol-for-packet-mirroring +[TrafficControl redirect-web-to-local]: #trafficcontrol-for-packet-redirecting [UnSNAT]: #unsnat From d53ee0ed7ff502d094a13ce39dc809f85379ef85 Mon Sep 17 00:00:00 2001 From: Hongliang Liu Date: Fri, 26 Apr 2024 12:28:59 +0800 Subject: [PATCH 3/5] For review comments Signed-off-by: Hongliang Liu --- docs/design/ovs-pipeline.md | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/docs/design/ovs-pipeline.md b/docs/design/ovs-pipeline.md index b3c514ecd9b..f6b112ec9ef 100644 --- a/docs/design/ovs-pipeline.md +++ b/docs/design/ovs-pipeline.md @@ -1339,6 +1339,12 @@ To match the expected packets, `FromPodRegMark` is used to exclude packets that Additionally, `NotAntreaFlexibleIPAMRegMark`, mutually exclusive with `AntreaFlexibleIPAMRegMark` which is used to mark packets from Antrea IPAM Pods, is used since Egress can only be applied to Node IPAM Pods. +It's worthy to note that packets sourced from local Pods and destined for the Services listed in the option +`antreaProxy.skipServices` are unexpectedly matched by flow 8. This occurs due to that there is no flow in [ServiceLB] +to handle these Services. Consequently, the destination IP address of the packets, allocated from the Service CIDR, +is considered part of the "external network". No need to worry about the mismatch, as flow 3 in table [EgressMark] +is designed to match these packets and bypass them from undergoing SNAT by Egress. + Flow 9 matches request packets originating from remote Pods and destined for the external network, and then forwards them to table [EgressMark] dedicated to feature `Egress`. To match the expected packets, `FromTunnelRegMark` is used to include packets that are from remote Pods through a tunnel. Considering that the packets from remote Pods traverse a @@ -1374,12 +1380,16 @@ If you dump the flows of this table, you may see the following: ``` Flows 1-2 match packets originating from local Pods and destined for the transport IP of remote Nodes, and then forward -them to table [L2ForwardingCalc] to bypass the Pod-to-Node traffic from Egress SNAT. `ToGatewayRegMark` is loaded, -indicating that the output port of the packets is the local Antrea gateway. +them to table [L2ForwardingCalc] to bypass Egress SNAT. `ToGatewayRegMark` is loaded, indicating that the output port +of the packets is the local Antrea gateway. + +Flow 3 matches packets originating from local Pods and destined for the Services listed in the option +`antreaProxy.skipServices`, and then forwards them to table [L2ForwardingCalc] to bypass Egress SNAT. Similar to flows +1-2, `ToGatewayRegMark` is also loaded. -Flow 3 matches packets originating from local Pods and destined for the Service CIDR, and then forwards them to table -[L2ForwardingCalc] to bypass the Pod-to-Service traffic from Egress SNAT. Similar to flows 1-2, `ToGatewayRegMark` is -also loaded. 
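For reference, the `antreaProxy.skipServices` option mentioned above is part of the antrea-agent configuration. The
snippet below is only a sketch of what such a configuration might look like; `kube-system/kube-dns` is just an
illustrative value, not something this document prescribes.

```text
antreaProxy:
  # Services listed here are ignored by AntreaProxy, so another proxy
  # implementation (typically kube-proxy) is expected to handle them. Their
  # traffic consequently leaves OVS through the local Antrea gateway.
  skipServices:
    - kube-system/kube-dns
```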
+The packets, matched by flows 1-3, are forwared to this table by flow 8 in table [L3Forwarding], as they are classified +as part of traffic destined for the external network. However, these packets are not intended to undergo Egress SNAT. +Consequently, flows 1-3 are used to bypass Egress SNAT for these packets. Flow 4 match packets originating from local Pods selected by the sample [Egress egress-client], whose SNAT IP is configured on a remote Node, which means that the matched packets should be forwarded to the remote Node through a tunnel. Before From 1354a31e6773962de31a17123916fb4fa4a67c69 Mon Sep 17 00:00:00 2001 From: Hongliang Liu Date: Fri, 26 Apr 2024 14:44:17 +0800 Subject: [PATCH 4/5] For review comments Signed-off-by: Hongliang Liu --- docs/design/ovs-pipeline.md | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/docs/design/ovs-pipeline.md b/docs/design/ovs-pipeline.md index f6b112ec9ef..81cd0486e9f 100644 --- a/docs/design/ovs-pipeline.md +++ b/docs/design/ovs-pipeline.md @@ -789,14 +789,15 @@ addresses. There are some cases where the source IP of the packets through the l Antrea gateway IP address: - When Antrea is deployed with kube-proxy, and `AntreaProxy` is not enabled, packets from local Pods destined for Services - will first go through the gateway port, get load-balanced by the kube-proxy data path (undergo DNAT with a local Endpoint - selected by the kube-proxy) then re-enter through the gateway port. Then the packets are received on the gateway port - with a source IP belonging to a local Pod. + will first go through the gateway port, get load-balanced by the kube-proxy data path (undergoes DNAT) then re-enter + the OVS pipeline through the gateway port (through an "onlink" route, installed by Antrea, directing the DNAT'd packets + to the gateway port), resulting in the source IP being that of a local Pod. - When Antrea is deployed without kube-proxy, and both `AntreaProxy` and `proxyAll` are enabled, packets from the external - network destined for Services will be routed to OVS through the gateway port without changing the source IP. + network destined for Services will be routed to OVS through the gateway port without masquerading source IP. - When Antrea is deployed with kube-proxy, packets from the external network destined for Services whose - `externalTrafficPolicy` is set to `Local` will get load-balanced by the kube-proxy data path (undergo DNAT with a - local Endpoint selected by the kube-proxy) and then routed to OVS through the gateway without SNAT. + `externalTrafficPolicy` is set to `Local` will get load-balanced by the kube-proxy data path (undergoes DNAT with a + local Endpoint selected by the kube-proxy) and then enter the OVS pipeline through the gateway (through a "onlink" + route, installed by Antrea, directing the DNAT'd packets to the gateway port) without masquerading source IP. Flows 2-4 are for case 2, matching legitimate IP packets from local Pods. @@ -829,8 +830,8 @@ therefore, connection tracking fields for the packets are unset. Flow 3 is the table-miss flow. For reply packets from SNAT'd connections, whose destination IP is the translated SNAT IP, after invoking action `ct`, -the destination IP of the packets will be restored to the original IP stored in the connection tracking field `ct_nw_dst` -before SNAT. +the destination IP of the packets will be restored to the original IP before SNAT, stored in the connection tracking +field `ct_nw_dst` before SNAT. 
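When reasoning about which of the tables described in this document a given packet actually hits,
`ovs-appctl ofproto/trace` can replay a synthetic packet against the installed flows and print the traversed tables
and actions. The command below is only a sketch: the in_port and destination are placeholders, and a traced packet
does not carry the conntrack state of real traffic unless you add `ct_state` fields to the flow description yourself.

```text
# Trace a synthetic TCP packet from a local Pod (10.10.0.26 is the sample
# client Pod IP used elsewhere in this document) towards some destination.
kubectl exec -n kube-system <antrea-agent-pod> -c antrea-ovs -- \
    ovs-appctl ofproto/trace br-int \
    'in_port=<pod-ofport>,tcp,nw_src=10.10.0.26,nw_dst=<destination-ip>,tp_dst=80'
```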
### ConntrackZone @@ -921,9 +922,10 @@ If you dump the flows of this table, you may see the following: ``` Flow 1 matches packets destined for the local Node from local Pods. `NodePortRegMark` is loaded, indicating that the -packets are potentially destined for NodePort Services. We assume only one valid IP address, `192.168.77.102`, can serve -as the host IP address for NodePort based on the option `antreaProxy.nodePortAddresses`. If there are multiple valid IP -addresses specified in the option, a flow similar to flow 1 will be installed for each IP address. +packets are potentially destined for NodePort Services. We assume only one valid IP address, `192.168.77.102` (the +Node's transport IP), can serve as the host IP address for NodePort based on the option `antreaProxy.nodePortAddresses`. +If there are multiple valid IP addresses specified in the option, a flow similar to flow 1 will be installed for each +IP address. Flow 2 match packets destined for the *Virtual NodePort DNAT IP*. Packets destined for NodePort Services from the local Node or the external network is DNAT'd to the *Virtual NodePort DNAT IP* by iptables before entering the pipeline. @@ -1160,7 +1162,7 @@ flows are described as follows: - Flow 5 is used to match packets with the destination TCP port in set {3306} specified in the rule, constituting the third dimension for `conjunction` with `conj_id` 7. - Flow 6 is used to match packets meeting all the three dimensions of `conjunction` with `conj_id` 7 and forward them - to table [EgressMetric], persisting `conj_id` to `EgressRuleCTLabel` that will be consumed in table [EgressMetric]. + to table [EgressMetric], persisting `conj_id` to `EgressRuleCTLabel`, which will be consumed in table [EgressMetric]. Flows 7-9, whose priorities are all 14499, are installed for the egress rule with a `Drop` action defined after the rule `AllowToDB` in the sample policy, and serves as a default rule. Antrea-native NetworkPolicy does not have the same @@ -1339,11 +1341,11 @@ To match the expected packets, `FromPodRegMark` is used to exclude packets that Additionally, `NotAntreaFlexibleIPAMRegMark`, mutually exclusive with `AntreaFlexibleIPAMRegMark` which is used to mark packets from Antrea IPAM Pods, is used since Egress can only be applied to Node IPAM Pods. -It's worthy to note that packets sourced from local Pods and destined for the Services listed in the option -`antreaProxy.skipServices` are unexpectedly matched by flow 8. This occurs due to that there is no flow in [ServiceLB] +It's worth noting that packets sourced from local Pods and destined for the Services listed in the option +`antreaProxy.skipServices` are unexpectedly matched by flow 8 due to the fact that there is no flow in [ServiceLB] to handle these Services. Consequently, the destination IP address of the packets, allocated from the Service CIDR, is considered part of the "external network". No need to worry about the mismatch, as flow 3 in table [EgressMark] -is designed to match these packets and bypass them from undergoing SNAT by Egress. +is designed to match these packets and prevent them from undergoing SNAT by Egress. Flow 9 matches request packets originating from remote Pods and destined for the external network, and then forwards them to table [EgressMark] dedicated to feature `Egress`. 
To match the expected packets, `FromTunnelRegMark` is used to @@ -1387,7 +1389,7 @@ Flow 3 matches packets originating from local Pods and destined for the Services `antreaProxy.skipServices`, and then forwards them to table [L2ForwardingCalc] to bypass Egress SNAT. Similar to flows 1-2, `ToGatewayRegMark` is also loaded. -The packets, matched by flows 1-3, are forwared to this table by flow 8 in table [L3Forwarding], as they are classified +The packets, matched by flows 1-3, are forwarded to this table by flow 8 in table [L3Forwarding], as they are classified as part of traffic destined for the external network. However, these packets are not intended to undergo Egress SNAT. Consequently, flows 1-3 are used to bypass Egress SNAT for these packets. @@ -1462,7 +1464,7 @@ packets to table [SNAT], `ToExternalAddressRegMark` and `NotDSRServiceRegMark` a are destined for a Service's external IP, like NodePort, LoadBalancerIP or ExternalIP, but it is not DSR mode. Additionally, `ConnSNATCTMark`, indicating that the connection requires SNAT, is persisted to mark the connections. -It's worthy to note that flows 1-2 are specific to `proxyAll`, but it is harmless when `proxyAll` is disabled since +It's worth noting that flows 1-2 are specific to `proxyAll`, but they are harmless when `proxyAll` is disabled since these flows should be never matched by in-cluster Service traffic. Flow 3-4 match the first packet of hairpin Service connections, identified by the same source and destination Pod IP From caa54e67d289068663107ddb09829b0710614e09 Mon Sep 17 00:00:00 2001 From: Quan Tian Date: Fri, 26 Apr 2024 21:38:46 +0800 Subject: [PATCH 5/5] Address comment Signed-off-by: Quan Tian --- docs/design/ovs-pipeline.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/design/ovs-pipeline.md b/docs/design/ovs-pipeline.md index 81cd0486e9f..6f5d1d18b24 100644 --- a/docs/design/ovs-pipeline.md +++ b/docs/design/ovs-pipeline.md @@ -831,7 +831,7 @@ Flow 3 is the table-miss flow. For reply packets from SNAT'd connections, whose destination IP is the translated SNAT IP, after invoking action `ct`, the destination IP of the packets will be restored to the original IP before SNAT, stored in the connection tracking -field `ct_nw_dst` before SNAT. +field `ct_nw_dst`. ### ConntrackZone