diff --git a/docs/assets/ovs-pipeline-antrea-proxy.svg b/docs/assets/ovs-pipeline-antrea-proxy.svg
index 7016a665305..28e7cdca320 100644
--- a/docs/assets/ovs-pipeline-antrea-proxy.svg
+++ b/docs/assets/ovs-pipeline-antrea-proxy.svg
@@ -3,13 +3,13 @@
   inkscape:export-ydpi="300"
   inkscape:export-xdpi="300"
   inkscape:export-filename="/Users/lhongliang/Desktop/bitmap.png"
-  sodipodi:docname="ovs-pipeline.svg"
-  inkscape:version="1.1 (c4e8f9e, 2021-05-24)"
+  sodipodi:docname="ovs-pipeline-antrea-proxy.svg"
+  inkscape:version="1.3 (0e150ed, 2023-07-21)"
   id="svg8"
   version="1.1"
-  viewBox="0 0 384 227"
-  height="227mm"
-  width="384mm"
+  viewBox="0 0 450 327"
+  height="327mm"
+  width="450mm"

[The remainder of this SVG diff — several thousand lines of regenerated Inkscape arrow-marker definitions, paths, and text elements for the updated pipeline diagram — is not reproducible here and has been omitted.]

diff --git a/docs/design/ovs-pipeline.md b/docs/design/ovs-pipeline.md
index 5188cfe43f5..72871898e6b 100644
--- a/docs/design/ovs-pipeline.md
+++ b/docs/design/ovs-pipeline.md
@@ -2,8 +2,7 @@

 ## Terminology

-* *Node Route Controller*: the [K8s
-  controller](https://kubernetes.io/docs/concepts/architecture/controller/)
+* *Node Route Controller*: the [K8s controller](https://kubernetes.io/docs/concepts/architecture/controller/)
   which is part of the Antrea Agent and watches for updates to Nodes.
When a Node is added, it updates the local networking configuration (e.g. configure the tunnel to the new Node).
   When a Node is deleted, it performs the necessary
@@ -53,532 +52,584 @@ for more information about session affinity.

 **This document currently makes the following assumptions:**
+
+* Antrea is deployed with default configurations and feature gates.
+* IPv4 only.
-* Antrea is used in encap mode (an overlay network is created between all Nodes)
-* All the Nodes are Linux Nodes
-* IPv6 is disabled
-* AntreaProxy is enabled
-* AntreaPolicy is enabled

+## Dumping the Flows / Groups
-## Dumping the Flows

-This guide includes a representative flow dump for every table in the pipeline,
-in order to illustrate the function of each table. If you have a cluster running
-Antrea, you can dump the flows for a given Node as follows:
+This guide includes a representative flow dump for every table in the pipeline, in order to illustrate the function of
+each table. If you have a cluster running Antrea, you can dump the flows or groups on a given Node as follows:

```bash
-kubectl exec -n kube-system <antrea-agent-pod-name> -c antrea-ovs -- ovs-ofctl dump-flows <bridge-name> [--no-stats] [--names]
+# Dump all flows.
+kubectl exec -n kube-system <antrea-agent-pod-name> -c antrea-ovs -- ovs-ofctl dump-flows <bridge-name> -O Openflow15 [--no-stats] [--names]
+
+# Dump all groups.
+kubectl exec -n kube-system <antrea-agent-pod-name> -c antrea-ovs -- ovs-ofctl dump-groups <bridge-name> -O Openflow15 [--no-stats] [--names]
```

-where `<antrea-agent-pod-name>` is the name of the Antrea Agent Pod running on
-that Node and `<bridge-name>` is the name of the bridge created by Antrea
-(`br-int` by default).
+where `<antrea-agent-pod-name>` is the name of the Antrea Agent Pod running on that Node, and `<bridge-name>` is the name
+of the bridge created by Antrea (`br-int` by default).

-## Registers
+You can also dump the flows of a table or a group on a given Node as follows:

-We use 2 32-bit OVS registers to carry information throughout the pipeline:
-
-* reg0 (NXM_NX_REG0):
-  - bits [0..3] are used to store the traffic source (from tunnel: 0, from
-    local gateway: 1, from local Pod: 2). It is set in [ClassifierTable].
-  - bit 16 is used to indicate whether the destination MAC address of a packet
-    is "known", i.e. corresponds to an entry in [L2ForwardingCalcTable], which
-    is essentially a "dmac" table.
-  - bit 18 is used to indicate whether the packet should be output to the port
-    on which it was received. It is consumed in [L2ForwardingOutTable]
-    to output the packet with action `IN_PORT`.
-  - bit 19 is used to indicate whether the destination and source MACs of the
-    packet should be rewritten in [l3ForwardingTable]. The bit is set for
-    packets received from the tunnel port in [ClassifierTable]. The
-    destination MAC of such packets is the Global Virtual MAC and should be
-    rewritten to the destination port's MAC before output to the port. When such
-    a packet is destined to a Pod, its source MAC should be rewritten to the
-    local gateway port's MAC too.
-* reg1 (NXM_NX_REG1): it is used to store the egress OF port for the packet. It
-  is set in [DNATTable] for traffic destined to Services and in
-  [L2ForwardingCalcTable] otherwise. It is consumed in [L2ForwardingOutTable] to
-  output each packet to the correct port.
-* reg3 (NXM_NX_REG3): it is used to store selected Service Endpoint IPv4 address
-  in OVS group entry. It is consumed in [EndpointDNATTable].
-* reg4 (NXM_NX_REG4):
-  * bits [0..16] are used to store selected Service Endpoint port number in OVS
-    group entry. They are consumed in [EndpointDNATTable].
-  * bits [17..18] are used to store the state of a Service request packet.
-    Marks in this field include,
-    * 0b001: packet needs to do Endpoint selection.
-    * 0b010: packet has done Endpoint selection.
-    * 0b011: packet has done Endpoint selection and the selection result needs to
-      be cached.
-
-## Network Policy Implementation
-
-Several tables of the pipeline are dedicated to [K8s Network
-Policy](https://kubernetes.io/docs/concepts/services-networking/network-policies/)
-implementation ([EgressRuleTable], [EgressDefaultTable], [IngressRuleTable] and
-[IngressDefaultTable]).
-
-The Antrea implementation of K8s Network Policy, including the communication
-channel between the Controller and Agents, and how a Network Policy is mapped to
-OVS flows at each Node, will be described in details in a separate document. For
-the present document, we will use the Network Policy example below, and explain
-how these simple ingress and egress rules map to individual flows as we describe
-the relevant tables of our pipeline.
-
-```yaml
-apiVersion: networking.k8s.io/v1
-kind: NetworkPolicy
-metadata:
-  name: test-network-policy
-  namespace: default
-spec:
-  podSelector:
-    matchLabels:
-      app: nginx
-  policyTypes:
-    - Ingress
-    - Egress
-  ingress:
-    - from:
-        - podSelector:
-            matchLabels:
-              app: nginx
-      ports:
-        - protocol: TCP
-          port: 80
-  egress:
-    - to:
-        - podSelector:
-            matchLabels:
-              app: nginx
-      ports:
-        - protocol: TCP
-          port: 80
-```
+```bash
+# Dump flows of a table.
+kubectl exec -n kube-system <antrea-agent-pod-name> -c antrea-ovs -- ovs-ofctl dump-flows <bridge-name> table=<table-name> -O Openflow15 [--no-stats] [--names]

-This Network Policy is applied to all Pods with the `nginx` app label in the
-`default` Namespace. For these Pods, it only allows TCP traffic on port 80 from
-and to Pods which also have the `nginx` app label. Because Antrea will only
-install OVS flows for this Network Policy on Nodes for which some of the Pods
-are the target of the policy, we have scheduled 2 `nginx` Pods on the same
-Node. They received IP addresses 10.10.1.2 and 10.10.1.3 from the Antrea CNI, so
-you will see these addresses show up in the OVS flows.
-
-## Antrea-native Policies Implementation
-
-In addition to the above tables created for K8s NetworkPolicy, Antrea creates
-additional dedicated tables to support the [Antrea-native policies](../antrea-network-policy.md)
-([AntreaPolicyEgressRuleTable] and [AntreaPolicyIngressRuleTable]).
-
-Consider the following Antrea ClusterNetworkPolicy (ACNP) in the Application tier as an
-example for the remainder of this document.
-
-```yaml
-apiVersion: crd.antrea.io/v1beta1
-kind: ClusterNetworkPolicy
-metadata:
-  name: cnp0
-spec:
-  priority: 10
-  tier: application # defaults to application tier if not specified
-  appliedTo:
-    - podSelector:
-        matchLabels:
-          app: server
-  ingress:
-    - action: Drop
-      from:
-        - podSelector:
-            matchLabels:
-              app: notClient
-      ports:
-        - protocol: TCP
-          port: 80
-  egress:
-    - action: Allow
-      to:
-        - podSelector:
-            matchLabels:
-              app: dns
-      ports:
-        - protocol: UDP
-          port: 53
-```
+# Dump a group.
+kubectl exec -n kube-system <antrea-agent-pod-name> -c antrea-ovs -- ovs-ofctl dump-groups <bridge-name> <group-id> -O Openflow15 [--no-stats] [--names]
+```

-This ACNP is applied to all Pods with the `app: server` label in all
-Namespaces. For these Pods, it drops TCP traffic on port 80 from all
-Pods which have the `app: notClient` label. In addition to the ingress rules,
-this policy also allows egress UDP traffic on port 53 to all Pods with the
-label `app: dns`. Similar to K8s NetworkPolicy, Antrea will only install OVS
-flows for this ACNP on Nodes for which some of the Pods are the target of the
-policy. Thus, we have scheduled three Pods (appServer, appDns, appNotClient)
-on the same Node and they have the following IP addresses:
+where `<table-name>` is the name of a table in the pipeline, and `<group-id>` is the ID of a group.
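For instance, the group dump can be scoped to a single group ID. The `ServiceLB` flows shown later in this document
reference `group:9`, so a concrete invocation could look like the sketch below:

```bash
# Dump the OVS group with ID 9, which a ServiceLB flow targets via "group:9".
kubectl exec -n kube-system <antrea-agent-pod-name> -c antrea-ovs -- \
    ovs-ofctl dump-groups br-int 9 -O Openflow15
```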
-- appServer: 10.10.1.6
-- appNotClient: 10.10.1.7
-- appDns: 10.10.1.8

+## Registers
-## Tables

+We use some OVS registers to carry information throughout the pipeline.

| Register | Field Range | Field Name | RegMark Value | RegMark Name | Description |
|---------------|-------------|---------------------------|---------------|---------------------------------|------------------------------------------------------------------------------------------|
| NXM_NX_REG0 | bits 0-3 | PktSourceField | 0x1 | FromTunnelRegMark | Packet source is tunnel port. |
| | | | 0x2 | FromGatewayRegMark | Packet source is Antrea gateway port. |
| | | | 0x3 | FromLocalRegMark | Packet source is local Pod. |
| | | | 0x4 | FromUplinkRegMark | Packet source is uplink port. |
| | | | 0x5 | FromBridgeRegMark | Packet source is local bridge port. |
| | | | 0x6 | FromTCReturnRegMark | Packet source is TrafficControl return port. |
| | bits 4-7 | PktDestinationField | 0x1 | ToTunnelRegMark | Packet destination is tunnel port. |
| | | | 0x2 | ToGatewayRegMark | Packet destination is local Antrea gateway port. |
| | | | 0x3 | ToLocalRegMark | Packet destination is local Pod. |
| | | | 0x4 | ToUplinkRegMark | Packet destination is uplink port. |
| | | | 0x5 | ToBridgeRegMark | Packet destination is local bridge port. |
| | bit 9 | | 0b0 | NotRewriteMACRegMark | Packet's source / destination MAC address does not need to be rewritten. |
| | | | 0b1 | RewriteMACRegMark | Packet's source / destination MAC address needs to be rewritten. |
| | bit 10 | | 0b1 | APDenyRegMark | Packet is denied (Drop / Reject) by Antrea NetworkPolicy. |
| | bits 11-12 | APDispositionField | 0b00 | DispositionAllowRegMark | Indicating Antrea NetworkPolicy disposition: allow. |
| | | | 0b01 | DispositionDropRegMark | Indicating Antrea NetworkPolicy disposition: drop. |
| | | | 0b11 | DispositionPassRegMark | Indicating Antrea NetworkPolicy disposition: pass. |
| | bit 13 | | 0b1 | GeneratedRejectPacketOutRegMark | Indicating packet is a generated reject response packet-out. |
| | bit 14 | | 0b1 | SvcNoEpRegMark | Indicating packet towards a Service without Endpoints (used by AntreaProxy). |
| | bit 19 | | 0b1 | RemoteSNATRegMark | Indicating packet needs SNAT on a remote Node (used by Egress). |
| | bit 22 | | 0b1 | L7NPRedirectRegMark | Indicating L7 Antrea NetworkPolicy disposition of redirect. |
| | bits 21-22 | OutputRegField | 0b01 | OutputToOFPortRegMark | Output packet to an OVS port. |
| | | | 0b10 | OutputToControllerRegMark | Send packet to Antrea Agent. |
| | bits 25-32 | PacketInOperationField | 0b00000001 | PacketInNPLoggingOperation | Indicating packet needs logging for NetworkPolicy packetIn operation. |
| | | | 0b00000010 | PacketInNPRejectOperation | Indicating packet should be rejected for NetworkPolicy packetIn operation. |
| | | | 0b00000100 | PacketInNPStoreDenyOperation | Indicating the corresponding connection has been dropped or rejected. |
| NXM_NX_REG1 | bits 0-31 | TargetOFPortField | | | Egress OVS port of packet. |
| +| | | PacketInTableField | | | OVS table where the packet is decided to be sent to controller. | +| NXM_NX_REG3 | bits 0-31 | EndpointIPField | | | Field where stores selected Service Endpoint IPv4 address. | +| | bits 0-31 | APConjIDField | | | Field to store Conjunction ID for Antrea Policy. | +| NXM_NX_REG4 | bits 0-15 | EndpointPortField | | | Field stores IPv4 address of a Service's selected Endpoint. | +| | bits 16-18 | ServiceEPStateField | 0b001 | EpToSelectRegMark | Packet needs to do Service Endpoint selection. | +| | bits 16-18 | ServiceEPStateField | 0b010 | EpSelectedRegMark | Packet has done Service Endpoint selection. | +| | bits 16-18 | ServiceEPStateField | 0b011 | EpToLearnRegMark | Packet has done Service Endpoint selection and the selected Endpoint needs to be cached. | +| | bits 0-18 | EpUnionField | | | The union value of EndpointIPField and EndpointPortField. | +| | bit 19 | | 0b1 | ToNodePortAddressRegMark | Packet is destined to a Service of type NodePort. | +| | bit 20 | | 0b1 | AntreaFlexibleIPAMRegMark | Packet is from local Antrea IPAM Pod. | +| | bit 20 | | 0b0 | NotAntreaFlexibleIPAMRegMark | Packet is not from local Antrea IPAM Pod. | +| | bit 21 | | 0b1 | ToExternalAddressRegMark | Packet is destined to a Service's external IP. | +| | bits 22-23 | TrafficControlActionField | 0b01 | TrafficControlMirrorRegMark | Indicating packet needs to be mirrored (used by TrafficControl). | +| | | | 0b10 | TrafficControlRedirectRegMark | Indicating packet needs to be redirected (used by TrafficControl). | +| | bit 24 | | 0b1 | NestedServiceRegMark | Packet is destined to a Service which is using other other Service as Endpoints. | +| | bit 25 | | 0b1 | DSRServiceRegMark | Packet is destined to a Service working in DSR mode. | +| | | | 0b0 | NotDSRServiceRegMark | Packet is destined to a Service working not in DSR mode. | +| | bit 26 | | 0b1 | RemoteEndpointRegMark | Packet is destined to a Service selecting a remote non-hostNetwork Endpoint. | +| | bit 27 | | 0b1 | FromExternalRegMark | Packet is from Antrea gateway, but its source IP is not the gateway IP. | +| NXM_NX_REG5 | bits 0-31 | TFEgressConjIDField | | | Egress conjunction ID hit by TraceFlow packet. | +| NXM_NX_REG6 | bits 0-31 | TFIngressConjIDField | | | Ingress conjunction ID hit by TraceFlow packet. | +| NXM_NX_REG7 | bits 0-31 | ServiceGroupIDField | | | GroupID corresponding to the Service. | +| NXM_NX_REG8 | bits 0-11 | VLANIDField | | | VLAN ID. | +| | bits 12-15 | CtZoneTypeField | 0b0001 | IPCtZoneTypeRegMark | Ct zone type is IPv4. | +| | | | 0b0011 | IPv6CtZoneTypeRegMark | Ct zone type is IPv6. | +| | bits 0-15 | CtZoneField | | | Ct zone ID which is a combination of VLANIDField and CtZoneTypeField. | +| NXM_NX_XXREG3 | bits 0-127 | EndpointIP6Field | | | Field where stores IPv6 address of a Service's selected Endpoint. | + +Note that, regmarks that have overlapped bits will not be used at the same time, like `SwapField` and `PacketInTableField`. + +## CT Marks + +| Field Range | Field Name | CT Mark Value | CT Mark Name | Description | +|-------------|-----------------------|---------------|--------------------|-----------------------------------------------------------------| +| bits 0-3 | ConnSourceCTMarkField | 0b0010 | FromGatewayCTMark | Connection source is Antrea gateway port. | +| | | 0b0101 | FromBridgeCTMark | Connection source is local bridge port. | +| bit 4 | | 0b1 | ServiceCTMark | Connection is for Service. | +| | | 0b0 | NotServiceCTMark | Connection is not for Service. 
## CT Marks

| Field Range | Field Name | CT Mark Value | CT Mark Name | Description |
|-------------|-----------------------|---------------|--------------------|-----------------------------------------------------------------|
| bits 0-3 | ConnSourceCTMarkField | 0b0010 | FromGatewayCTMark | Connection source is Antrea gateway port. |
| | | 0b0101 | FromBridgeCTMark | Connection source is local bridge port. |
| bit 4 | | 0b1 | ServiceCTMark | Connection is for Service. |
| | | 0b0 | NotServiceCTMark | Connection is not for Service. |
| bit 5 | | 0b1 | ConnSNATCTMark | SNAT is performed on the connection for Service. |
| bit 6 | | 0b1 | HairpinCTMark | Hairpin connection. |
| bit 7 | | 0b1 | L7NPRedirectCTMark | Connection should be redirected to an application-aware engine. |

## CT Labels

| Field Range | Field Name | Description |
|-------------|-----------------------|------------------------------------|
| bits 0-31 | IngressRuleCTLabel | Ingress rule ID. |
| bits 32-63 | EgressRuleCTLabel | Egress rule ID. |
| bits 64-75 | L7NPRuleVlanIDCTLabel | VLAN ID for L7 NetworkPolicy rule. |

## CT Zones

| Zone ID | Zone Name | Description |
|---------|--------------|----------------------------------------------------|
| 65520 | CtZone | Tracking IPv4 connections that don't require SNAT. |
| 65510 | CtZoneV6 | Tracking IPv6 connections that don't require SNAT. |
| 65521 | SNATCtZone | Tracking IPv4 connections that require SNAT. |
| 65511 | SNATCtZoneV6 | Tracking IPv6 connections that require SNAT. |
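The marks, labels, and zones above can also be observed on live connections. A sketch of how one might inspect them
(the `dpctl/dump-conntrack` output format varies across OVS versions):

```bash
# Dump conntrack entries in CtZone (65520); -m also prints the ct_mark and
# ct_label values, which can be decoded against the tables above.
kubectl exec -n kube-system <antrea-agent-pod-name> -c antrea-ovs -- \
    ovs-appctl dpctl/dump-conntrack -m zone=65520
```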
+## OVS Tables

![OVS pipeline](../assets/ovs-pipeline-antrea-proxy.svg)

-### ClassifierTable (0)
+### PipelineRootClassifier

-This table is used to determine which "category" of traffic (tunnel, local
-gateway or local Pod) the packet belongs to. This is done by matching on the
-ingress port for the packet. The appropriate value is then written to bits
-[0..3] in NXM_NX_REG0: 0 for tunnel, 1 for local gateway and 2 for local Pod.
-This information is used by matches in subsequent tables. For a packet received
-from the tunnel port, bit 19 in NXM_NX_REG0 is set to 1, to indicate MAC rewrite
-should be performed for the packet in [L3ForwardingTable].
+This table serves as the primary entry point in the pipeline, directing packets to different tables based on their
+respective protocols.

-If you dump the flows for this table, you may see the following:
+If you dump the flows for this table, you should see the following:

```text
-1. table=0, priority=200,in_port=2 actions=set_field:0x1/0xf->reg0,resubmit(,10)
-2. table=0, priority=200,in_port=1 actions=set_field:0/0xf->reg0,load:0x1->NXM_NX_REG0[19],resubmit(,30)
-3. table=0, priority=190,in_port=4 actions=set_field:0x2/0xf->reg0,resubmit(,10)
-4. table=0, priority=190,in_port=3 actions=set_field:0x2/0xf->reg0,resubmit(,10)
-5. table=0, priority=0 actions=drop
+1. table=PipelineRootClassifier, priority=200,arp actions=goto_table:ARPSpoofGuard
+2. table=PipelineRootClassifier, priority=200,ip actions=goto_table:Classifier
+3. table=PipelineRootClassifier, priority=0 actions=drop
```

-Flow 1 is for traffic coming in on the local gateway. Flow 2 is for traffic
-coming in through an overlay tunnel (i.e. from another Node). The next two
-flows (3 and 4) are for local Pods.
-
-Local traffic then goes to [SpoofGuardTable], while tunnel traffic from other
-Nodes goes to [ConntrackTable]. The table-miss flow entry will drop all
-unmatched packets (in practice this flow entry should almost never be used).
+- Flow 1 forwards ARP packets to table [ARPSpoofGuard].
+- Flow 2 forwards IP packets to table [Classifier].
+- Flow 3 is the default drop flow, not normally used.

-### SpoofGuardTable (10)
+### ARPSpoofGuard

-This table prevents IP and ARP
-[spoofing](https://en.wikipedia.org/wiki/Spoofing_attack) from local Pods. For
-each Pod (as identified by the ingress port), we ensure that:
-
-* for IP traffic, the source IP and MAC addresses are correct, i.e. match the
-  values configured on the interface when Antrea set-up networking for the Pod.
-* for ARP traffic, the advertised IP and MAC addresses are correct, i.e. match
-  the values configured on the interface when Antrea set-up networking for the
-  Pod.
+This table drops ARP [spoofing](https://en.wikipedia.org/wiki/Spoofing_attack) packets from local Pods and the local
+Antrea gateway. For each Pod (as identified by the ingress port), we ensure that the advertised IP and MAC addresses
+are correct, i.e. that they match the values configured on the interface when Antrea set up networking for the Pod.

-Because Antrea currently relies on kube-proxy to load-balance traffic destined
-to Services, implementing that kind of IP spoofing check for traffic coming-in
-on the local gateway port is not as trivial. Traffic from local Pods destined to
-Services will first go through the gateway, get load-balanced by the kube-proxy
-datapath (DNAT) then sent back through the gateway. This means that legitimate
-traffic can be received on the gateway port with a source IP belonging to a
-local Pod. We may add some fine-grained rules in the future to accommodate for
-this, but for now we just allow all IP traffic received from the gateway. We do
-have an ARP spoofing check for the gateway however, since there is no reason for
-the host to advertise a different MAC address on antrea-gw0.

-If you dump the flows for this table, you may see the following:
+If you dump the flows for this table, you may see the following:

```text
-1. table=10, priority=200,ip,in_port=2 actions=resubmit(,23)
-2. table=10, priority=200,arp,in_port=2,arp_spa=10.10.0.1,arp_sha=3a:dd:79:0f:55:4c actions=resubmit(,20)
-3. table=10, priority=200,arp,in_port=4,arp_spa=10.10.0.2,arp_sha=ce:99:ca:bd:62:c5 actions=resubmit(,20)
-4. table=10, priority=200,arp,in_port=3,arp_spa=10.10.0.3,arp_sha=3a:41:49:42:98:69 actions=resubmit(,20)
-5. table=10, priority=200,ip,in_port=4,dl_src=ce:99:ca:bd:62:c5,nw_src=10.10.0.2 actions=resubmit(,23)
-6. table=10, priority=200,ip,in_port=3,dl_src=3a:41:49:42:98:69,nw_src=10.10.0.3 actions=resubmit(,23)
-7. table=10, priority=0 actions=drop
```

-After this table, ARP traffic goes to [ARPResponderTable], while IP
-traffic goes to [ServiceHairpinTable]. Traffic which does not match
-any of the rules described above will be dropped by the table-miss flow entry.
+1. table=ARPSpoofGuard, priority=200,arp,in_port="antrea-gw0",arp_spa=10.10.0.1,arp_sha=ba:5e:d1:55:aa:c0 actions=goto_table:ARPResponder
+2. table=ARPSpoofGuard, priority=200,arp,in_port="nginx-d9-cd1533",arp_spa=10.10.0.7,arp_sha=2e:ba:06:b2:44:91 actions=goto_table:ARPResponder
+3. table=ARPSpoofGuard, priority=200,arp,in_port="nginx-d9-b93cc5",arp_spa=10.10.0.8,arp_sha=c2:5a:5e:50:95:9b actions=goto_table:ARPResponder
+4. table=ARPSpoofGuard, priority=0 actions=drop
+```

+- Flow 1 matches ARP packets from the local Antrea gateway.
+- Flows 2-3 match ARP packets from local Pods.
+- Flow 4 is the default flow, dropping ARP spoofing packets not matched by flows 1-3.

For more details of flows 1-3:

- Match condition `arp` matches ARP packets.
- Match condition `in_port=<port>` matches the port on which the packet was received.
- Match condition `arp_spa=<ip>` matches packets whose ARP source protocol address corresponds to the IP address of a
  local Pod or the Antrea gateway.
- Match condition `arp_sha=<mac>` matches packets whose ARP source hardware address corresponds to the MAC address of a
  local Pod or the Antrea gateway.
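To check these flows on your own cluster, the table-scoped dump from the introduction applies directly; a sketch,
assuming the default `br-int` bridge:

```bash
# Dump only the ARPSpoofGuard table, with port names instead of numbers.
kubectl exec -n kube-system <antrea-agent-pod-name> -c antrea-ovs -- \
    ovs-ofctl dump-flows br-int table=ARPSpoofGuard -O Openflow15 --names --no-stats
```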
### ARPResponder

The main purpose of this table is to reply to ARP requests from the local Antrea gateway asking for the MAC address of
a remote peer gateway (another Node's gateway). This ensures that the local Node can reach any remote Pod, which in
particular is required for Service traffic which has been load-balanced to a remote Pod backend by kube-proxy. Note
that the table is programmed to reply to such ARP requests with a "Global Virtual MAC" ("Global" means it is used by
all Antrea OVS bridges), which is `aa:bb:cc:dd:ee:ff`, and not with the actual MAC address of the remote gateway. This
ensures that once the traffic is received by the remote OVS bridge, it can be directly forwarded to the appropriate
Pod without actually going through the gateway. The virtual MAC is used as the destination MAC address for all the
traffic being tunnelled.

If you dump the flows for this table, you may see the following:

```text
-1. table=20, priority=200,arp,arp_tpa=10.10.1.1,arp_op=1 actions=move:NXM_OF_ETH_SRC[]->NXM_OF_ETH_DST[],mod_dl_src:aa:bb:cc:dd:ee:ff,set_field:2->arp_op,move:NXM_NX_ARP_SHA[]->NXM_NX_ARP_THA[],load:0xaabbccddeeff->NXM_NX_ARP_SHA[],move:NXM_OF_ARP_SPA[]->NXM_OF_ARP_TPA[],load:0xa0a0101->NXM_OF_ARP_SPA[],IN_PORT
-2. table=20, priority=190,arp actions=NORMAL
-3. table=20, priority=0 actions=drop
-```

-Flow 1 is the "ARP responder" for the peer Node whose local Pod subnet is
-10.10.1.0/24. If we were to look at the routing table for the local Node, we
-would see the following "onlink" route:
-
-```text
-10.10.1.0/24 via 10.10.1.1 dev antrea-gw0 onlink
-```
-
-A similar route is installed on the gateway (antrea-gw0) interface every time the
-Antrea Node Route Controller is notified that a new Node has joined the
-cluster. The route must be marked as "onlink" since the kernel does not have a
-route to the peer gateway 10.10.1.1: we trick the kernel into believing that
-10.10.1.1 is directly connected to the local Node, even though it is on the
-other side of the tunnel.
-
-Flow 2 ensures that OVS handle the remainder of ARP traffic as a regular L2
-learning switch (using the `normal` action). In particular, this takes care of
-forwarding ARP requests and replies between local Pods.
-
-The table-miss flow entry (flow 3) will drop all other packets. This flow should
-never be used because only ARP traffic should go to this table, and
-ARP traffic will either match flow 1 or flow 2.

```text
+1. table=ARPResponder, priority=200,arp,arp_tpa=10.10.1.1,arp_op=1 actions=move:NXM_OF_ETH_SRC[]->NXM_OF_ETH_DST[],set_field:aa:bb:cc:dd:ee:ff->eth_src,set_field:2->arp_op,move:NXM_NX_ARP_SHA[]->NXM_NX_ARP_THA[],set_field:aa:bb:cc:dd:ee:ff->arp_sha,move:NXM_OF_ARP_SPA[]->NXM_OF_ARP_TPA[],set_field:10.10.1.1->arp_spa,IN_PORT
+2. table=ARPResponder, priority=190,arp actions=NORMAL
+3. table=ARPResponder, priority=0 actions=drop
```

- Flow 1 matches ARP requests from the Antrea gateway asking for the MAC address of the remote peer gateway with IP
  address 10.10.1.1. The actions craft an ARP reply and send it back out the port on which the request was received.
- Flow 2 handles the remaining ARP packets with the `NORMAL` action, letting OVS behave as a regular L2 learning switch.
- Flow 3 is the default drop flow.

For more details of flow 1:

- Match condition `arp` matches ARP packets.
- Match condition `arp_tpa=10.10.1.1` matches packets whose ARP target protocol address corresponds to the IP address
  of a remote peer gateway.
- Match condition `arp_op=1` matches ARP request packets.
- Action `move:NXM_OF_ETH_SRC[]->NXM_OF_ETH_DST[]` sets the destination MAC address to the source MAC address of the
  request.
- Action `set_field:aa:bb:cc:dd:ee:ff->eth_src` sets the source MAC address to the Global Virtual MAC `aa:bb:cc:dd:ee:ff`.
- Action `set_field:2->arp_op` sets the ARP operation to reply.
- Action `move:NXM_NX_ARP_SHA[]->NXM_NX_ARP_THA[]` sets the ARP target hardware address to the ARP source hardware
  address of the request.
- Action `set_field:aa:bb:cc:dd:ee:ff->arp_sha` sets the ARP source hardware address to the Global Virtual MAC
  `aa:bb:cc:dd:ee:ff`.
- Action `move:NXM_OF_ARP_SPA[]->NXM_OF_ARP_TPA[]` sets the ARP target protocol address to the source protocol address
  of the request.
- Action `set_field:10.10.1.1->arp_spa` sets the ARP source protocol address.
- Action `IN_PORT` outputs the reply on the port where the request was received.

### Classifier

This table is used to determine which "category" of traffic (tunnel, local gateway or local Pod, etc.) a packet belongs
to. This is done by matching on the ingress port of the packet.

-### ServiceHairpinTable (23)

-When a backend Pod of a Service accesses the Service, and the Pod itself is selected
-as the destination, then we have the hairpin case, in which the source IP should be
-SNAT'd with a virtual hairpin IP in [hairpinSNATTable]. The source and destination
-IP addresses cannot be the same, otherwise the connection will be broken.
It will be -explained in detail in [hairpinSNATTable]. For response packets, the -destination IP is the virtual hairpin IP, so the destination IP should be changed back -to the IP of the backend Pod. Then the response packets can be forwarded back correctly. - -If you dump the flows for this table, you should see the flows: +If you dump the flows for this table, you may see the following: ```text -1. table=23, priority=200,ip,nw_dst=169.254.169.252 actions=move:NXM_OF_IP_SRC[]->NXM_OF_IP_DST[],load:0x1->NXM_NX_REG0[18],resubmit(,30) -2. table=23, priority=0 actions=resubmit(,24) +1. table=ARPResponder, priority=200,arp,arp_tpa=10.10.1.1,arp_op=1 actions=move:NXM_OF_ETH_SRC[]->NXM_OF_ETH_DST[],set_field:aa:bb:cc:dd:ee:ff->eth_src,set_field:2->arp_op,move:NXM_NX_ARP_SHA[]->NXM_NX_ARP_THA[],set_field:aa:bb:cc:dd:ee:ff->arp_sha,move:NXM_OF_ARP_SPA[]->NXM_OF_ARP_TPA[],set_field:10.10.1.1->arp_spa,IN_PORT +2. table=ARPResponder, priority=190,arp actions=NORMAL +3. table=ARPResponder, priority=0 actions=drop ``` -Flow 1 is used to match packet whose destination IP is virtual hairpin IP and -change the destination IP of the matched packet by loading register `NXM_OF_IP_SRC` -to `NXM_OF_IP_DST`. Bit 18 in NXM_NX_REG0 is set to 0x1, which indicates that the -packet should be output to the port on which it was received, which is done in -[L2ForwardingOutTable]. - -### ConntrackTable (30) +- Flow 1 matches ARP requests from Antrea gateway asking for the MAC address of a remote peer gateway with IP address + 10.10.1.1. The actions are taken to craft an ARP reply packet and send it back to the port where the ARP request was received. +- Flow 2 handles ARP request packets normally. +- Flow 3 is the default drop flow. + +For more details of flow 1: + +- Match condition `arp` is to match ARP packets. +- Match condition `arp_tpa=10.10.1.1` is to match packets with ARP target protocol address, which corresponds to the IP + address of a remote peer gateway. +- Match condition `arp_op=1` is to match ARP request packets. +- Action `move:NXM_OF_ETH_SRC[]->NXM_OF_ETH_DST[]` is to set destination MAC address with source MAC address of the + current packet. +- Action `set_field:aa:bb:cc:dd:ee:ff->eth_src` is to set source MAC address with global virtual MAC address `aa:bb:cc:dd:ee:ff`. +- Action `set_field:2->arp_op` is to set ARP type to reply. +- Action `move:NXM_NX_ARP_SHA[]->NXM_NX_ARP_THA[]` is to set ARP target hardware address with ARP source hardware address + of the current packet. +- Action `set_field:aa:bb:cc:dd:ee:ff->arp_sha` is to set ARP source hardware address with global virtual MAC address + `aa:bb:cc:dd:ee:ff`. +- Action `move:NXM_OF_ARP_SPA[]->NXM_OF_ARP_TPA[]` is to set ARP target protocol address with source protocol address of + the current packet. +- Action `set_field:10.10.1.1->arp_spa` is to set ARP source protocol address. +- Action `IN_PORT` is to set output port, ensuring the packet is sent to where it was received. + +### Classifier + +This table is used to determine which "category" of traffic (tunnel, local gateway or local Pod, etc.) a packet belongs +to. This is done by matching on the ingress port of the packet. -The sole purpose of this table is to invoke the `ct` action on all packets and -set the `ct_zone` (connection tracking context) to a hard-coded value, then -forward traffic to [ConntrackStateTable]. If you dump the flows for this table, -you should only see 1 flow: +If you dump the flows for this table, you may see the following: ```text -1. 
table=30, priority=200,ip actions=ct(table=31,zone=65520)
```

-A `ct_zone` is simply used to isolate connection tracking rules. It is similar
-in spirit to the more generic Linux network namespaces, but `ct_zone` is
-specific to conntrack and has less overhead.
-
-After invoking the ct action, packets will be in the "tracked" (`trk`) state and
-all [connection tracking
-fields](https://www.openvswitch.org/support/dist-docs/ovs-fields.7.txt) will be
-set to the correct value. Packets will then move on to [ConntrackStateTable].
-
-Refer to [this
-document](https://docs.openvswitch.org/en/latest/tutorials/ovs-conntrack/) for
-more information on connection tracking in OVS.
-
-### ConntrackStateTable (31)
-
-This table handles "tracked" packets (packets which are moved to the tracked
-state by the previous table [ConntrackTable]) and "untracked" packets (packets
-is not in tracked state).
-
-This table serves the following purposes:
-
-* For tracked Service packets, bit 19 in NXM_NX_REG0 will be set to 0x1, then
-  the tracked packet will be forwarded to [EgressRuleTable] directly.
-* Drop packets reported as invalid by conntrack.
-* Non-Service tracked packets goes to [EgressRuleTable] directly.
-* Untracked packets goes to [SessionAffinityTable] and [ServiceLBTable].
-
-If you dump the flows for this table, you should see the following:
-
-```text
-1. table=31, priority=200,ct_state=-new+trk,ct_mark=0x21,ip actions=load:0x1->NXM_NX_REG0[19],resubmit(,50)
-2. table=31, priority=190,ct_state=+inv+trk,ip actions=drop
-3. table=31, priority=190,ct_state=-new+trk,ip actions=resubmit(,50)
-4. table=31, priority=0 actions=resubmit(,40),resubmit(,41)
-```
-
-Flow 1 is used to forward tracked Service packets to [EgressRuleTable] directly,
-without passing [SessionAffinityTable], [ServiceLBTable] and [EndpointDNATTable].
-The flow also sets bit 19 in NXM_NX_REG0 to 0x1, which indicates that the destination
-and source MACs of the matched packets should be rewritten in [l3ForwardingTable].
-
-Flow 2 is used to drop packets which is reported as invalid by conntrack.
-
-Flow 3 is used to forward tracked non-Service packets to [EgressRuleTable] directly,
-without passing [SessionAffinityTable], [ServiceLBTable] and [EndpointDNATTable].
-
-Flow 4 is used to match the first packet of untracked connection and forward it to
-[SessionAffinityTable] and [ServiceLBTable].

If you dump the flows for this table, you may see the following:

```text
+1. table=Classifier, priority=210,ip,in_port="antrea-gw0",nw_src=10.10.0.1 actions=set_field:0x2/0xf->reg0,goto_table:SpoofGuard
+2. table=Classifier, priority=200,in_port="antrea-gw0" actions=set_field:0x2/0xf->reg0,set_field:0x8000000/0x8000000->reg4,goto_table:SpoofGuard
+3. table=Classifier, priority=200,in_port="antrea-tun0" actions=set_field:0x1/0xf->reg0,set_field:0x200/0x200->reg0,goto_table:UnSNAT
+4. table=Classifier, priority=190,in_port="nginx-d9-cd1533" actions=set_field:0x3/0xf->reg0,goto_table:SpoofGuard
+5. table=Classifier, priority=190,in_port="nginx-d9-b93cc5" actions=set_field:0x3/0xf->reg0,goto_table:SpoofGuard
+6. table=Classifier, priority=0 actions=drop
```

- Flow 1 matches packets originating from the local Node through the Antrea gateway port.
  - Match conditions `ip` and `nw_src=10.10.0.1` together match packets originating from the local Node. Note that
    `nw_src=10.10.0.1` cannot be used on its own; it requires an IP family match condition such as `ip`.
  - Match condition `in_port="antrea-gw0"` matches packets received on the Antrea gateway port.
  - Action `set_field:0x2/0xf->reg0` loads `FromGatewayRegMark` to mark the packet source.
  - Action `goto_table:SpoofGuard` forwards packets to table [SpoofGuard] to validate their legitimacy.
- Flow 2 matches packets initiated from the external network through the Antrea gateway port. Since packets originating
  from the local Node through the gateway are covered by flow 1, flow 2 can only match packets initiated externally.
  - Match condition `in_port="antrea-gw0"` matches packets received on the Antrea gateway port.
  - Action `set_field:0x2/0xf->reg0` loads `FromGatewayRegMark` to mark the packet source.
  - Action `set_field:0x8000000/0x8000000->reg4` loads `FromExternalRegMark`, marking packets as coming from the
    external network rather than the local Node.
  - Action `goto_table:SpoofGuard` is the same as in flow 1.
- Flow 3 matches packets received through the overlay tunnel (i.e. from another Node).
  - Match condition `in_port="antrea-tun0"` matches packets received on the Antrea tunnel port.
  - Action `set_field:0x1/0xf->reg0` loads `FromTunnelRegMark` to mark the packet source.
  - Action `set_field:0x200/0x200->reg0` loads `RewriteMACRegMark`, which indicates that the destination and source
    MACs of the packets should be overwritten. This mark will be used in table [L3Forwarding].
  - Action `goto_table:UnSNAT` forwards packets to table [UnSNAT] rather than [SpoofGuard], without further verification.
    Packets arriving on the tunnel come from remote Nodes and may carry a variety of source IP addresses; they were
    already verified before being tunnelled, so they can be forwarded to table [UnSNAT] directly.
- Flows 4-5 match packets from local Pods.
  - Match condition `in_port=<pod-port>` matches packets received on a local Pod port.
  - Action `set_field:0x3/0xf->reg0` loads `FromLocalRegMark` to mark the packet source.
  - Action `goto_table:SpoofGuard` is the same as in flow 1.
- Flow 6 is the default drop flow, not normally used in practice.
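Since all of these flows classify on `in_port`, it can help to see how port names map to OpenFlow port numbers on the
bridge. A quick sketch, assuming the default `br-int`:

```bash
# Show the bridge's ports (antrea-gw0, antrea-tun0, and per-Pod ports),
# along with their OpenFlow port numbers and MAC addresses.
kubectl exec -n kube-system <antrea-agent-pod-name> -c antrea-ovs -- ovs-ofctl show br-int
```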
### SpoofGuard

This table drops IP [spoofing](https://en.wikipedia.org/wiki/Spoofing_attack) packets from local Pods. For each Pod
(as identified by the ingress port), we ensure that the source IP and MAC addresses are correct, i.e. that they match
the values configured on the interface when Antrea set up networking for the Pod.

If you dump the flows for this table, you may see the following:

```text
1. table=SpoofGuard, priority=200,ip,in_port="antrea-gw0" actions=goto_table:UnSNAT
2. table=SpoofGuard, priority=200,ip,in_port="nginx-d9-cd1533",dl_src=2e:ba:06:b2:44:91,nw_src=10.10.0.7 actions=goto_table:UnSNAT
3. table=SpoofGuard, priority=200,ip,in_port="nginx-d9-b93cc5",dl_src=c2:5a:5e:50:95:9b,nw_src=10.10.0.8 actions=goto_table:UnSNAT
4. table=SpoofGuard, priority=0 actions=drop
```

- Flow 1 matches packets from the local Antrea gateway without checking the source IP and MAC. There are several cases
  where the source IP of packets received through the gateway is not the gateway IP:
  - When Antrea is deployed with kube-proxy and AntreaProxy is not enabled, packets from local Pods destined to Services
    will first go through the gateway, get load-balanced by the kube-proxy datapath (DNAT), then be sent back through
    the gateway. This means that legitimate packets can be received on the gateway port with a source IP belonging to a
    local Pod.
  - When both AntreaProxy and proxyAll are enabled, packets from external sources destined to Services will be routed
    to OVS through the gateway without changing the source IP.
  - When Antrea is deployed with kube-proxy and AntreaProxy is enabled, packets from external sources destined to
    Services will get load-balanced by the kube-proxy datapath (DNAT), then be routed to OVS through the gateway
    without SNAT.
- Flows 2-3 match regular IP packets from local Pods.
  - Match condition `dl_src=<pod-mac>` matches packets whose source MAC address corresponds to the MAC address of a
    local Pod.
  - Match condition `nw_src=<pod-ip>` matches packets whose source IP address corresponds to the IP address of a local
    Pod.
  - The action is the same as in flow 1.
- Flow 4 is the default flow to drop IP spoofing packets.

### UnSNAT

This table invokes the `ct` action on reply packets of Service connections that have been committed in `SNATCtZone` or
`SNATCtZoneV6`. After the `ct` action is invoked, packets will be in the "tracked" state and all
[connection tracking fields](https://www.openvswitch.org/support/dist-docs/ovs-fields.7.txt) will be set to the correct
value.

A ct zone is simply used to isolate connection tracking rules. It is conceptually similar to the more generic Linux
network namespaces, but a ct zone is specific to conntrack and has less overhead. Please refer to
[this document](https://docs.openvswitch.org/en/latest/tutorials/ovs-conntrack/) for more information on connection
tracking in OVS.

-If `service.spec.sessionAffinity` of a Service is `None`, this table will set the value
-of bits [16..18] in NXM_NX_REG4 to 0b001, which indicates that the Service needs to do
-Endpoint selection. If you dump the flow, you should see the flow:

```text
table=40, priority=0 actions=load:0x1->NXM_NX_REG4[16..18]
```

-If `service.spec.sessionAffinity` of a Service is `ClientIP`, when a client accesses
-the Service for the first time, a learned flow with hard timeout which equals
-`service.spec.sessionAffinityConfig.clientIP.timeoutSeconds` of the Service will be
-generated in this table. This will be explained in detail in chapter [ServiceLBTable].

+If you dump the flows for this table, you may see the following:

```text
1. table=UnSNAT, priority=200,ip,nw_dst=169.254.0.253 actions=ct(table=ConntrackZone,zone=65521,nat)
2. table=UnSNAT, priority=200,ip,nw_dst=10.10.0.1 actions=ct(table=ConntrackZone,zone=65521,nat)
3. table=UnSNAT, priority=0 actions=goto_table:ConntrackZone
```

- Flows 1-2 match reply packets of SNAT'd Service connections.
  - Match condition `nw_dst=169.254.0.253` matches packets that were SNAT'd with the virtual IP.
  - Match condition `nw_dst=10.10.0.1` matches packets that were SNAT'd with the local Antrea gateway IP. This will
    also match packets from local Pods destined to the local Antrea gateway, but it will not cause any side effect
    since such connections are never committed in `SNATCtZone`.
  - Action `ct(table=ConntrackZone,zone=65521,nat)` invokes the `ct` action on matched packets. The packets are forked
    to table [ConntrackZone], and their "tracked" state in `SNATCtZone` is restored, including the original IP before
    SNAT.
- Flow 3 is the default flow, forwarding packets to table [ConntrackZone].
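If you want to see the SNAT'd connections that these flows un-translate, the entries committed in `SNATCtZone`
(zone 65521) can be listed directly. A sketch (output format varies across OVS versions):

```bash
# List conntrack entries committed in SNATCtZone (65521); each entry shows
# the original tuple and the NAT'd tuple that table UnSNAT reverses.
kubectl exec -n kube-system <antrea-agent-pod-name> -c antrea-ovs -- \
    ovs-appctl dpctl/dump-conntrack zone=65521
```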
### ConntrackZone

This table invokes the `ct` action on packets from all connections. After the `ct` action is invoked, packets will be
in the "tracked" state. It's worth noting that when the `ct` action is invoked with `CtZone` on packets that already
have a "tracked" state in `SNATCtZone`, that state becomes inaccessible in `CtZone`: the "tracked" state always belongs
to the current ct zone. As previously mentioned, a ct zone is similar in spirit to the more generic Linux network
namespaces; each ct zone has its own "tracked" state.
NodePort, LoadBalancer and ClusterIP -whose client is from K8s Node will be supported in the future. +This table is used to invoke the `ct` action on packets from all connections. After invoking `ct` action, packets will +be in the "tracked" state. It's worth noting that when upon invoking `ct` action with `CtZone` to packets that have +"tracked" state in `SNATCtZone`, the "tracked" state in `SNATCtZone` will be inaccessible in `CtZone`. This transition +occurs because the "tracked" state shifts to the current ct zone. As previously mentioned, a ct zone is similar in +spirit to the more generic Linux network namespaces, uniquely containing a "tracked" state within each ct zone. -When a ClusterIP Service is created with `service.spec.sessionAffinity` set to `None`, if you -dump the flows, you should see the following flow: +If you dump the flows for this table, you may see the following: ```text -1. table=41, priority=200,tcp,reg4=0x10000/0x70000,nw_dst=10.107.100.231,tp_dst=443 actions=load:0x2->NXM_NX_REG4[16..18],load:0x1->NXM_NX_REG0[19],group:5 +1. table=ConntrackZone, priority=200,ip actions=ct(table=ConntrackState,zone=65520,nat) +2. table=ConntrackZone, priority=0 actions=goto_table:ConntrackState ``` -Among the match conditions of the above flow: - -* `reg4=0x10000/0x70000`, value of bits [16..18] in NXM_NX_REG4 is 0b001, which is used - to match Service packet whose state is to do Endpoint selection. The value of - bits [16..18] in NXM_NX_REG4 is set in [SessionAffinityTable] by flow `table=40, priority=0 actions=load:0x1->NXM_NX_REG4[16..18]`. +- Flow 1 invokes `ct` action on packets from all connections. The packets will be forked to table [ConntrackStateTable] + and restored the "tracked" state in `CtZone`. For Service connections, the original IP before DNAT will be restored. +- Flow 2 is an auto-generated flow that should remain unused. -The actions of the above flow: +### ConntrackState -* `load:0x2->NXM_NX_REG4[16..18]` is used to set the value of bits [16..18] in NXM_NX_REG4 - to 0b002, which indicates that Endpoint selection "is performed". Note that, Endpoint - selection has not really been done yet - it will be done by group action. The current - action should have been done in target OVS group entry after Endpoint selection. However, - we set the bits here, for the purpose of supporting more Endpoints in an OVS group. - Please check PR [#2101](https://github.com/antrea-io/antrea/pull/2101) to learn more information. -* `load:0x1->NXM_NX_REG0[19]` is used to set the value of bit 19 in NXM_NX_REG0 to 0x1, - which means that the source and destination MACs need to be rewritten. -* `group:5` is used to set the target OVS group. Note that, the target group needs to be - created first before the flow is created. +This table handles "tracked" packets from connections tracked in `CtZone` or `CtZoneV6`. The packets which are moved to +the "tracked" state in the previous table [ConntrackTable]). -Dump the group entry with command `ovs-ofctl dump-groups br-int 5`, you should see the -following: +If you dump the flows for this table, you may see the following: ```text -group_id=5,type=select,\ -bucket=bucket_id:0,weight:100,actions=load:0xa0a0002->NXM_NX_REG3[],load:0x23c1->NXM_NX_REG4[0..15],resubmit(,42),\ -bucket=bucket_id:1,weight:100,actions=load:0xa0a0003->NXM_NX_REG3[],load:0x23c1->NXM_NX_REG4[0..15],resubmit(,42),\ -bucket=bucket_id:2,weight:100,actions=load:0xa0a0004->NXM_NX_REG3[],load:0x23c1->NXM_NX_REG4[0..15],resubmit(,42) +1. 
1. table=ConntrackState, priority=200,ct_state=+inv+trk,ip actions=drop
2. table=ConntrackState, priority=190,ct_state=-new+trk,ct_mark=0/0x10,ip actions=goto_table:AntreaPolicyEgressRule
3. table=ConntrackState, priority=190,ct_state=-new+trk,ct_mark=0x10/0x10,ip actions=set_field:0x200/0x200->reg0,goto_table:AntreaPolicyEgressRule
4. table=ConntrackState, priority=0 actions=goto_table:PreRoutingClassifier
```

- Flow 1 drops packets which are reported as invalid by conntrack.
  - Match condition `ct_state=+inv+trk` matches packets which are reported as invalid by conntrack.
  - Action `drop` drops the packets.
- Flow 2 matches packets of non-Service connections which are in the "tracked" state and committed to the connection
  tracking module.
  - Match condition `ct_state=-new+trk` matches packets which are tracked in `CtZone` but not new.
  - Match condition `ct_mark=0/0x10` matches `NotServiceCTMark`, indicating packets from non-Service connections.
  - Action `goto_table:AntreaPolicyEgressRule` forwards packets to table [AntreaPolicyEgressRule] for egress policy
    enforcement.
- Flow 3 matches packets of Service connections which are in the "tracked" state and committed to the connection
  tracking module.
  - Match condition `ct_state=-new+trk` is the same as in flow 2.
  - Match condition `ct_mark=0x10/0x10` matches `ServiceCTMark`, indicating packets from Service connections.
    `ServiceCTMark` is persisted when the corresponding connection is committed in `CtZone`, as in sample flows 2-3 in
    table [EndpointDNAT].
  - Action `set_field:0x200/0x200->reg0` loads `RewriteMACRegMark`, which indicates that the destination and source
    MACs of packets should be overwritten. This mark will be used in table [L3Forwarding].
  - Action `goto_table:AntreaPolicyEgressRule` forwards packets to table [AntreaPolicyEgressRule] for egress policy
    enforcement directly, skipping the tables for Service Endpoint selection.
- Flow 4 matches packets which are not matched by flows 1-3 and forwards them to table [PreRoutingClassifier].
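Flow 1 above is where conntrack-invalid packets get dropped; its packet counter is a quick way to spot such drops on a
live cluster. A sketch (omit `--no-stats` so `n_packets` is shown):

```bash
# Show the drop flow for invalid packets, including its n_packets counter.
kubectl exec -n kube-system <antrea-agent-pod-name> -c antrea-ovs -- \
    ovs-ofctl dump-flows br-int table=ConntrackState -O Openflow15 | grep '+inv'
```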
### SessionAffinity

This table is used to implement Service session affinity. The learned flows that cache the selected Endpoints are
installed here.

If you dump the flows for this table, you may see the following:

```text
1. table=SessionAffinity, hard_timeout=300, priority=200,tcp,nw_src=10.10.0.7,nw_dst=10.96.76.15,tp_dst=80 \
   actions=set_field:0x50/0xffff->reg4,set_field:0/0x4000000->reg4,set_field:0xa0a0007->reg3,set_field:0x20000/0x70000->reg4,set_field:0x200/0x200->reg0
2. table=SessionAffinity, priority=0 actions=set_field:0x10000/0x70000->reg4
```

- Flow 1 is a learned flow generated by flow 4 in table [ServiceLB]. It matches the first packet of subsequent
  connections (the first connection is used to do Endpoint selection and to trigger the learned flow) destined to a
  Service whose `service.spec.sessionAffinity` is set to `ClientIP`. When a client accesses the Service for the first
  time, this flow is installed with a hard timeout equal to `service.spec.sessionAffinityConfig.clientIP.timeoutSeconds`
  of the Service, as illustrated by the sample Service after this list.
  - Field `hard_timeout=300` is the hard timeout of the learned flow. After the hard timeout, the learned flow will be
    deleted.
  - Match condition `tcp` is to match TCP packets, generated by `eth_type=0x800` and `nw_proto=6` in table [ServiceLB],
    flow 4.
  - Match condition `nw_src=10.10.0.7` is to match packets with the source IP address of a client, generated by
    `NXM_OF_IP_SRC[]` in table [ServiceLB], flow 4.
  - Match condition `nw_dst=10.96.76.15` is to match packets with the destination IP address of a Service, generated by
    `NXM_OF_IP_DST[]` in table [ServiceLB], flow 4.
  - Match condition `tp_dst=80` is to match packets with the destination port of a Service, generated by
    `NXM_OF_TCP_DST[]` in table [ServiceLB], flow 4.
  - Action `set_field:0x50/0xffff->reg4` is to load the cached Endpoint port number `80` to `EndpointPortField`. It is
    generated by `load:NXM_NX_REG4[0..15]->NXM_NX_REG4[0..15]` in table [ServiceLB], flow 4.
  - Action `set_field:0/0x4000000->reg4` is to set bit 26 of `NXM_NX_REG4` to 0b0, which indicates that the cached
    Endpoint is on a remote Node. If the cached Endpoint were on the local Node, bit 26 of `NXM_NX_REG4` would be 0b1.
    It is generated by `load:NXM_NX_REG4[26]->NXM_NX_REG4[26]` in table [ServiceLB], flow 4.
  - Action `set_field:0xa0a0007->reg3` is to load the cached Endpoint IP to `EndpointIPField`. It is generated by
    `load:NXM_NX_REG3[]->NXM_NX_REG3[]` in table [ServiceLB], flow 4.
  - Action `set_field:0x20000/0x70000->reg4` is to load `EpSelectedRegMark`, which indicates that the packet has done
    Endpoint selection.
  - Action `set_field:0x200/0x200->reg0` is to load `RewriteMACRegMark`, which indicates that the destination and source
    MACs of packets should be overwritten.
- Flow 2 is to match the first packet of connections destined to Services.
  - Action `set_field:0x10000/0x70000->reg4` is to load `EpToSelectRegMark`, which indicates that the packet needs to do
    Endpoint selection.
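For reference, a Service that would result in the learned flow above could be defined as follows. This is a
hypothetical manifest (name and selector are made up for illustration); only the `sessionAffinity` fields are
significant here, with `timeoutSeconds` matching the `hard_timeout=300` of the learned flow:

```yaml
apiVersion: v1
kind: Service
metadata:
  name: sample-clientip-svc   # hypothetical name
spec:
  selector:
    app: sample               # hypothetical selector
  ports:
    - protocol: TCP
      port: 80
      targetPort: 80
  sessionAffinity: ClientIP
  sessionAffinityConfig:
    clientIP:
      timeoutSeconds: 300     # becomes hard_timeout=300 in the learned flow
```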
### ServiceLB

This table is used to implement Service Endpoint selection. By default, only ClusterIP Service requests from Pods are
load-balanced here. Requests for NodePort and LoadBalancer Services, as well as ClusterIP requests from non-Pod
clients, are supported when `proxyAll` is enabled.

If you dump the flows for this table, you may see the following:

```text
1. table=ServiceLB, priority=200,tcp,reg4=0x10000/0x70000,nw_dst=10.101.255.29,tp_dst=80 actions=set_field:0x200/0x200->reg0,set_field:0x20000/0x70000->reg4,set_field:0x9->reg7,group:9
2. table=ServiceLB, priority=200,tcp,reg4=0x10000/0x70000,nw_dst=10.105.31.235,tp_dst=80 actions=set_field:0x200/0x200->reg0,set_field:0x20000/0x70000->reg4,set_field:0xc->reg7,group:12
3. table=ServiceLB, priority=200,tcp,reg4=0x10000/0x70000,nw_dst=10.96.76.15,tp_dst=80 actions=set_field:0x200/0x200->reg0,set_field:0x30000/0x70000->reg4,set_field:0xa->reg7,group:10
4. table=ServiceLB, priority=190,tcp,reg4=0x30000/0x70000,nw_dst=10.96.76.15,tp_dst=80 actions=learn(table=SessionAffinity,hard_timeout=300,priority=200,delete_learned,cookie=0x203000000000a,\
   eth_type=0x800,nw_proto=6,NXM_OF_TCP_DST[],NXM_OF_IP_DST[],NXM_OF_IP_SRC[],load:NXM_NX_REG4[0..15]->NXM_NX_REG4[0..15],load:NXM_NX_REG4[26]->NXM_NX_REG4[26],load:NXM_NX_REG3[]->NXM_NX_REG3[],load:0x2->NXM_NX_REG4[16..18],load:0x1->NXM_NX_REG0[9]),\
   set_field:0x20000/0x70000->reg4,goto_table:EndpointDNAT
5. table=ServiceLB, priority=0 actions=goto_table:EndpointDNAT
```

- Flows 1-2 match the first packet of connections destined to Services whose `service.spec.sessionAffinity` is set to
  `None`.
  - Match condition `tcp` matches the protocol of the Service. This could also be `udp` or `sctp`.
  - Match condition `reg4=0x10000/0x70000` is to match packets with `EpToSelectRegMark` (loaded in table
    [SessionAffinity], flow 2), which indicates that the corresponding Services should do Endpoint selection.
  - Match condition `nw_dst=<ClusterIP>` is to match the ClusterIP of the Service.
  - Match condition `tp_dst=<port>` is to match the port of the Service.
  - Action `set_field:0x200/0x200->reg0` is to load `RewriteMACRegMark`, which indicates that the destination and source
    MACs of packets should be overwritten. This mark will be used in table [L3Forwarding].
  - Action `set_field:0x20000/0x70000->reg4` is to load `EpSelectedRegMark`, which indicates that Endpoint selection "is
    performed". This mark will be used in table [EndpointDNAT]. Note that Endpoint selection has not really been done
    yet - it will be done by the group action. The current action should have been done in the target OVS group entry
    after Endpoint selection. However, we set the bits here, for the purpose of supporting more Endpoints in an OVS
    group. Please check PR [#2101](https://github.com/antrea-io/antrea/pull/2101) to learn more information.
  - Action `set_field:0x9->reg7` loads the group ID to `ServiceGroupIDField`, which is used by the implementation
    of NetworkPolicy.
  - Action `group:9` sets the target OVS group. Note that the target group needs to be created before the flow is
    created.
- Flow 3 matches the first packet of connections destined to Services whose `service.spec.sessionAffinity` is set to
  `ClientIP`.
  - Action `set_field:0x30000/0x70000->reg4` is to load `EpToLearnRegMark`, which indicates that Endpoint selection
    "is performed" and the selection result needs to be cached.
  - Other match conditions and actions are the same as flows 1-2.
- Flow 4 matches the packet previously matched by flow 3 (after it was sent to the related OVS group to do Endpoint
  selection and resubmitted back to this table). This flow will generate a learned flow in table [SessionAffinity] to
  match the packets of subsequent connections from the same client IP, ensuring that the packets are forwarded to the
  Endpoint selected the first time.
  - Match condition `reg4=0x30000/0x70000` is to match packets with `EpToLearnRegMark` (loaded in table [ServiceLB],
    flow 3), which indicates that the corresponding Services should do Endpoint selection and cache the selection
    result.
  - Action `learn` is to generate a learned flow in table [SessionAffinity].
    - Field `table=SessionAffinity` is the table where the learned flow is installed.
    - Field `hard_timeout=300` is the hard timeout of the learned flow.
    - Field `priority=200` is the priority of the learned flow.
    - Field `delete_learned` means that the learned flow will be deleted after the hard timeout.
    - Field `cookie=0x203000000000a` is the cookie of the learned flow.
    - Field `eth_type=0x800` generates a match condition in the learned flow to match IPv4 packets.
    - Field `nw_proto=6` generates a match condition in the learned flow to match TCP packets.
    - Field `NXM_OF_TCP_DST[]` generates a match condition in the learned flow to match packets with the TCP destination
      port (Service port) of the current packet. In the learned flow, it could be like `tp_dst=80`. This field could
      also be `NXM_OF_UDP_DST[]` or `NXM_OF_SCTP_DST[]` if the protocol of the Service is UDP or SCTP.
    - Field `NXM_OF_IP_DST[]` generates a match condition in the learned flow to match packets with the destination IP
      (Service IP) of the current packet. In the learned flow, it could be like `nw_dst=10.96.76.15`.
    - Field `NXM_OF_IP_SRC[]` generates a match condition in the learned flow to match packets with the source IP
      (client IP) of the current packet. In the learned flow, it could be like `nw_src=10.10.0.7`.
    - Field `load:NXM_NX_REG4[0..15]->NXM_NX_REG4[0..15]` generates an action in the learned flow, which loads the value
      of `EndpointPortField` in the current flow to the same bits of `NXM_NX_REG4` in the learned flow. In the learned
      flow, it could be like `set_field:0x50/0xffff->reg4`. The generated action is used to cache the selected Endpoint
      port.
    - Field `load:NXM_NX_REG4[26]->NXM_NX_REG4[26]` generates an action in the learned flow, which loads bit 26 of
      `NXM_NX_REG4` in the current flow to the same bit of `NXM_NX_REG4` in the learned flow. In the learned flow, it
      could be like `set_field:0/0x4000000->reg4` or `set_field:0x4000000/0x4000000->reg4`. The generated action is
      used to indicate whether the selected Endpoint is on the local Node.
    - Field `load:NXM_NX_REG3[]->NXM_NX_REG3[]` generates an action in the learned flow, which loads `EndpointIPField`
      of the current packet to the same bits of `NXM_NX_REG3` in the learned flow. In the learned flow, it could be like
      `set_field:0xa0a0007->reg3`. The generated action is used to cache the selected Endpoint IP.
    - Field `load:0x2->NXM_NX_REG4[16..18]` generates an action in the learned flow, which loads `EpSelectedRegMark`. In
      the learned flow, it should be `set_field:0x20000/0x70000->reg4`. The generated action is used to indicate that
      Endpoint selection "is performed".
    - Field `load:0x1->NXM_NX_REG0[9]` generates an action in the learned flow, which loads `RewriteMACRegMark`. In the
      learned flow, it should be `set_field:0x200/0x200->reg0`. The generated action is used to indicate that the
      destination and source MACs of packets should be overwritten.
  - Action `set_field:0x20000/0x70000->reg4` is to load `EpSelectedRegMark`, which indicates that Endpoint selection
    "is performed". This mark will be used in table [EndpointDNAT].
  - Action `goto_table:EndpointDNAT` is to send the packet to table [EndpointDNAT] after generating the learned flow.
  - Other match conditions are the same as flows 1-2.
- Flow 5 is the default auto-generated flow.

The Endpoint selection is performed in OVS groups. If you dump the groups, you may see the following:

```text
9. group_id=9,type=select,\
   bucket=bucket_id:0,weight:100,actions=set_field:0xa0a0007->reg3,set_field:0x50/0xffff->reg4,resubmit(,EndpointDNAT),\
   bucket=bucket_id:1,weight:100,actions=set_field:0xa0a0008->reg3,set_field:0x50/0xffff->reg4,resubmit(,EndpointDNAT)
10. group_id=10,type=select,\
    bucket=bucket_id:0,weight:100,actions=set_field:0xa0a0008->reg3,set_field:0x50/0xffff->reg4,resubmit(,ServiceLB),\
    bucket=bucket_id:1,weight:100,actions=set_field:0xa0a0007->reg3,set_field:0x50/0xffff->reg4,resubmit(,ServiceLB)
12. group_id=12,type=select,\
    bucket=bucket_id:0,weight:100,actions=set_field:0x4000/0x4000->reg0,resubmit(,EndpointDNAT)
```

- Group 9 is the target of flow 1. It is used to select an Endpoint for a Service whose `service.spec.sessionAffinity`
  is set to `None`. There are two buckets in this group. Every bucket has the same chance to be selected since they
  have the same weight.
  - Action `set_field:0xa0a0007->reg3` is to load the Endpoint IPv4 address `10.10.0.7` to `EndpointIPField`.
  - Action `set_field:0x50/0xffff->reg4` is to load the Endpoint port number `80` to `EndpointPortField`.
  - Action `resubmit(,EndpointDNAT)` resubmits packets to table [EndpointDNAT].
- Group 10 is the target of flow 3. It is used to select an Endpoint for a Service whose `service.spec.sessionAffinity`
  is set to `ClientIP`, like group 9.
  - Action `set_field:0xa0a0008->reg3` is to load the Endpoint IPv4 address `10.10.0.8` to `EndpointIPField`.
  - Action `set_field:0x50/0xffff->reg4` is to load the Endpoint port number `80` to `EndpointPortField`.
  - Action `resubmit(,ServiceLB)` resubmits packets back to table [ServiceLB], where they will then be matched by
    flow 4.
- Group 12 is the target of flow 2. The group has only a single bucket.
  - Action `set_field:0x4000/0x4000->reg0` is to load `SvcNoEpRegMark`, which indicates that the Service has no
    Endpoint.
  - Action `resubmit(,EndpointDNAT)` resubmits packets to table [EndpointDNAT].
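The group entries above can be dumped with `ovs-ofctl` as well, as in the following sketch (run from the `antrea-ovs`
container; the trailing group ID is optional, and `10` refers to group 10 above):

```text
# Dump a single OVS group entry of the Antrea bridge.
ovs-ofctl dump-groups br-int 10
```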
### EndpointDNAT

The table implements DNAT for Service connections after Endpoint selection in table [ServiceLB].

If you dump the flows for this table, you may see the following:

```text
1. table=EndpointDNAT, priority=200,reg0=0x4000/0x4000 actions=controller(reason=no_match,id=62373,userdata=04)
2. table=EndpointDNAT, priority=200,tcp,reg3=0xa0a0007,reg4=0x20050/0x7ffff actions=ct(commit,table=AntreaPolicyEgressRule,zone=65520,nat(dst=10.10.0.7:80),exec(set_field:0x10/0x10->ct_mark,move:NXM_NX_REG0[0..3]->NXM_NX_CT_MARK[0..3]))
3. table=EndpointDNAT, priority=200,tcp,reg3=0xa0a0008,reg4=0x20050/0x7ffff actions=ct(commit,table=AntreaPolicyEgressRule,zone=65520,nat(dst=10.10.0.8:80),exec(set_field:0x10/0x10->ct_mark,move:NXM_NX_REG0[0..3]->NXM_NX_CT_MARK[0..3]))
4. table=EndpointDNAT, priority=190,reg4=0x20000/0x70000 actions=set_field:0x10000/0x70000->reg4,resubmit(,ServiceLB)
5. table=EndpointDNAT, priority=0 actions=goto_table:AntreaPolicyEgressRule
```

- Flow 1 matches the first packet of connections destined to any Service that has no Endpoint.
  - Match condition `reg0=0x4000/0x4000` is to match `SvcNoEpRegMark`, which indicates that the Service has no Endpoint.
    The mark is loaded in an OVS group, like group 12 mentioned above.
  - Action `controller(reason=no_match,id=62373,userdata=04)` forwards the packet to the Antrea Agent for further
    processing.
- Flows 2-3 match the first packet of connections destined to any Service that has selected the Endpoint whose IPv4
  address is stored in `EndpointIPField` and port number is stored in `EndpointPortField`.
  - Match condition `reg4=0x20050/0x7ffff` is a union match of `EndpointPortField` and `ServiceEPStateField`. The
    value of `ServiceEPStateField` is 0b010 (`EpSelectedRegMark`), which indicates that the Service has done Endpoint
    selection.
  - Action `ct` performs DNAT and sets some bits of the ct mark. After this action, a new packet, with the destination
    IP address and port replaced by the Endpoint's IP address and port, will be forked from the original packet to table
    [AntreaPolicyEgressRule].
    - Field `commit` means to commit the connection to the connection tracking module.
    - Field `table=AntreaPolicyEgressRule` is the table where the packet will be forked.
    - Field `zone=65520` is to commit the connection to `CtZone`.
    - Field `nat(dst=<Endpoint IP>:<Endpoint port>)` is to replace the destination IP and destination port (DNAT).
    - Field `exec` sets some bits of the ct mark.
      - Action `set_field:0x10/0x10->ct_mark` is to load `ServiceCTMark`, which indicates that the packet is from a
        Service connection.
      - Action `move:NXM_NX_REG0[0..3]->NXM_NX_CT_MARK[0..3]` is to load the value of `PktSourceField` to
        `ConnSourceCTMarkField`.
- Flow 4 forwards packets which are not matched by flows 1-3 back to table [ServiceLB] to select an Endpoint again,
  with `EpToSelectRegMark` reloaded.
- Flow 5 is the auto-generated flow to match packets of non-Service connections.
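After flows 2-3 commit a Service connection, the DNAT result is visible in the conntrack entry of the connection. A
hedged way to verify this, filtering the `CtZone` dump on the sample ClusterIP `10.96.76.15` used above:

```text
# The reply direction of matching entries should carry the selected Endpoint
# (e.g. 10.10.0.7:80) instead of the ClusterIP.
ovs-appctl dpctl/dump-conntrack zone=65520 | grep 10.96.76.15
```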
### AntreaPolicyEgressRule

For this table, you will need to keep in mind the ACNP [specification](#antrea-native-policies-implementation) that we
are using.

This table is used to implement the egress rules across all Antrea-native policies, except for policies that are created
in the Baseline Tier. Antrea-native policies created in the Baseline Tier will be enforced after K8s NetworkPolicies,
and their egress rules are installed in table [EgressDefault] and [EgressRule] respectively, i.e.

```text
Baseline Tier -> EgressDefaultTable(60)
K8s NetworkPolicy -> EgressRuleTable(50)
All other Tiers -> AntreaPolicyEgressRuleTable(45)
```

ACNP relies on the OVS built-in `conjunction` action to implement policies efficiently. Assume that there is an ACNP
residing in the Application tier. If you dump the flows for this table, you may see the following:

```text
1. table=AntreaPolicyEgressRule, priority=64990,ct_state=-new+est,ip actions=goto_table:EgressMetric
2. table=AntreaPolicyEgressRule, priority=14800,ip,nw_src=10.10.0.7 actions=conjunction(2,1/3)
3. table=AntreaPolicyEgressRule, priority=14800,ip,nw_src=10.10.0.8 actions=conjunction(2,1/3)
4. table=AntreaPolicyEgressRule, priority=14800,ip,nw_dst=10.10.0.1 actions=conjunction(2,2/3)
5. table=AntreaPolicyEgressRule, priority=14800,ip,nw_dst=10.10.0.2 actions=conjunction(2,2/3)
6. table=AntreaPolicyEgressRule, priority=14800,tcp,tp_dst=80 actions=conjunction(2,3/3)
7.1. table=AntreaPolicyEgressRule, priority=14800,conj_id=2,ip actions=set_field:0x2->reg5,ct(commit,table=EgressMetric,zone=65520,exec(set_field:0x200000000/0xffffffff00000000->ct_label))
7.2. table=AntreaPolicyEgressRule, priority=14800,conj_id=2 actions=set_field:0x2->reg3,set_field:0x400/0x400->reg0,goto_table:EgressMetric
7.3. table=AntreaPolicyEgressRule, priority=14800,conj_id=2 actions=set_field:0x2->reg3,set_field:0x400/0x400->reg0,set_field:0x4000000/0xfe000000->reg0,set_field:0xd/0xff->reg2,group:2
8. table=AntreaPolicyEgressRule, priority=0 actions=goto_table:EgressRule
```

- Flow 1 matches packets of connections whose state is `established` but not `new`, and forwards them to table
  [EgressMetric].
- Flows 2-3 match packets sourced from `10.10.0.7` and `10.10.0.8` respectively. Action `conjunction(2,1/3)` means that
  this is the first of the 3 conditions for `conj_id` 2.
- Flows 4-5 match packets destined to `10.10.0.1` and `10.10.0.2` respectively. Action `conjunction(2,2/3)` means that
  this is the second of the 3 conditions for `conj_id` 2.
- Flow 6 matches packets destined to TCP port `80`. Action `conjunction(2,3/3)` means that this is the third of the
  3 conditions for `conj_id` 2.
- Flow 7.1 matches packets which are matched by all 3 conditions of `conj_id` 2 when the action of the ACNP rule is
  `Allow`.
  - Action `set_field:0x2->reg5` is to load `conj_id` to `APConjIDField`, which is used by the Traceflow feature.
  - Action `ct` is to persist `conj_id` to the ct label in `CtZone`.
    - Field `commit` means to commit the connection to the connection tracking module.
    - Field `table=EgressMetric` is the table where the packet will be forked.
    - Field `zone=65520` is to commit the connection to `CtZone`.
    - Field `exec` sets some bits of the ct label.
      - Action `set_field:0x200000000/0xffffffff00000000->ct_label` is to load the current `conj_id` value to
        `EgressRuleCTLabel` for egress metrics collection purposes.
- Flow 7.2 matches packets which are matched by all 3 conditions of `conj_id` 2 when the action of the ACNP rule is
  `Drop`.
  - Action `set_field:0x2->reg3` is to load `conj_id` to `APConjIDField`, which is used by the Traceflow feature.
  - Action `set_field:0x400/0x400->reg0` is to load `APDenyRegMark`, indicating that the packet is denied (Drop /
    Reject) by an Antrea-native policy, so that the K8s default drop will not be recorded in this reg.
- Flow 7.3 matches packets which are matched by all 3 conditions of `conj_id` 2 when the action of the ACNP rule is
  `Reject`.
  - Action `set_field:0x2->reg3` is the same as flow 7.2.
  - Action `set_field:0x400/0x400->reg0` is the same as flow 7.2.
  - Action `set_field:0x4000000/0xfe000000->reg0` is to load value 0b1 to `PacketInOperationField`.
  - Action `set_field:0xd/0xff->reg2` is to load value 0xd (the current table ID) to `PacketInReasonField`.
  - Action `group:2` is to send the packet to group 2.
- Flow 8 is the default flow to forward packets not matched by the flows above to table [EgressRule].

Unlike the default rules of K8s NetworkPolicies, Antrea-native policies have no such default rules. Hence, they are
evaluated as-is, and there is no need for a table [AntreaPolicyEgressDefault].
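A hypothetical ACNP that could produce the conjunctive flows above is sketched below (the name, labels and exact API
version are illustrative assumptions; only the tier, the applied-to source Pods, the destination IPBlocks and the TCP
port matter for the flows shown):

```yaml
apiVersion: crd.antrea.io/v1beta1
kind: ClusterNetworkPolicy
metadata:
  name: acnp-egress-sample        # hypothetical name
spec:
  priority: 5
  tier: application
  appliedTo:
    - podSelector:
        matchLabels:
          app: client             # Pods 10.10.0.7 / 10.10.0.8 in this example
  egress:
    - action: Allow               # Drop or Reject would install flow 7.2 / 7.3 instead
      to:
        - ipBlock:
            cidr: 10.10.0.1/32
        - ipBlock:
            cidr: 10.10.0.2/32
      ports:
        - protocol: TCP
          port: 80
```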
### EgressRule

For this table, you will need to keep in mind the Network Policy [specification](#network-policy-implementation) that
we are using. We have 2 Pods running on the same Node, with IP addresses 10.10.1.2 and 10.10.1.3. They are allowed to
talk to each other using TCP on port 80, but nothing else.

This table is used to implement the egress rules across all Network Policies.
If you dump the flows for this table, you should see something like this:

```text
1. table=EgressRule, priority=210,ct_state=-new+est,ip actions=goto_table:EgressMetric
2. table=EgressRule, priority=200,ip,nw_src=10.10.1.2 actions=conjunction(2,1/3)
3. table=EgressRule, priority=200,ip,nw_src=10.10.1.3 actions=conjunction(2,1/3)
4. table=EgressRule, priority=200,ip,nw_dst=10.10.1.2 actions=conjunction(2,2/3)
5. table=EgressRule, priority=200,ip,nw_dst=10.10.1.3 actions=conjunction(2,2/3)
6. table=EgressRule, priority=200,tcp,tp_dst=80 actions=conjunction(2,3/3)
7. table=EgressRule, priority=190,conj_id=2,ip actions=set_field:0x2->reg5,ct(commit,table=EgressMetric,zone=65520,exec(set_field:0x200000000/0xffffffff00000000->ct_label))
8. table=EgressRule, priority=0 actions=resubmit(,EgressDefaultRule)
```

Notice how we use the OVS built-in `conjunction` action to implement policies efficiently.

Therefore we cannot use the port as the match condition to identify if the Pod has been applied a Network Policy -
which is what we do for the [IngressRuleTable] -, but instead have to use the source IP address.

### EgressDefault

This table complements [EgressRuleTable] for Network Policy egress rule implementation. In K8s, when a Network Policy
is applied to a set of Pods, the default behavior for these Pods becomes "deny" (they become isolated Pods). This table
is in charge of dropping traffic originating from Pods to which a Network Policy (with an egress rule) is applied, and
which did not match any of the allow rules. Accordingly, based on our Network Policy example, we would expect to see
flows to drop traffic originating from our 2 Pods (10.10.1.2 and 10.10.1.3), which is confirmed by dumping the flows:

```text
1. table=EgressDefaultRule, priority=200,ip,nw_src=10.10.1.2 actions=drop
2. table=EgressDefaultRule, priority=200,ip,nw_src=10.10.1.3 actions=drop
3. table=EgressDefaultRule, priority=0 actions=resubmit(,EgressMetric)
```

This table is also used to implement Antrea-native policy egress rules that are created in the Baseline Tier. Since the
Baseline Tier is meant to be enforced after K8s NetworkPolicies, the corresponding flows are installed at a lower
priority than K8s default drop flows. For example, a baseline rule to drop egress traffic to 10.0.10.0/24 for a
Namespace will look like the following:

```text
1. table=EgressDefaultRule, priority=80,ip,nw_src=10.10.1.11 actions=conjunction(5,1/2)
2. table=EgressDefaultRule, priority=80,ip,nw_src=10.10.1.10 actions=conjunction(5,1/2)
3. table=EgressDefaultRule, priority=80,ip,nw_dst=10.0.10.0/24 actions=conjunction(5,2/2)
4. table=EgressDefaultRule, priority=80,conj_id=5,ip actions=set_field:0x5->reg3,set_field:0x400/0x400->reg0,goto_table:EgressMetric
```

The table-miss flow entry, which is used for non-isolated Pods, forwards traffic to the next table [EgressMetric], then
[L3Forwarding].
### EgressMetric

This table is used to collect the egress metrics of K8s NetworkPolicies and Antrea-native policies.

### L3Forwarding

This is the L3 routing table.

If you dump the flows for this table, you may see the following:

```text
1. table=L3Forwarding, priority=210,ip,nw_dst=10.10.0.1 actions=set_field:ba:5e:d1:55:aa:c0->eth_dst,set_field:0x20/0xf0->reg0,goto_table:L3DecTTL
2. table=L3Forwarding, priority=210,ct_state=+rpl+trk,ct_mark=0x2/0xf,ip actions=set_field:ba:5e:d1:55:aa:c0->eth_dst,set_field:0x20/0xf0->reg0,goto_table:L3DecTTL
3. table=L3Forwarding, priority=200,ip,reg0=0/0x200,nw_dst=10.10.0.0/24 actions=goto_table:L2ForwardingCalc
4. table=L3Forwarding, priority=200,ip,nw_dst=10.10.1.0/24 actions=set_field:ba:5e:d1:55:aa:c0->eth_src,set_field:aa:bb:cc:dd:ee:ff->eth_dst,set_field:192.168.77.103->tun_dst,set_field:0x10/0xf0->reg0,goto_table:L3DecTTL
5. table=L3Forwarding, priority=200,ip,reg0=0x200/0x200,nw_dst=10.10.0.7 actions=set_field:ba:5e:d1:55:aa:c0->eth_src,set_field:2e:ba:06:b2:44:91->eth_dst,goto_table:L3DecTTL
6. table=L3Forwarding, priority=200,ip,reg0=0x200/0x200,nw_dst=10.10.0.8 actions=set_field:ba:5e:d1:55:aa:c0->eth_src,set_field:c2:5a:5e:50:95:9b->eth_dst,goto_table:L3DecTTL
7. table=L3Forwarding, priority=190,ct_mark=0x10/0x10,reg0=0x202/0x20f actions=set_field:ba:5e:d1:55:aa:c0->eth_dst,set_field:0x20/0xf0->reg0,goto_table:L3DecTTL
8. table=L3Forwarding, priority=190,ct_state=-rpl+trk,ip,reg0=0x3/0xf,reg4=0/0x100000 actions=goto_table:EgressMark
9. table=L3Forwarding, priority=190,ct_state=-rpl+trk,ip,reg0=0x1/0xf actions=set_field:ba:5e:d1:55:aa:c0->eth_dst,goto_table:EgressMark
10. table=L3Forwarding, priority=0 actions=set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc
```

- Flow 1 matches packets destined to the local Antrea gateway IP.
  - Match condition `nw_dst=10.10.0.1` is to match packets destined to the local gateway IP.
  - Action `set_field:ba:5e:d1:55:aa:c0->eth_dst` is to rewrite the destination MAC address of packets to the local
    gateway MAC address. Note that this action is not necessary for Pod-to-gateway request packets, because the
    destination MAC address is already the local gateway MAC address. However, it is used by some feature gates which
    are not enabled by default.
  - Action `set_field:0x20/0xf0->reg0` is to load `ToGatewayRegMark`, which indicates that the output port is the local
    Antrea gateway.
  - Action `goto_table:L3DecTTL` is to forward packets to table [L3DecTTL] to decrease the TTL value.
- Flow 2 matches reply packets of connections initiated through the local Antrea gateway, i.e. for which the first
  packet of the connection (SYN packet for TCP) was received through the gateway, to ensure that reply packets can be
  forwarded back to the local gateway, guaranteeing the availability of the connection. This is required to handle the
  following cases:
  - Reply traffic for connections from a local Pod to a ClusterIP Service, which are handled by kube-proxy and go
    through DNAT. In this case the destination IP address of the reply traffic is the Pod which initiated the connection
    to the Service (no SNAT by kube-proxy). We need to make sure that these packets are sent back through the gateway so
    that the source IP can be rewritten to the ClusterIP ("undo" DNAT). If we do not use connection tracking and do not
    rewrite the destination MAC, reply traffic from the backend will go directly to the originating Pod without going
    first through the gateway and kube-proxy. This means that the reply traffic will arrive at the originating Pod with
    the incorrect source IP (it will be set to the backend's IP instead of the Service IP).
  - When hair-pinning is involved, i.e. connections between 2 local Pods, for which NAT is performed. One example is a
    Pod accessing a NodePort Service for which `externalTrafficPolicy` is set to `Local` using the local Node's IP
    address, as there will be no SNAT for such traffic. Another example could be `hostPort` support, depending on how
    the feature is implemented.

  For the match conditions and actions:

  - Match condition `ct_state=+rpl+trk` is to match reply "tracked" packets.
  - Match condition `ct_mark=0x2/0xf` is to match `FromGatewayCTMark`, indicating packets from connections originated
    through the local Antrea gateway port.
  - Actions `set_field:ba:5e:d1:55:aa:c0->eth_dst`, `set_field:0x20/0xf0->reg0` and `goto_table:L3DecTTL` are the same
    as flow 1.
- Flow 3 matches packets of intra-Node connections (not including Service connections).
  - Match condition `reg0=0/0x200` is to match `NotRewriteMACRegMark`, indicating that the destination and source MACs
    of packets should not be overwritten. For Service or inter-Node connections, `RewriteMACRegMark` is loaded, like
    flows 4-6.
  - Match condition `nw_dst=10.10.0.0/24` is to match packets whose destination IP is in the local Pod CIDR.
  - Action `goto_table:L2ForwardingCalc` is to forward packets to table [L2ForwardingCalc], rather than table [L3DecTTL],
    since there is no need to decrease the TTL value for traffic between local Pods.
- Flow 4 matches packets destined to a remote Pod CIDR. This means that we install one flow for each peer Node, each one
  matching the destination IP address of the packet against the Pod subnet for the Node.
  - Match condition `nw_dst=10.10.1.0/24` is to match packets destined to the remote Pod CIDR.
  - Action `set_field:ba:5e:d1:55:aa:c0->eth_src` is to rewrite the source MAC address of packets to the local gateway
    MAC address.
  - Action `set_field:aa:bb:cc:dd:ee:ff->eth_dst` is to rewrite the destination MAC address of packets to the Global
    Virtual MAC address.
  - Action `set_field:192.168.77.103->tun_dst` is to set the destination IP of the tunnel to the remote Node (i.e. the
    IP address of the remote gateway).
  - Action `set_field:0x10/0xf0->reg0` is to load `ToRemoteRegMark`, which indicates that the output port is the tunnel.
- Flows 5-6 match packets destined to local Pods. The packets could be from Service or inter-Node connections.
  - Match condition `reg0=0x200/0x200` is to match `RewriteMACRegMark`, indicating that the destination and source MACs
    of packets should be overwritten.
  - Match condition `nw_dst=<Pod IP>` is to match packets whose destination IP is the IP address of a local Pod.
  - Action `set_field:ba:5e:d1:55:aa:c0->eth_src` is to rewrite the source MAC address of packets to the local gateway
    MAC address.
  - Action `set_field:<Pod MAC>->eth_dst` is to rewrite the destination MAC address of packets to the local Pod
    MAC address.
  - Action `goto_table:L3DecTTL` is the same as flow 1.
- Flow 7 is to match packets of Service connections originated from the local Antrea gateway and destined to the
  external network.
  - Match condition `ct_mark=0x10/0x10` is to match `ServiceCTMark`, indicating packets from Service connections.
  - Match condition `reg0=0x202/0x20f` is to match `RewriteMACRegMark` and `FromGatewayRegMark`, indicating packets from
    Service connections originated from the local Antrea gateway.
  - Action `set_field:ba:5e:d1:55:aa:c0->eth_dst` is to rewrite the destination MAC address of packets to the local
    gateway MAC address.
  - Action `set_field:0x20/0xf0->reg0` is to load `ToGatewayRegMark`, which indicates that the output port is the local
    Antrea gateway.
  - Action `goto_table:L3DecTTL` is the same as flow 1.
- Flow 8 is to match packets that are from local Pods and destined to the external network, and send them to table
  [EgressMark], where SNAT IPs are looked up for the packets.
  - Match condition `ct_state=-rpl+trk` is to match request "tracked" packets.
  - Match condition `reg0=0x3/0xf` is to match `FromLocalRegMark`, indicating packets from connections originated from
    local Pods.
  - Match condition `reg4=0/0x100000` is to match `NotAntreaFlexibleIPAMRegMark`, since Egress can only be applied to
    non-AntreaIPAM Pods. For packets from AntreaIPAM Pods, `AntreaFlexibleIPAMRegMark` is loaded.
  - Action `goto_table:EgressMark` is to forward packets to table [EgressMark] to look up SNAT IPs.
- Flow 9 is to match packets that are from remote Pods and destined to the external network, which need to be SNAT'd
  for the Egress feature, and send them to table [EgressMark], where SNAT IPs are looked up for the packets.
  - Match condition `ct_state=-rpl+trk` is the same as flow 8.
  - Match condition `reg0=0x1/0xf` is to match `FromTunnelRegMark`, indicating packets from remote Pods through the
    tunnel.
  - Action `set_field:ba:5e:d1:55:aa:c0->eth_dst` is to rewrite the destination MAC address of packets to the local
    gateway MAC address. This is needed because the packets are from remote Pods through the tunnel, so their
    destination MAC is the Global Virtual MAC address.
  - Action `goto_table:EgressMark` is the same as flow 8.
- Flow 10 is the table-miss flow entry, matching packets from local Pods destined to the external network, among others.
  - Action `set_field:0x20/0xf0->reg0` is the same as flow 1 or 2.

### EgressMark

This table is created only when the Egress feature is enabled. It includes flows to implement Egresses and select the
right SNAT IPs for egress traffic from Pods to the external network.

If you dump the flows for this table, you may see the following:

```text
1. table=EgressMark, priority=210,ip,nw_dst=192.168.77.102 actions=set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc
2. table=EgressMark, priority=210,ip,nw_dst=192.168.77.103 actions=set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc
3. table=EgressMark, priority=210,ip,nw_dst=10.96.0.0/12 actions=set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc
4. table=EgressMark, priority=200,ip,in_port="nginx-d9-dfc134" actions=set_field:ba:5e:d1:55:aa:c0->eth_src,set_field:aa:bb:cc:dd:ee:ff->eth_dst,set_field:192.168.77.103->tun_dst,set_field:0x10/0xf0->reg0,set_field:0x80000/0x80000->reg0,goto_table:L2ForwardingCalc
5. table=EgressMark, priority=200,ct_state=+new+trk,ip,tun_dst=192.168.77.102 actions=set_field:0x1/0xff->pkt_mark,set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc
6. table=EgressMark, priority=200,ct_state=+new+trk,ip,in_port="nginx-d9-b93cc5" actions=set_field:0x1/0xff->pkt_mark,set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc
7. table=EgressMark, priority=190,ct_state=+new+trk,ip,reg0=0x1/0xf actions=drop
8. table=EgressMark, priority=0 actions=set_field:0x20/0xf0->reg0,goto_table:L2ForwardingCalc
```

- Flows 1-2 match packets from local Pods destined to the transport IPs of remote Nodes, to skip Egress SNAT.
  - Match condition `nw_dst=<remote Node IP>` is to match packets destined to the transport IP of a remote Node.
  - Action `set_field:0x20/0xf0->reg0` is to load `ToGatewayRegMark`, which indicates that the output port is the local
    Antrea gateway.
  - Action `goto_table:L2ForwardingCalc` is to forward packets to table [L2ForwardingCalc].
- Flow 3 matches packets from local Pods destined to Services, to skip Egress SNAT.
  - Match condition `nw_dst=10.96.0.0/12` is to match packets destined to the Service CIDR.
  - Actions `set_field:0x20/0xf0->reg0` and `goto_table:L2ForwardingCalc` are the same as flows 1-2.
- Flow 4 matches packets from a local Pod selected by an Egress whose SNAT IP is configured on a remote Node.
  - Match condition `in_port="nginx-d9-dfc134"` is to match packets from the local Pod.
  - Action `set_field:ba:5e:d1:55:aa:c0->eth_src` is to rewrite the source MAC address of packets to the local gateway
    MAC address.
  - Action `set_field:aa:bb:cc:dd:ee:ff->eth_dst` is to rewrite the destination MAC address of packets to the Global
    Virtual MAC address.
  - Action `set_field:192.168.77.103->tun_dst` is to set the destination IP of the tunnel to the transport IP of the
    remote Node where the SNAT IP is configured.
  - Action `set_field:0x10/0xf0->reg0` is to load `ToRemoteRegMark`, which indicates that the output port is the tunnel.
  - Action `set_field:0x80000/0x80000->reg0` is to load `EgressSNATRegMark`, which indicates that packets should be
    SNAT'd on a remote Node.
- Flow 5 matches packets from remote Pods selected by an Egress whose SNAT IP is configured on the local Node, and sets
  the 8-bit ID allocated for the SNAT IP to pkt_mark.
  - Match condition `ct_state=+new+trk` is to match the first "tracked" packet of a connection.
  - Match condition `tun_dst=<local Node IP>` is to match packets destined to the transport IP of the local Node,
    ensuring that the packets are tunnelled from remote Nodes.
  - Action `set_field:0x1/0xff->pkt_mark` is to set the 8-bit ID allocated for the SNAT IP to pkt_mark. The ID is for
    iptables SNAT rules to match the packets and perform SNAT with the right SNAT IP (Antrea Agent adds an iptables
    SNAT rule for each local SNAT IP that matches the ID); a sketch of such a rule is shown below.
  - Actions `set_field:0x20/0xf0->reg0` and `goto_table:L2ForwardingCalc` are the same as flows 1-2.
- Flow 6 matches packets from a local Pod selected by an Egress whose SNAT IP is configured on the local Node.
  - Match condition `ct_state=+new+trk` is the same as flow 5.
  - Match condition `in_port="nginx-d9-b93cc5"` is to match packets from the local Pod.
  - Actions `set_field:0x1/0xff->pkt_mark`, `set_field:0x20/0xf0->reg0` and `goto_table:L2ForwardingCalc` are the same
    as flow 5.
- Flow 7 is to drop egress traffic tunnelled from remote Nodes that does not match any SNAT IP configured on the local
  Node.
  - Match condition `ct_state=+new+trk` is the same as flow 5.
  - Match condition `reg0=0x1/0xf` is to match `FromTunnelRegMark`, indicating packets from remote Pods through the
    tunnel.
- Flow 8 is to match "tracked" packets (i.e. packets that are not the first packet of a connection) of Egress
  connections, and forward them to table [L2ForwardingCalc].
  - Actions `set_field:0x20/0xf0->reg0` and `goto_table:L2ForwardingCalc` are the same as flows 1-2.

Note that, when no Egress applies to Pods on the Node and no SNAT IP is configured on the Node, [EgressMark] just has
flows 1-3 and 7-8; when there is an Egress applied to a Pod on the local Node and the SNAT IP of the Egress is
configured on a remote Node, flow 4 will be added; when there is an Egress applied to a Pod on a remote Node and the
SNAT IP of the Egress is configured on the local Node, flow 5 will be added; when there is an Egress applied to a Pod
on the local Node and the SNAT IP of the Egress is configured on the local Node, flow 6 will be added.
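To make the flow variants above concrete, here is a hypothetical Egress (the API version, name, labels and IP are
illustrative assumptions): if `egressIP` is assigned to the local Node, flows like 5-6 are installed together with an
iptables SNAT rule matching the allocated pkt_mark ID; if it is assigned to a remote Node, a flow like 4 tunnels the
traffic there instead.

```yaml
apiVersion: crd.antrea.io/v1beta1
kind: Egress
metadata:
  name: egress-sample            # hypothetical name
spec:
  appliedTo:
    podSelector:
      matchLabels:
        app: nginx-d9            # hypothetical labels for the Pods above
  egressIP: 192.168.77.200       # hypothetical SNAT IP
```

The corresponding iptables rule programmed by the Antrea Agent for a local SNAT IP could look roughly like the
following sketch (the actual chain name and rule layout are implementation details that may differ across versions):

```text
# Packets carrying the 8-bit pkt_mark ID 0x1 are SNAT'd with the Egress IP.
iptables -t nat -A ANTREA-POSTROUTING -m mark --mark 0x1/0xff \
    -j SNAT --to-source 192.168.77.200
```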
### L3DecTTL

This is the table to decrement the TTL of IP packets.

If you dump the flows for this table, you may see the following:

```text
1. table=L3DecTTL, priority=210,ip,reg0=0x2/0xf actions=goto_table:SNATMark
2. table=L3DecTTL, priority=200,ip actions=dec_ttl,goto_table:SNATMark
3. table=L3DecTTL, priority=0 actions=goto_table:SNATMark
```

- Flow 1 matches packets which enter the OVS pipeline from the local Antrea gateway. As the host IP stack should have
  already decremented the TTL for such packets, the TTL should not be decremented again.
  - Match condition `reg0=0x2/0xf` is to match `FromGatewayRegMark`, indicating packets from the local Antrea gateway.
- Flow 2 is to decrement the TTL of packets which are not matched by flow 1.
- Flow 3 is an auto-generated flow that should remain unused.

### SNATMark

This table marks connections requiring SNAT within the OVS pipeline, distinct from Egress SNAT handled by iptables.

If you dump the flows for this table, you may see the following:

```text
1. table=SNATMark, priority=200,ct_state=+new+trk,ip,reg0=0x22/0xff actions=ct(commit,table=SNAT,zone=65520,exec(set_field:0x20/0x20->ct_mark,set_field:0x40/0x40->ct_mark))
2. table=SNATMark, priority=200,ct_state=+new+trk,ip,reg0=0x12/0xff,reg4=0x200000/0x2200000 actions=ct(commit,table=SNAT,zone=65520,exec(set_field:0x20/0x20->ct_mark))
3. table=SNATMark, priority=190,ct_state=+new+trk,ip,nw_src=10.10.0.7,nw_dst=10.10.0.7 actions=ct(commit,table=SNAT,zone=65520,exec(set_field:0x20/0x20->ct_mark,set_field:0x40/0x40->ct_mark))
4. table=SNATMark, priority=190,ct_state=+new+trk,ip,nw_src=10.10.0.8,nw_dst=10.10.0.8 actions=ct(commit,table=SNAT,zone=65520,exec(set_field:0x20/0x20->ct_mark,set_field:0x40/0x40->ct_mark))
5. table=SNATMark, priority=0 actions=goto_table:SNAT
```

- Flow 1 matches packets whose source and destination are both the local Antrea gateway port. Such hair-pin connections
  should be SNAT'd with the virtual Service IP.
  - Match condition `ct_state=+new+trk` is to match the first packet tracked in `CtZone`.
  - Match condition `reg0=0x22/0xff` is to match `FromGatewayRegMark` and `ToGatewayRegMark`, indicating packets from
    the local Antrea gateway port and also destined to it.
  - Action `ct` is applied to matched packets with the commit parameter in `CtZone` to persist some ct marks.
    - Field `commit` means to commit the connection to the connection tracking module. Note that a packet can be
      committed in the same ct zone multiple times. For Service connections, the first `commit` is performed in table
      [EndpointDNAT].
    - Field `table=SNAT` is the table where packets will be forked.
    - Field `zone=65520` is to commit the connection to `CtZone`.
    - Field `exec` is to persist some ct marks.
      - Action `set_field:0x20/0x20->ct_mark` is to load `ConnSNATCTMark`, indicating that the connection requires SNAT.
      - Action `set_field:0x40/0x40->ct_mark` is to load `HairpinCTMark`, indicating that this is a hair-pin connection.
- Flow 2 matches packets whose source is the local Antrea gateway port and whose destination is a remote Pod. Such
  connections should be SNAT'd with the IP address of the local Antrea gateway.
  - Match condition `ct_state=+new+trk` is the same as flow 1.
  - Match condition `reg0=0x12/0xff` is to match `FromGatewayRegMark` and `ToTunnelRegMark`, indicating packets from the
    local Antrea gateway port and destined to a remote Pod through the tunnel.
  - Match condition `reg4=0x200000/0x2200000` is to match `ToExternalAddressRegMark` and `NotDSRServiceRegMark`,
    indicating packets destined to a Service's external IP, like a NodePort, LoadBalancer IP or ExternalIP, but not in
    DSR mode.
  - Action `ct` is the same as flow 1 except that `HairpinCTMark` is not loaded, since this is not a hair-pin connection.
- Flows 3-4 match packets whose source and destination are the same local Pod. Such hair-pin connections should be
  SNAT'd with the IP address of the local Antrea gateway.
  - Match condition `ct_state=+new+trk` is the same as flow 1.
  - Match conditions `nw_src=<Pod IP>` and `nw_dst=<Pod IP>` are to match packets whose source and destination are both
    the IP address of the same local Pod.
  - Action `ct` is the same as flow 1.
- Flow 5 is the auto-generated flow.

### SNAT

This table performs SNAT for connections requiring SNAT within the OVS pipeline.

If you dump the flows for this table, you should see the following:

```text
1. table=SNAT, priority=200,ct_state=+new+trk,ct_mark=0x40/0x40,ip,reg0=0x2/0xf actions=ct(commit,table=L2ForwardingCalc,zone=65521,nat(src=169.254.0.253),exec(set_field:0x10/0x10->ct_mark,set_field:0x40/0x40->ct_mark))
2. table=SNAT, priority=200,ct_state=+new+trk,ct_mark=0x40/0x40,ip,reg0=0x3/0xf actions=ct(commit,table=L2ForwardingCalc,zone=65521,nat(src=10.10.0.1),exec(set_field:0x10/0x10->ct_mark,set_field:0x40/0x40->ct_mark))
3. table=SNAT, priority=200,ct_state=-new-rpl+trk,ct_mark=0x20/0x20,ip actions=ct(table=L2ForwardingCalc,zone=65521,nat)
4. table=SNAT, priority=190,ct_state=+new+trk,ct_mark=0x20/0x20,ip,reg0=0x2/0xf actions=ct(commit,table=L2ForwardingCalc,zone=65521,nat(src=10.10.0.1),exec(set_field:0x10/0x10->ct_mark))
5. table=SNAT, priority=0 actions=goto_table:L2ForwardingCalc
```

- Flow 1 is to match packets from hair-pin connections initiated through the local Antrea gateway port. Such
  connections should be SNAT'd with the virtual Service IP.
  - Match condition `ct_state=+new+trk` is to match the first packet of connections tracked in `CtZone`.
  - Match condition `ct_mark=0x40/0x40` is to match `HairpinCTMark` in `CtZone`, indicating that this is a hair-pin
    connection.
  - Match condition `reg0=0x2/0xf` is to match `FromGatewayRegMark`, indicating packets from connections initiated
    through the local Antrea gateway port.
  - Action `ct` is applied to matched packets with the commit parameter to perform SNAT and persist some ct marks in
    `SNATCtZone`.
    - Field `commit` means to commit the connection to the connection tracking module.
    - Field `table=L2ForwardingCalc` is the table where packets will be forked.
    - Field `zone=65521` is to commit the connection to `SNATCtZone`.
    - Field `nat(src=169.254.0.253)` is to perform SNAT with the virtual Service IP `169.254.0.253`.
    - Field `exec` is to persist some ct marks in `SNATCtZone`.
      - Action `set_field:0x10/0x10->ct_mark` is to load `ServiceCTMark` in `SNATCtZone`, indicating this is a Service
        connection.
      - Action `set_field:0x40/0x40->ct_mark` is to load `HairpinCTMark` in `SNATCtZone`, indicating this is a hair-pin
        connection.
- Flow 2 is to match packets from hair-pin connections initiated through a local Pod. Such connections should be SNAT'd
  with the IP address of the local Antrea gateway.
  - Match conditions `ct_state=+new+trk` and `ct_mark=0x40/0x40` are the same as flow 1.
  - Match condition `reg0=0x3/0xf` is to match `FromLocalRegMark`, indicating packets from connections initiated
    through a local Pod.
  - Action `ct` is the same as flow 1 except that `nat(src=10.10.0.1)` is used instead of `nat(src=169.254.0.253)`,
    since the connection should be SNAT'd with the IP address of the local Antrea gateway.
- Flow 3 is to match the subsequent request packets of connections whose first request packet has been committed in
  `SNATCtZone`, and invoke the `ct` action on the packets again to restore the "tracked" state in `SNATCtZone`.
  - Match condition `ct_state=-new-rpl+trk` is to match request "tracked" packets which are not the first packet of a
    connection.
  - Match condition `ct_mark=0x20/0x20` is to match `ConnSNATCTMark`, indicating that the connection requires SNAT.
  - Action `ct` is applied to matched packets to restore the "tracked" state in `SNATCtZone`.
- Flow 4 is to match the first packet of (non-hairpin) connections destined to an external Service IP initiated through
  the Antrea gateway whose Endpoint is a remote Pod, and perform SNAT in `SNATCtZone` with the Antrea gateway IP.
  - Match condition `ct_state=+new+trk` is the same as flow 1, and `ct_mark=0x20/0x20` is the same as flow 3.
  - Match condition `reg0=0x2/0xf` is the same as flow 1.
  - Action `ct` is the same as flow 2 except that `HairpinCTMark` is not loaded, since this is not a hair-pin connection.
- Flow 5 is the auto-generated flow.
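To observe the SNAT performed in `SNATCtZone` (e.g. hair-pin connections SNAT'd with the virtual Service IP
`169.254.0.253`), the zone can be dumped separately from `CtZone`. A sketch:

```text
# Dump the conntrack entries committed in SNATCtZone (zone 65521).
ovs-appctl dpctl/dump-conntrack zone=65521
```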
+
+### L2ForwardingCalc
+
+This is essentially the "dmac" table of the switch. We program one flow for each port (tunnel port, local Antrea gateway
+port, and local Pod ports).

-If you dump the flows for this table, you should see flows like the following:
+If you dump the flows for this table, you may see the following:

```text
-1. table=72, priority=210,ip,reg0=0x1/0xf, actions=goto_table:80
-2. table=72, priority=200,ip, actions=dec_ttl,goto_table:80
-3. table=72, priority=0, actions=goto_table:80
+1. cookie=0x2010000000000, table=L2ForwardingCalc, priority=200,dl_dst=ba:5e:d1:55:aa:c0 actions=set_field:0x2->reg1,set_field:0x200000/0x600000->reg0,goto_table:IngressSecurityClassifier
+2. cookie=0x2010000000000, table=L2ForwardingCalc, priority=200,dl_dst=aa:bb:cc:dd:ee:ff actions=set_field:0x1->reg1,set_field:0x200000/0x600000->reg0,goto_table:IngressSecurityClassifier
+3. cookie=0x2010000000000, table=L2ForwardingCalc, priority=200,dl_dst=2e:ba:06:b2:44:91 actions=set_field:0x8->reg1,set_field:0x200000/0x600000->reg0,goto_table:IngressSecurityClassifier
+4. cookie=0x2010000000000, table=L2ForwardingCalc, priority=200,dl_dst=c2:5a:5e:50:95:9b actions=set_field:0x9->reg1,set_field:0x200000/0x600000->reg0,goto_table:IngressSecurityClassifier
+5. cookie=0x2000000000000, table=L2ForwardingCalc, priority=0 actions=goto_table:IngressSecurityClassifier
```

-The first flow is to bypass the TTL decrement for the packets from the gateway
-port.
-
-### L2ForwardingCalcTable (80)
+- Flow 1 is to match packets destined to the local Antrea gateway.
+  - Match condition `dl_dst=ba:5e:d1:55:aa:c0` is to match packets destined to the local Antrea gateway MAC address.
+  - Action `set_field:0x2->reg1` is to load the output OVS port number into `TargetOFPortField`.
+  - Action `set_field:0x200000/0x600000->reg0` is to load `OutputToOFPortRegMark`, indicating packets should be output
+    to an OVS port.
+  - Action `goto_table:IngressSecurityClassifier` is to forward packets to table [IngressSecurityClassifier].
+- Flow 2 is to match packets destined to the tunnel.
+  - Match condition `dl_dst=aa:bb:cc:dd:ee:ff` is to match packets destined to the Global Virtual MAC address, which is
+    used for tunnel traffic.
+  - Actions are the same as flow 1.
+- Flows 3-4 are to match packets destined to local Pods.
+  - Match conditions `dl_dst=2e:ba:06:b2:44:91` and `dl_dst=c2:5a:5e:50:95:9b` are to match packets destined to the MAC
+    addresses of local Pods.
+  - Actions are the same as flow 1.
+- Flow 5 is the auto-generated flow.
+
+In flows 1-4 above, we load `OutputToOFPortRegMark` to indicate that there was a matching entry for the destination MAC
+address and that the packet must be forwarded. We also use the `TargetOFPortField` to store the egress port for the
+packet, which will be used as a parameter to the `output` OpenFlow action in table [Output].
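+
+Conceptually, this table is nothing more than a static MAC-to-port lookup. A minimal Go sketch of the same logic
+(illustrative only; the MAC-to-port assignments are taken from the flows above):
+
+```go
+package main
+
+import "fmt"
+
+func main() {
+	// Entries mirror flows 1-4 above: gateway, tunnel (Global Virtual MAC),
+	// and two local Pods. The map value is the OpenFlow output port.
+	dmac := map[string]int{
+		"ba:5e:d1:55:aa:c0": 2, // local Antrea gateway
+		"aa:bb:cc:dd:ee:ff": 1, // tunnel
+		"2e:ba:06:b2:44:91": 8, // local Pod
+		"c2:5a:5e:50:95:9b": 9, // local Pod
+	}
+	if port, ok := dmac["2e:ba:06:b2:44:91"]; ok {
+		// ok plays the role of OutputToOFPortRegMark; port is what would be
+		// stored in TargetOFPortField and consumed by table Output.
+		fmt.Println("output port:", port)
+	}
+}
+```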
+
+### IngressSecurityClassifier
+
+This table is to classify packets before entering the tables for ingress security.

-This is essentially the "dmac" table of the switch. We program one flow for each
-port (tunnel port, gateway port, and local Pod ports), as you can see if you
-dump the flows:
+If you dump the flows for this table, you should see the following:

```text
-1. table=80, priority=200,dl_dst=aa:bb:cc:dd:ee:ff actions=set_field:0x1->reg1,set_field:0x10000/0x10000->reg0,goto_table:105
-2. table=80, priority=200,dl_dst=e2:e5:a4:9b:1c:b1 actions=set_field:0x2->reg1,set_field:0x10000/0x10000->reg0,goto_table:105
-3. table=80, priority=200,dl_dst=12:9e:a6:47:d0:70 actions=set_field:0x3->reg1,set_field:0x10000/0x10000->reg0,goto_table:90
-4. table=80, priority=200,dl_dst=ba:a8:13:ca:ed:cf actions=set_field:0x4->reg1,set_field:0x10000/0x10000->reg0,goto_table:90
-5. table=80, priority=0 actions=goto_table:105
+1. table=IngressSecurityClassifier, priority=210,pkt_mark=0x80000000/0x80000000,ct_state=-rpl+trk,ip actions=goto_table:ConntrackCommit
+2. table=IngressSecurityClassifier, priority=200,reg0=0x20/0xf0 actions=goto_table:IngressMetric
+3. table=IngressSecurityClassifier, priority=200,reg0=0x10/0xf0 actions=goto_table:IngressMetric
+4. table=IngressSecurityClassifier, priority=200,reg0=0x40/0xf0 actions=goto_table:IngressMetric
+5. table=IngressSecurityClassifier, priority=200,ct_mark=0x40/0x40 actions=goto_table:ConntrackCommit
+6. table=IngressSecurityClassifier, priority=0 actions=goto_table:AntreaPolicyIngressRule
```

-For each port flow (1 through 5 in the example above), we set bit 16 of the
-NXM_NX_REG0 register to indicate that there was a matching entry for the
-destination MAC address and that the packet must be forwarded. In the last table
-of the pipeline ([L2ForwardingOutTable]), we will drop all packets for which
-this bit is not set. We also use the NXM_NX_REG1 register to store the egress
-port for the packet, which will be used as a parameter to the `output` OpenFlow
-action in [L2ForwardingOutTable].
-
-The packets that match local Pods' MAC entries will go to the first table
-([AntreaPolicyIngressRuleTable] when AntreaPolicy is enabled, or
-[IngressRuleTable] when AntreaPolicy is not enabled) for NetworkPolicy ingress
-rules. Other packets will go to [ConntrackCommitTable]. Specifically, packets
-to the gateway port or the tunnel port will also go to [ConntrackCommitTable]
-and bypass the NetworkPolicy ingress rule tables, as NetworkPolicy ingress rules
-are not enforced for these packets on the source Node.
-
-What about L2 multicast / broadcast traffic? ARP requests will never reach this
-table, as they will be handled by the OpenFlow `normal` action in the
-[ArpResponderTable]. As for the rest, if it is IP traffic, it will hit the
-"last" flow in this table and go to [ConntrackCommitTable]; and finally the last
-table of the pipeline ([L2ForwardingOutTable]), and get dropped there since bit
-16 of the NXM_NX_REG0 will not be set. Traffic which is non-ARP and non-IP
-(assuming any can be received by the switch) is actually dropped much earlier in
-the pipeline ([SpoofGuardTable]). In the future, we may need to support more
-cases for L2 multicast / broadcast traffic.
-
-### AntreaPolicyIngressRuleTable (85)
+- Flow 1 is to match locally generated request packets and forward them to table [ConntrackCommit] directly to bypass
+  all tables for ingress security.
+  - Match condition `pkt_mark=0x80000000/0x80000000` is to match packets with iptables fwmark 0x80000000, which is set
+    by iptables rules in the host network namespace to mark locally generated packets.
+  - Match condition `ct_state=-rpl+trk` is to match request packets.
+- Flows 2-4 are to match packets destined to the local Antrea gateway, tunnel, or uplink port by matching
+  `ToGatewayRegMark`, `ToTunnelRegMark`, or `ToUplinkRegMark` respectively, and forward them to table [IngressMetric]
+  directly to bypass the tables for ingress security rules.
+  - Match condition `reg0=0x20/0xf0` is to match `ToGatewayRegMark`, indicating packets are destined to the local
+    Antrea gateway port.
+  - Match condition `reg0=0x10/0xf0` is to match `ToTunnelRegMark`, indicating packets are destined to the tunnel port.
+  - Match condition `reg0=0x40/0xf0` is to match `ToUplinkRegMark`, indicating packets are destined to the uplink port.
+- Flow 5 is to match packets from hair-pin connections and forward them to table [ConntrackCommit] directly to bypass
+  all tables for ingress security.
+  - Match condition `ct_mark=0x40/0x40` is to match `HairpinCTMark`, indicating packets are from hair-pin connections.
+
+### AntreaPolicyIngressRule

This table is very similar to [AntreaPolicyEgressRuleTable], but implements
the ingress rules of Antrea-native Policies. Depending on the tier to which the policy
@@ -943,12 +1133,12 @@ Since the example ACNP resides in the Application tier, if you dump the flows
for table 85, you should see something like this:

```text
-1. table=85, priority=64990,ct_state=-new+est,ip actions=resubmit(,105)
-2. table=85, priority=14000,conj_id=4,ip actions=load:0x4->NXM_NX_REG3[],load:0x1->NXM_NX_REG0[20],resubmit(,101)
-3. table=85, priority=14000,ip,nw_src=10.10.1.7 actions=conjunction(4,1/3)
-4. table=85, priority=14000,ip,reg1=0x19c actions=conjunction(4,2/3)
-5. table=85, priority=14000,tcp,tp_dst=80 actions=conjunction(4,3/3)
-6. table=85, priority=0 actions=resubmit(,90)
+ table=AntreaPolicyIngressRule, priority=64991,conj_id=1 actions=controller(reason=no_match,max_len=128,id=10876,userdata=02,pause),resubmit(,IngressMetric)
+ table=AntreaPolicyIngressRule, priority=64991,udp,tp_src=53 actions=conjunction(1,1/2)
+ table=AntreaPolicyIngressRule, priority=64991,tcp,tp_src=53,tcp_flags=+psh+ack actions=conjunction(1,1/2)
+ table=AntreaPolicyIngressRule, priority=64990,ct_state=-new+est,ip actions=resubmit(,IngressMetric)
+ table=AntreaPolicyIngressRule, priority=64990,ct_state=-new+rel,ip actions=resubmit(,IngressMetric)
+ table=AntreaPolicyIngressRule, priority=0 actions=resubmit(,IngressRule)
```
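+
+The `conjunction`/`conj_id` pairing above implements a cartesian product of match clauses: a packet triggers
+`conj_id=1` only if it matches at least one flow in every dimension. A rough Go illustration (not Antrea code; only
+dimension 1/2 is visible in this dump, so the second dimension's clause below is hypothetical, and the `tcp_flags`
+condition is omitted for brevity):
+
+```go
+package main
+
+import "fmt"
+
+type packet struct {
+	proto  string
+	tpSrc  int
+	ofPort int
+}
+
+// A clause is one flow contributing to one dimension of a conjunctive match.
+type clause func(p packet) bool
+
+// conjunctionMatches returns true only if the packet matches at least one
+// clause in every dimension, mirroring OVS conjunctive match semantics.
+func conjunctionMatches(dims [][]clause, p packet) bool {
+	for _, clauses := range dims {
+		matched := false
+		for _, c := range clauses {
+			if c(p) {
+				matched = true
+				break
+			}
+		}
+		if !matched {
+			return false
+		}
+	}
+	return true
+}
+
+func main() {
+	dims := [][]clause{
+		{ // dimension 1/2: the two conjunction(1,1/2) flows above
+			func(p packet) bool { return p.proto == "udp" && p.tpSrc == 53 },
+			func(p packet) bool { return p.proto == "tcp" && p.tpSrc == 53 },
+		},
+		{ // dimension 2/2: hypothetical applied-to OF port clause
+			func(p packet) bool { return p.ofPort == 9 },
+		},
+	}
+	fmt.Println(conjunctionMatches(dims, packet{"udp", 53, 9})) // true
+	fmt.Println(conjunctionMatches(dims, packet{"tcp", 80, 9})) // false
+}
+```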

As for [AntreaPolicyEgressRuleTable], flow 1 (highest priority) ensures that for
@@ -974,7 +1164,7 @@ in the following [IngressRuleTable] section.

As seen in [AntreaPolicyEgressRuleTable], the default action is to evaluate K8s
Network Policy [IngressRuleTable] and a AntreaPolicyIngressDefaultTable does
not exist.

-### IngressRuleTable (90)
+### IngressRule

This table is very similar to [EgressRuleTable], but implements ingress rules
for Network Policies. Once again, you will need to keep mind the Network Policy
@@ -985,15 +1175,7 @@ are allowed to talk to each other using TCP on port 80, but nothing else.

If you dump the flows for this table, you should see something like this:

```text
-1. table=90, priority=210,ct_state=-new+est,ip actions=goto_table:101
-2. table=90, priority=210,pkt_mark=0x1/0x1 actions=goto_table:105
-3. table=90, priority=200,ip,nw_src=10.10.1.2 actions=conjunction(3,1/3)
-4. table=90, priority=200,ip,nw_src=10.10.1.3 actions=conjunction(3,1/3)
-5. table=90, priority=200,ip,reg1=0x3 actions=conjunction(3,2/3)
-6. table=90, priority=200,ip,reg1=0x4 actions=conjunction(3,2/3)
-7. table=90, priority=200,tcp,tp_dst=80 actions=conjunction(3,3/3)
-8. table=90, priority=190,conj_id=3,ip actions=load:0x3->NXM_NX_REG6[],ct(commit,table=101,zone=65520,exec(load:0x3->NXM_NX_CT_LABEL[0..31]))
-9. table=90, priority=0 actions=goto_table:100
+ table=IngressRule, priority=0 actions=resubmit(,IngressDefaultRule)
```

As for [EgressRuleTable], flow 1 (highest priority) ensures that for established
@@ -1028,7 +1210,7 @@ the correct destination MAC address but a different destination IP address to
by an egress Network Policy rule. This is also why the Network Policy ingress
rules are enforced after the egress port has been determined.

-### IngressDefaultTable (100)
+### IngressDefaultRule

This table is similar in its purpose to [EgressDefaultTable], and it complements
[IngressRuleTable] for Network Policy ingress rule implementation. In K8s, when
@@ -1057,141 +1239,97 @@ For example, a baseline rule to isolate ingress traffic for a Namespace will look
like the following:

```text
-table=100, priority=80,ip,reg1=0xb actions=conjunction(6,2/3)
-table=100, priority=80,ip,reg1=0xc actions=conjunction(6,2/3)
-table=100, priority=80,ip,nw_src=10.10.1.9 actions=conjunction(6,1/3)
-table=100, priority=80,ip,nw_src=10.10.1.7 actions=conjunction(6,1/3)
-table=100, priority=80,tcp,tp_dst=8080 actions=conjunction(6,3/3)
-table=100, priority=80,conj_id=6,ip actions=load:0x6->NXM_NX_REG3[],load:0x1->NXM_NX_REG0[20],resubmit(,101)
+table=IngressDefaultRule, priority=0 actions=resubmit(,IngressMetric)
```

The table-miss flow entry, which is used for non-isolated Pods, forwards
traffic to the next table ([ConntrackCommitTable]).

-### ConntrackCommitTable (105)
-
-As mentioned before, this table is in charge of committing all new connections
-which are not dropped because of Network Policies. If you dump the flows for this
-table, you should see something like this:
+### IngressMetric
+
+This table is to collect metrics for NetworkPolicy ingress rules.

```text
-1. table=105, priority=200,ct_state=+new+trk,ip,reg0=0x1/0xf actions=ct(commit,table=108,zone=65520,exec(load:0x20->NXM_NX_CT_MARK[]))
-2. table=105, priority=190,ct_state=+new+trk,ip actions=ct(commit,table=108,zone=65520)
-3. table=105, priority=0 actions=goto_table:108
+ table=IngressMetric, priority=0 actions=resubmit(,ConntrackCommit)
```

-Flow 1 ensures that we commit connections initiated through the gateway
-interface and mark them with a `ct_mark` of `0x20`. This ensures that
-[ConntrackStateTable] can perform its functions correctly and rewrite the
-destination MAC address to the gateway's MAC address for connections which
-require it. Such connections include Pod-to-ClusterIP traffic. Note that the
-`0x20` mark is applied to *all* connections initiated through the gateway
-(i.e. for which the first packet of the connection was received through the
-gateway) and that [ConntrackStateTable] will perform the destination MAC address
-for the reply traffic of *all* such connections. In some cases (the ones
-described for [ConntrackStateTable]), this rewrite is necessary. For others
-(e.g. a connection from the host to a local Pod), this rewrite is not necessary
-but is also harmless, as the destination MAC is already correct.
-
-Flow 2 commits all other new connections.
-
-All traffic then goes to [HairpinSNATTable].
+### ConntrackCommit

-### HairpinSNATTable (108)
+This table is in charge of committing all new non-Service connections.

-The table is used to handle Service hairpin case, which indicates that the
-packet should be output to the port on which it was received.
-
-If you dump the flows for this table, you should see the flows:
-
-```text
-1. table=108, priority=200,ip,nw_src=10.10.0.4,nw_dst=10.10.0.4 actions=mod_nw_src:169.254.169.252,load:0x1->NXM_NX_REG0[18],resubmit(,110)
-2. table=108, priority=200,ip,nw_src=10.10.0.2,nw_dst=10.10.0.2 actions=mod_nw_src:169.254.169.252,load:0x1->NXM_NX_REG0[18],resubmit(,110)
-3. table=108, priority=200,ip,nw_src=10.10.0.3,nw_dst=10.10.0.3 actions=mod_nw_src:169.254.169.252,load:0x1->NXM_NX_REG0[18],resubmit(,110)
-4. table=108, priority=0 actions=resubmit(,110)
-```
-
-Flow 1-3 are used to match Service packets from Pods. The source IP of the matched
-packets by flow 1-3 should be SNAT'd with a virtual hairpin IP since the source and
-destination IP addresses should not be the same. Without SNAT, response packets from
-a Pod will not be forwarded back to OVS pipeline as the destination IP is the Pod's
-own IP, then the connection is interrupted because the conntrack state is only stored
-in OVS ct zone, not in the Pod. With SNAT, the destination IP will be the virtual
-hairpin IP and forwarded back to OVS pipeline. Note that, bit 18 in NXM_NX_REG0 is
-set to 0x1, and it is consumed in [L2ForwardingOutTable] to output the packet
-to the port on which it was received with action `IN_PORT`.
-
-### L2ForwardingOutTable (110)
-
-It is a simple table and if you dump the flows for this table, you should only
-see 2 flows:
+If you dump the flows for this table, you should see the following:

```text
-1. table=110, priority=200,ip,reg0=0x10000/0x10000 actions=output:NXM_NX_REG1[]
-2. table=110, priority=0, actions=drop
+1. table=ConntrackCommit, priority=200,ct_state=+new+trk-snat,ct_mark=0/0x10,ip actions=ct(commit,table=Output,zone=65520,exec(move:NXM_NX_REG0[0..3]->NXM_NX_CT_MARK[0..3]))
+2. table=ConntrackCommit, priority=0 actions=goto_table:Output
```

-The first flow outputs all unicast packets to the correct port (the port was
-resolved by the "dmac" table, [L2ForwardingCalcTable]). IP packets for which
-[L2ForwardingCalcTable] did not set bit 16 of NXM_NX_REG0 will be dropped.
-
-## Tables (AntreaProxy is disabled)
+- Flow 1 is to match new non-Service connections and commit them to the connection tracking module.
+  - Match condition `ct_state=+new+trk-snat` is to match the first packet from connections tracked in `CtZone`.
+  - Match condition `ct_mark=0/0x10` is to match `NotServiceCTMark` in `CtZone`, indicating packets are from a
+    non-Service connection.
+  - Action `ct` is applied to matched packets with the commit parameter to persist a ct mark in `CtZone`.
+    - Field `commit` means to commit the connection to the connection tracking module.
+    - Field `table=Output` is the table where packets will be forked.
+    - Field `zone=65520` is to commit the connection to `CtZone`.
+    - Field `exec` is to persist some ct marks in `CtZone`.
+      - Action `move:NXM_NX_REG0[0..3]->NXM_NX_CT_MARK[0..3]` is to load the value of `PktSourceField` into
+        `ConnSourceCTMarkField`.
+- Flow 2 is the auto-generated flow, which is hit by connections that do not need to be committed here (e.g. Service
+  connections, which were already committed in table [EndpointDNAT]).
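+
+The `move` action above is a plain bit copy between two fields. A short Go sketch of what is persisted (illustrative
+only):
+
+```go
+package main
+
+import "fmt"
+
+// moveBits copies bits [0..3] of reg0 into bits [0..3] of ct_mark, mirroring
+// the OpenFlow action move:NXM_NX_REG0[0..3]->NXM_NX_CT_MARK[0..3].
+func moveBits(reg0, ctMark uint32) uint32 {
+	const mask = 0xf // bits 0..3
+	return (ctMark &^ mask) | (reg0 & mask)
+}
+
+func main() {
+	reg0 := uint32(0x22) // FromGatewayRegMark | ToGatewayRegMark
+	// After the commit, the connection "remembers" where its first packet came
+	// from (here 0x2, the gateway), which can be matched on reply packets.
+	fmt.Printf("ct_mark=%#x\n", moveBits(reg0, 0)) // ct_mark=0x2
+}
+```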

-![OVS pipeline](../assets/ovs-pipeline.svg)
+### Output

-### DNATTable (40)
+This is the table to output packets to an OVS port, send them to the controller, or drop them.

-This table is created only when AntreaProxy is disabled. Its only job is to
-send traffic destined to Services through the local gateway interface, without any
-modifications. kube-proxy will then take care of load-balancing the connections
-across the different backends for each Service.
-
-If you dump the flows for this table, you should see something like this:
+If you dump the flows for this table, you should see the following:

```text
-1. table=40, priority=200,ip,nw_dst=10.96.0.0/12 actions=set_field:0x2->reg1,load:0x1->NXM_NX_REG0[16],goto_table:105
-2. table=40, priority=0 actions=goto_table:45
+1. table=Output, priority=210,ct_mark=0x40/0x40 actions=IN_PORT
+2. table=Output, priority=200,reg0=0x200000/0x600000 actions=output:NXM_NX_REG1[]
+3. table=Output, priority=200,reg0=0x2400000/0xfe600000 actions=meter:256,controller(reason=no_match,id=62373,userdata=01.01)
+4. table=Output, priority=200,reg0=0x4400000/0xfe600000 actions=meter:256,controller(reason=no_match,id=62373,userdata=01.02)
+5. table=Output, priority=0 actions=drop
```

-In the example above, 10.96.0.0/12 is the Service CIDR (this is the default
-value used by `kubeadm init`). This flow is not actually required for
-forwarding, but to bypass [EgressRuleTable] and [EgressDefaultTable] for Service
-traffic on its way to kube-proxy through the gateway. If we omitted this flow,
-such traffic would be unconditionally dropped if a Network Policy is applied on
-the originating Pod. For such traffic, we instead enforce Network Policy egress
-rules when packets come back through the gateway and the destination IP has been
-rewritten by kube-proxy (DNAT to a backend for the Service). We cannot output
-the Service traffic to the gateway port directly as we haven't committed the
-connection yet; instead we store the port in NXM_NX_REG1 - similarly to how we
-process non-Service traffic in [L2ForwardingCalcTable] - and forward it to
-[ConntrackCommitTable]. By committing the connection we ensure that reply
-traffic (traffic from the Service backend which has already gone through
-kube-proxy for source IP rewrite) will not be dropped because of Network
-Policies.
-
-The table-miss flow entry (flow 2) for this table forwards all non-Service
-traffic to the next table, [AntreaPolicyEgressRuleTable].
-
-[ClassifierTable]: #classifiertable-0
-[SpoofGuardTable]: #spoofguardtable-10
-[ARPResponderTable]: #arprespondertable-20
-[ServiceHairpinTable]: #servicehairpintable-23
-[ConntrackTable]: #conntracktable-30
-[ConntrackStateTable]: #conntrackstatetable-31
-[DNATTable]: #dnattable-40
-[SessionAffinityTable]: #sessionaffinitytable-40
-[ServiceLBTable]: #servicelbtable-41
-[EndpointDNATTable]: #endpointdnattable-42
-[AntreaPolicyEgressRuleTable]: #antreapolicyegressruletable-45
-[EgressRuleTable]: #egressruletable-50
-[EgressDefaultTable]: #egressdefaulttable-60
-[L3ForwardingTable]: #l3forwardingtable-70
-[SNATTable]: #snattable-71
-[L3DecTTLTable]: #l3decttltable-72
-[L2ForwardingCalcTable]: #l2forwardingcalctable-80
-[AntreaPolicyIngressRuleTable]: #antreapolicyingressruletable-85
-[IngressRuleTable]: #ingressruletable-90
-[IngressDefaultTable]: #ingressdefaulttable-100
-[ConntrackCommitTable]: #conntrackcommittable-105
-[HairpinSNATTable]: #hairpinsnattable-108
-[L2ForwardingOutTable]: #l2forwardingouttable-110
+- Flow 1 is to output packets from hair-pin connections to the ingress port.
+  - Match condition `ct_mark=0x40/0x40` is to match `HairpinCTMark`, indicating packets are from hair-pin connections.
+  - Action `IN_PORT` is to output packets to the ingress port.
+- Flow 2 is to output packets to an OVS port.
+  - Match condition `reg0=0x200000/0x600000` is to match `OutputToOFPortRegMark`, indicating packets should be output
+    to an OVS port.
+  - Action `output:NXM_NX_REG1[]` is to output packets to the OVS port stored in `TargetOFPortField`.
+- Flows 3-4 are to output packets to the controller.
+  - Match condition `reg0=0x2400000/0xfe600000` is to match `OutputToControllerRegMark`, indicating packets should be
+    output to the controller.
+  - Action `meter:256` is to meter packets with meter ID 256.
+  - Action `controller(reason=no_match,id=62373,userdata=01.01)` is to send packets to the controller with reason
+    `no_match`, ID 62373, and userdata 01.01.
+- Flow 5 is to drop all remaining packets.
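+
+Putting the table together, the output decision can be summarized as follows (illustrative Go sketch; flow 1 has the
+highest priority, so the hair-pin check comes first):
+
+```go
+package main
+
+import "fmt"
+
+// outputAction mirrors the Output table: hair-pin traffic is sent back out the
+// ingress port, marked traffic goes to the port in TargetOFPortField or to the
+// controller, and everything else is dropped.
+func outputAction(hairpin, toPort, toController bool, targetOFPort int) string {
+	switch {
+	case hairpin:
+		return "IN_PORT" // flow 1
+	case toPort:
+		return fmt.Sprintf("output:%d", targetOFPort) // flow 2
+	case toController:
+		return "meter:256,controller" // flows 3-4
+	default:
+		return "drop" // flow 5
+	}
+}
+
+func main() {
+	fmt.Println(outputAction(false, true, false, 9)) // output:9
+	fmt.Println(outputAction(true, true, false, 9))  // IN_PORT
+}
+```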
+ + +[PipelineRootClassifier]: #pipelineRootClassifier +[ARPSpoofGuard]: #arpSpoofGuard +[ARPResponder]: #arpResponder +[Classifier]: #classifier +[SpoofGuard]: #spoofGuard +[UnSNAT]: #unSNAT +[Conntrack]: #conntrack +[ConntrackState]: #conntrackState +[PreRoutingClassifier]: #preRoutingClassifier +[SessionAffinity]: #sessionAffinity +[ServiceLB]: #serviceLB +[EndpointDNAT]: #endpointDNAT +[AntreaPolicyEgressRule]: #antreaPolicyEgressRule +[EgressRule]: #egressRule +[EgressDefaultRule]: #egressDefaultRule +[EgressMetric]: #egressMetric +[L3Forwarding]: #l3Forwarding +[EgressMark]: #egressMark +[L3DecTTL]: #l3DecTTL +[SNATMark]: #snatMark +[SNAT]: #snat +[L2ForwardingCalc]: #l2ForwardingCalc +[IngressSecurityClassifier]: #ingressSecurityClassifier +[AntreaPolicyIngressRule]: #antreaPolicyIngressRule +[IngressRule]: #ingressRule +[IngressDefaultRule]: #ingressDefaultRule +[IngressMetric]: #ingressMetric +[ConntrackCommit]: #conntrackCommit +[Output]: #output diff --git a/pkg/agent/openflow/fields.go b/pkg/agent/openflow/fields.go index 87d0521af2e..073ab31432f 100644 --- a/pkg/agent/openflow/fields.go +++ b/pkg/agent/openflow/fields.go @@ -109,12 +109,12 @@ var ( APConjIDField = binding.NewRegField(3, 0, 31) // reg4(NXM_NX_REG4) - // reg4[0..15]: Field to store the selected Service Endpoint port. + // reg4[0..15]: Field to store the selected Service Endpoint port number. EndpointPortField = binding.NewRegField(4, 0, 15) // reg4[16..18]: Field to store the state of a packet accessing a Service. Marks in this field include: - // - 0b001: packet need to do service selection. - // - 0b010: packet has done service selection. - // - 0b011: packet has done service selection and the selection result needs to be cached. + // - 0b001: packet needs to do Endpoint selection. + // - 0b010: packet has done Endpoint selection. + // - 0b011: packet has done Endpoint selection and the selection result needs to be cached. ServiceEPStateField = binding.NewRegField(4, 16, 18) EpToSelectRegMark = binding.NewRegMark(ServiceEPStateField, 0b001) EpSelectedRegMark = binding.NewRegMark(ServiceEPStateField, 0b010)