#!/bin/bash
# SPDX-License-Identifier: GPL-2.0

source lib.sh

# Conntrack needs to reassemble fragments in order to have complete
# packets for rule matching. Reassembly can lead to packet loss.

# Consider the following setup:
#            +--------+       +---------+       +--------+
#            |Router A|-------|Wanrouter|-------|Router B|
#            |        |.IPIP..|         |..IPIP.|        |
#            +--------+       +---------+       +--------+
#           /                  mtu 1400                   \
#          /                                               \
#+--------+                                                 +--------+
#|Client A|                                                 |Client B|
#|        |                                                 |        |
#+--------+                                                 +--------+

# Router A and Router B use IPIP tunnel interfaces to tunnel traffic
# between Client A and Client B over WAN. Wanrouter has MTU 1400 set
# on its interfaces.

# Probe for the required tools before anything is created.
# NOTE(review): checktool comes from lib.sh and presumably exits with a
# skip status when the probe fails - confirm against lib.sh. Because of
# that, the temporary file is only created further down, right before the
# EXIT trap is installed, so an early skip cannot leave it behind.
checktool "iptables --version" "run test without iptables"
checktool "socat -h" "run test without socat"

# Create the five namespaces used below: Router A, Router B, Wanrouter,
# Client A and Client B. Their names are referenced as $r_a, $r_b, $r_w,
# $c_a and $c_b by the rest of the script.
setup_ns r_a r_b r_w c_a c_b

# EXIT handler: tear down the namespaces and remove the receive buffer.
cleanup() {
	cleanup_all_ns
	rm -f "$rx"
}

# File that captures everything the UDP listener in Client B receives.
rx=$(mktemp)

trap cleanup EXIT

# Succeed when the listening-UDP-socket list of a namespace contains the
# given port.
#   $1 - network namespace name
#   $2 - UDP port number
listener_ready()
{
	local ns="$1"
	local port="$2"

	ss -N "$ns" -lnu -o "sport = :$port" | grep -q "$port"
}

# Send 1400-byte UDP datagrams from Client A to Client B and verify that
# exactly 1400 bytes were captured by the listener.
#   $1 - word inserted into the result message ("with" / "without")
# Globals read: c_a, c_b, rx, BUSYWAIT_TIMEOUT
# Prints an OK line on success; prints a FAIL line and exits the script
# with status 1 otherwise.
test_path() {
	local msg="$1"
	local bytes
	local i

	# Background listener in Client B; its output lands in $rx.
	ip netns exec "$c_b" socat -t 3 - udp4-listen:5000,reuseaddr > "$rx" < /dev/null &

	# Do not start sending before the listening socket is visible.
	busywait "$BUSYWAIT_TIMEOUT" listener_ready "$c_b" 5000

	# Three separate senders, each writing one 1400-byte block of 'a'.
	# NOTE(review): the check below expects only 1400 bytes in total,
	# i.e. the listener is presumably expected to keep data from a
	# single sender only - confirm against socat's udp4-listen semantics.
	for i in 1 2 3; do
		head -c1400 /dev/zero | tr "\000" "a" | \
			ip netns exec "$c_a" socat -t 1 -u STDIN UDP:192.168.20.2:5000
	done

	# Let the background listener finish so $rx is complete.
	wait

	bytes=$(wc -c < "$rx")

	if [ "$bytes" -eq 1400 ]; then
		echo "OK: PMTU $msg connection tracking"
	else
		echo "FAIL: PMTU $msg connection tracking: got $bytes, expected 1400"
		exit 1
	fi
}

# Detailed setup for Router A
# ---------------------------
# Interfaces:
# veth0: 10.2.2.1/24 (link to Wanrouter)
# veth1: 192.168.10.1/24 (link to Client A)
# ipip0: No IP address, local 10.2.2.1 remote 10.4.4.1
# Routes:
# 192.168.20.0/24 dev ipip0    (192.168.20.0/24 is subnet of Client B)
# 10.4.4.0/24 via 10.2.2.254   (Router B via Wanrouter)
# No iptables rules at all (until the second test run adds one).
# Router A: one veth pair towards Wanrouter, one towards Client A.
ip link add veth0 netns "$r_a" type veth \
	peer name veth0 netns "$r_w"
ip link add veth1 netns "$r_a" type veth \
	peer name veth0 netns "$c_a"

# IPIP tunnel endpoint on Router A; skip the test if it cannot be created.
l_addr="10.2.2.1"
r_addr="10.4.4.1"
ip netns exec "$r_a" ip link add ipip0 type ipip \
	local "$l_addr" remote "$r_addr" mode ipip || exit $ksft_skip

# Bring up every interface Router A uses.
for dev in lo veth0 veth1 ipip0
do
	ip -n "$r_a" link set "$dev" up
done

# Addresses: WAN side first, then the Client A side.
ip -n "$r_a" addr add 10.2.2.1/24 dev veth0
ip -n "$r_a" addr add 192.168.10.1/24 dev veth1

# Client B's subnet goes through the tunnel, Router B's subnet via Wanrouter.
ip -n "$r_a" route add 192.168.20.0/24 dev ipip0
ip -n "$r_a" route add 10.4.4.0/24 via 10.2.2.254

# Router A has to forward between its interfaces.
ip netns exec "$r_a" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null

# Detailed setup for Router B
# ---------------------------
# Interfaces:
# veth0: 10.4.4.1/24 (link to Wanrouter)
# veth1: 192.168.20.1/24 (link to Client B)
# ipip0: No IP address, local 10.4.4.1 remote 10.2.2.1
# Routes:
# 192.168.10.0/24 dev ipip0    (192.168.10.0/24 is subnet of Client A)
# 10.2.2.0/24 via 10.4.4.254   (Router A via Wanrouter)
# No iptables rules at all.
# Router B: one veth pair towards Wanrouter, one towards Client B.
ip link add veth0 netns "$r_b" type veth peer name veth1 netns "$r_w"
ip link add veth1 netns "$r_b" type veth peer name veth0 netns "$c_b"

l_addr="10.4.4.1"
r_addr="10.2.2.1"

# IPIP tunnel endpoint on Router B; skip the test if it cannot be created.
ip netns exec "$r_b" ip link add ipip0 type ipip local "${l_addr}" remote "${r_addr}" mode ipip || exit $ksft_skip

for dev in veth0 veth1 ipip0; do
	ip -net "$r_b" link set "$dev" up
done

ip -net "$r_b" addr add 10.4.4.1/24 dev veth0
ip -net "$r_b" addr add 192.168.20.1/24 dev veth1

ip -net "$r_b" route add 192.168.10.0/24 dev ipip0
ip -net "$r_b" route add 10.2.2.0/24 via 10.4.4.254
ip netns exec "$r_b" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null

# Client A
ip -net "$c_a" addr add 192.168.10.2/24 dev veth0
ip -net "$c_a" link set dev veth0 up
ip -net "$c_a" route add default via 192.168.10.1

# Client B
ip -net "$c_b" addr add 192.168.20.2/24 dev veth0
ip -net "$c_b" link set dev veth0 up
ip -net "$c_b" route add default via 192.168.20.1

# Wan
ip -net "$r_w" addr add 10.2.2.254/24 dev veth0
ip -net "$r_w" addr add 10.4.4.254/24 dev veth1

ip -net "$r_w" link set dev veth0 up mtu 1400
ip -net "$r_w" link set dev veth1 up mtu 1400

# The router ends of the WAN links get the same reduced MTU.
ip -net "$r_a" link set dev veth0 mtu 1400
ip -net "$r_b" link set dev veth0 mtu 1400

ip netns exec "$r_w" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null

# Path MTU discovery
# ------------------
# Running tracepath from Client A to Client B shows PMTU discovery is working
# as expected:
#
# clienta:~# tracepath 192.168.20.2
#  1?: [LOCALHOST]                      pmtu 1500
#  1:  192.168.10.1                                          0.867ms
#  1:  192.168.10.1                                          0.302ms
#  2:  192.168.10.1                                          0.312ms pmtu 1480
#  2:  no reply
#  3:  192.168.10.1                                          0.510ms pmtu 1380
#  3:  192.168.20.2                                          2.320ms reached
#      Resume: pmtu 1380 hops 3 back 3

# ip netns exec ${c_a} traceroute --mtu 192.168.20.2

# Router A has learned PMTU (1400) to Router B from Wanrouter.
# Client A has learned PMTU (1400 - IPIP overhead = 1380) to Client B
# from Router A.

# Send large UDP packet
# ---------------------
# Now we send a 1400 bytes UDP packet from Client A to Client B:

# clienta:~# head -c1400 /dev/zero | tr "\000" "a" | socat -u STDIN UDP:192.168.20.2:5000
test_path "without"

# The IPv4 stack on Client A already knows the PMTU to Client B, so the
# UDP packet is sent as two fragments (1380 + 20). Router A forwards the
# fragments between veth1 and ipip0. The fragments fit into the tunnel and
# reach their destination.

# When sending the large UDP packet again, Router A now reassembles the
# fragments before routing the packet over ipip0. The resulting IPIP
# packet is too big (1400) for the tunnel PMTU (1380) to Router B, it is
# dropped on Router A before sending.
#
# That is the failure mode this script guards against: the second
# test_path run below still requires all 1400 bytes to arrive.

# A conntrack match without a target is added to Router A's FORWARD chain
# before the second run. If the rule cannot be installed, the second run
# would not exercise connection tracking at all and its OK would be
# meaningless, so skip instead of reporting a result.
if ! ip netns exec "$r_a" iptables -A FORWARD -m conntrack --ctstate NEW; then
	echo "SKIP: Could not add iptables conntrack rule"
	exit $ksft_skip
fi
test_path "with"