1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3
4source lib.sh
5
6# Conntrack needs to reassemble fragments in order to have complete
7# packets for rule matching.  Reassembly can lead to packet loss.
8
9# Consider the following setup:
10#            +--------+       +---------+       +--------+
11#            |Router A|-------|Wanrouter|-------|Router B|
12#            |        |.IPIP..|         |..IPIP.|        |
13#            +--------+       +---------+       +--------+
14#           /                  mtu 1400                   \
15#          /                                               \
16#+--------+                                                 +--------+
17#|Client A|                                                 |Client B|
18#|        |                                                 |        |
19#+--------+                                                 +--------+
20
21# Router A and Router B use IPIP tunnel interfaces to tunnel traffic
22# between Client A and Client B over WAN. Wanrouter has MTU 1400 set
23# on its interfaces.
24
# Probe for the required tools before creating anything that needs cleanup:
# the EXIT trap is only installed further down, so a temp file created
# ahead of these checks would be left behind if one of them ends the run.
# NOTE(review): assumes checktool (from lib.sh) exits when the probe
# command fails -- confirm against lib.sh.
checktool "iptables --version" "run test without iptables"
checktool "socat -h" "run test without socat"

# Scratch file that collects whatever the UDP listener in test_path receives.
rx=$(mktemp) || exit 1

setup_ns r_a r_b r_w c_a c_b
31
# Exit handler: calls cleanup_all_ns (not defined in this file; presumably
# the lib.sh helper that removes the namespaces made by setup_ns -- confirm)
# and then deletes the receive scratch file.
cleanup()
{
	cleanup_all_ns
	rm -f "$rx"
}

# Run the handler whenever the shell exits.
trap 'cleanup' EXIT
38
# Report whether a UDP listener for a given port is visible in a namespace.
# Arguments: $1 - network namespace name, handed to "ss -N"
#            $2 - UDP port number
# Returns:   0 if the filtered ss output contains the port, non-zero
#            otherwise (the status is that of grep)
listener_ready()
{
	# Function-local, so that polling this helper does not overwrite
	# same-named variables elsewhere in the script.
	local ns="$1"
	local port="$2"

	# ss restricts the listing to listening UDP sockets with this source
	# port; grep -q turns "any line mentioning the port" into the result.
	ss -N "$ns" -lnu -o "sport = :$port" | grep -q "$port"
}
45
# Send a 1400-byte UDP payload from Client A to Client B and check that
# exactly 1400 bytes were written to the receive file.
# Globals:   c_a, c_b (namespace names), rx (receive file),
#            BUSYWAIT_TIMEOUT (passed to busywait)
# Arguments: $1 - word inserted into the result line ("with" / "without")
# Outputs:   one "OK: ..." or "FAIL: ..." line on stdout
# Returns:   0 on success; exits the script with status 1 on a mismatch
test_path() {
	local msg="$1"
	local listener_pid
	local bytes

	# Listener in Client B's namespace; its output is captured in $rx.
	ip netns exec "$c_b" socat -t 3 - udp4-listen:5000,reuseaddr > "$rx" < /dev/null &
	listener_pid=$!

	# Hold off sending until the listening socket is visible.
	busywait "$BUSYWAIT_TIMEOUT" listener_ready "$c_b" 5000

	# The payload is sent three times, yet the check below expects 1400
	# bytes in total.
	# NOTE(review): presumably the listener only takes data from the
	# first sender -- confirm.
	for _ in 1 2 3; do
		head -c1400 /dev/zero | tr "\000" "a" | \
			ip netns exec "$c_a" socat -t 1 -u STDIN UDP:192.168.20.2:5000
	done

	# Let the listener finish so $rx is complete before it is measured.
	wait "$listener_pid"

	bytes=$(wc -c < "$rx")

	if [ "$bytes" -eq 1400 ]; then
		echo "OK: PMTU $msg connection tracking"
	else
		echo "FAIL: PMTU $msg connection tracking: got $bytes, expected 1400"
		exit 1
	fi
}
69
70# Detailed setup for Router A
71# ---------------------------
72# Interfaces:
# veth0: 10.2.2.1/24
# veth1: 192.168.10.1/24
# ipip0: No IP address, local 10.2.2.1 remote 10.4.4.1
# Routes:
# 192.168.20.0/24 dev ipip0    (192.168.20.0/24 is subnet of Client B)
# 10.4.4.0/24 via 10.2.2.254   (Router B's WAN subnet via Wanrouter)
# No iptables rules at first; a conntrack rule is added before the second run.
80
# Router A: veth0 is the link to the Wanrouter, veth1 the link to Client A.
ip link add veth0 netns "$r_a" type veth peer name veth0 netns "$r_w"
ip link add veth1 netns "$r_a" type veth peer name veth0 netns "$c_a"

# Tunnel endpoints as seen from Router A.
l_addr="10.2.2.1"
r_addr="10.4.4.1"
# If the ipip device cannot be created, leave with the ksft_skip status
# (not defined in this file; presumably provided by lib.sh).
ip netns exec "$r_a" ip link add ipip0 type ipip local "$l_addr" remote "$r_addr" mode ipip || exit "$ksft_skip"

for dev in lo veth0 veth1 ipip0; do
	ip -net "$r_a" link set "$dev" up
done

# The WAN-side address is the same one used as the local tunnel endpoint.
ip -net "$r_a" addr add "$l_addr/24" dev veth0
ip -net "$r_a" addr add 192.168.10.1/24 dev veth1

# Client B's subnet goes through the tunnel, Router B's WAN subnet via
# the Wanrouter.
ip -net "$r_a" route add 192.168.20.0/24 dev ipip0
ip -net "$r_a" route add 10.4.4.0/24 via 10.2.2.254

ip netns exec "$r_a" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null
99
100# Detailed setup for Router B
101# ---------------------------
102# Interfaces:
# veth0: 10.4.4.1/24
# veth1: 192.168.20.1/24
# ipip0: No IP address, local 10.4.4.1 remote 10.2.2.1
# Routes:
# 192.168.10.0/24 dev ipip0    (192.168.10.0/24 is subnet of Client A)
# 10.2.2.0/24 via 10.4.4.254   (Router A's WAN subnet via Wanrouter)
# No iptables rules at all.
110
# Router B: veth0 is the link to the Wanrouter (its veth1), veth1 the link
# to Client B.
ip link add veth0 netns "$r_b" type veth peer name veth1 netns "$r_w"
ip link add veth1 netns "$r_b" type veth peer name veth0 netns "$c_b"

# Tunnel endpoints as seen from Router B.
l_addr="10.4.4.1"
r_addr="10.2.2.1"

# If the ipip device cannot be created, leave with the ksft_skip status
# (not defined in this file; presumably provided by lib.sh).
ip netns exec "$r_b" ip link add ipip0 type ipip local "$l_addr" remote "$r_addr" mode ipip || exit "$ksft_skip"

for dev in veth0 veth1 ipip0; do
	ip -net "$r_b" link set "$dev" up
done

# The WAN-side address is the same one used as the local tunnel endpoint.
ip -net "$r_b" addr add "$l_addr/24" dev veth0
ip -net "$r_b" addr add 192.168.20.1/24 dev veth1

# Client A's subnet goes through the tunnel, Router A's WAN subnet via
# the Wanrouter.
ip -net "$r_b" route add 192.168.10.0/24 dev ipip0
ip -net "$r_b" route add 10.2.2.0/24 via 10.4.4.254
ip netns exec "$r_b" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null
129
# Client A: one address on its only link, default route via 192.168.10.1
# (the address Router A uses on 192.168.10.0/24).
ip -net "$c_a" addr add 192.168.10.2/24 dev veth0
ip -net "$c_a" link set dev veth0 up
ip -net "$c_a" route add default via 192.168.10.1

# Client B: one address on its only link, default route via 192.168.20.1
# (the address Router B uses on 192.168.20.0/24).
ip -net "$c_b" addr add 192.168.20.2/24 dev veth0
ip -net "$c_b" link set dev veth0 up
ip -net "$c_b" route add default via 192.168.20.1

# Wanrouter: the .254 address on each of the two WAN subnets.
ip -net "$r_w" addr add 10.2.2.254/24 dev veth0
ip -net "$r_w" addr add 10.4.4.254/24 dev veth1

# Bring both Wanrouter links up with the reduced MTU.
for wan_dev in veth0 veth1; do
	ip -net "$r_w" link set dev "$wan_dev" up mtu 1400
done

# Apply the same MTU to veth0 in both router namespaces.
for router_ns in "$r_a" "$r_b"; do
	ip -net "$router_ns" link set dev veth0 mtu 1400
done

ip netns exec "$r_w" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null
151
152# Path MTU discovery
153# ------------------
154# Running tracepath from Client A to Client B shows PMTU discovery is working
155# as expected:
156#
157# clienta:~# tracepath 192.168.20.2
158# 1?: [LOCALHOST]                      pmtu 1500
159# 1:  192.168.10.1                                          0.867ms
160# 1:  192.168.10.1                                          0.302ms
161# 2:  192.168.10.1                                          0.312ms pmtu 1480
162# 2:  no reply
163# 3:  192.168.10.1                                          0.510ms pmtu 1380
164# 3:  192.168.20.2                                          2.320ms reached
165# Resume: pmtu 1380 hops 3 back 3
166
167# ip netns exec ${c_a} traceroute --mtu 192.168.20.2
168
169# Router A has learned PMTU (1400) to Router B from Wanrouter.
170# Client A has learned PMTU (1400 - IPIP overhead = 1380) to Client B
171# from Router A.
172
173#Send large UDP packet
174#---------------------
175#Now we send a 1400 bytes UDP packet from Client A to Client B:
176
177# clienta:~# head -c1400 /dev/zero | tr "\000" "a" | socat -u STDIN UDP:192.168.20.2:5000
test_path "without"

# The IPv4 stack on Client A already knows the PMTU to Client B, so the
# UDP packet is sent as two fragments (1380 + 20). Router A forwards the
# fragments between veth1 and ipip0. The fragments fit into the tunnel and
# reach their destination.

# Next a conntrack rule is added on Router A. When sending the large UDP
# packet again, Router A now reassembles the fragments before routing the
# packet over ipip0. The resulting IPIP packet is too big (1400) for the
# tunnel PMTU (1380) to Router B, it is dropped on Router A before sending.
# The second run below expects all 1400 bytes to arrive regardless.

# If the rule cannot be added, the second run would only repeat the first
# one, so stop with the skip status instead of reporting a result for a
# configuration that was never set up.
if ! ip netns exec "$r_a" iptables -A FORWARD -m conntrack --ctstate NEW; then
	echo "SKIP: Could not add conntrack rule on Router A"
	exit "$ksft_skip"
fi
test_path "with"
192