1// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2
3/*
4 * This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link
5 * between src and dst. The netns fwd has veth links to each src and dst. The
6 * client is in src and server in dst. The test installs a TC BPF program to each
7 * host facing veth in fwd which calls into i) bpf_redirect_neigh() to perform the
8 * neigh addr population and redirect or ii) bpf_redirect_peer() for namespace
9 * switch from ingress side; it also installs a checker prog on the egress side
10 * to drop unexpected traffic.
11 */
12
13#include <arpa/inet.h>
14#include <linux/if_tun.h>
15#include <linux/limits.h>
16#include <linux/sysctl.h>
17#include <linux/time_types.h>
18#include <linux/net_tstamp.h>
19#include <net/if.h>
20#include <stdbool.h>
21#include <stdio.h>
22#include <sys/stat.h>
23#include <unistd.h>
24
25#include "test_progs.h"
26#include "network_helpers.h"
27#include "netlink_helpers.h"
28#include "test_tc_neigh_fib.skel.h"
29#include "test_tc_neigh.skel.h"
30#include "test_tc_peer.skel.h"
31#include "test_tc_dtime.skel.h"
32
/* Fallback for toolchains whose headers do not define TCP_TX_DELAY. */
#ifndef TCP_TX_DELAY
#define TCP_TX_DELAY 37
#endif

/* Names of the three network namespaces used by the test. */
#define NS_SRC "ns_src"
#define NS_FWD "ns_fwd"
#define NS_DST "ns_dst"

/* IPv4 addresses and TCP port used for the IPv4 connectivity checks. */
#define IP4_SRC "172.16.1.100"
#define IP4_DST "172.16.2.100"
#define IP4_TUN_SRC "172.17.1.100"
#define IP4_TUN_FWD "172.17.1.200"
#define IP4_PORT 9004

/* IPv6 addresses and TCP port used for the IPv6 connectivity checks. */
#define IP6_SRC "0::1:dead:beef:cafe"
#define IP6_DST "0::2:dead:beef:cafe"
#define IP6_TUN_SRC "1::1:dead:beef:cafe"
#define IP6_TUN_FWD "1::2:dead:beef:cafe"
#define IP6_PORT 9006

/* IPv4 link-local addresses put on src_fwd (SLL) and dst_fwd (DLL) in the
 * fwd namespace, and the network the endpoints get a /16 route for.
 */
#define IP4_SLL "169.254.0.1"
#define IP4_DLL "169.254.0.2"
#define IP4_NET "169.254.0.0"

/* Fixed MAC addresses assigned to dst_fwd and dst in veth mode. */
#define MAC_DST_FWD "00:11:22:33:44:55"
#define MAC_DST "00:22:33:44:55:66"

/* Number of bytes get_ifaddr() reads from the sysfs "address" file. */
#define IFADDR_STR_LEN 18
/* Arguments appended to the ping command in test_ping(). */
#define PING_ARGS "-i 0.2 -c 3 -w 10 -q"

/* Timeout handed to connect_to_fd() and settimeo(). */
#define TIMEOUT_MILLIS 10000
#define NSEC_PER_SEC 1000000000ULL

/* Print MSG (printf-style) to stderr, prefixed with the source location and
 * the strerror() text for the current errno.
 */
#define log_err(MSG, ...) \
	fprintf(stderr, "(%s:%d: errno: %s) " MSG "\n", \
		__FILE__, __LINE__, strerror(errno), ##__VA_ARGS__)

/* NULL-terminated list of the namespaces created/deleted by the test. */
static const char * const namespaces[] = {NS_SRC, NS_FWD, NS_DST, NULL};
71
/*
 * Write the string @newval to the already existing file @path (opened with
 * "r+", so the file is never created).
 *
 * Returns 0 on success and -1 on any failure; failures are logged with
 * log_err().
 */
static int write_file(const char *path, const char *newval)
{
	FILE *f;

	f = fopen(path, "r+");
	if (!f) {
		log_err("opening %s failed", path);
		return -1;
	}
	if (fwrite(newval, strlen(newval), 1, f) != 1) {
		log_err("writing to %s failed", path);
		fclose(f);
		return -1;
	}
	/* stdio buffers the data, so the underlying write(2) may only happen
	 * when the stream is closed; a failure there must not be reported as
	 * success.
	 */
	if (fclose(f)) {
		log_err("writing to %s failed", path);
		return -1;
	}
	return 0;
}
87
88static int netns_setup_namespaces(const char *verb)
89{
90	const char * const *ns = namespaces;
91	char cmd[128];
92
93	while (*ns) {
94		snprintf(cmd, sizeof(cmd), "ip netns %s %s", verb, *ns);
95		if (!ASSERT_OK(system(cmd), cmd))
96			return -1;
97		ns++;
98	}
99	return 0;
100}
101
102static void netns_setup_namespaces_nofail(const char *verb)
103{
104	const char * const *ns = namespaces;
105	char cmd[128];
106
107	while (*ns) {
108		snprintf(cmd, sizeof(cmd), "ip netns %s %s > /dev/null 2>&1", verb, *ns);
109		system(cmd);
110		ns++;
111	}
112}
113
/* Kind of device pair netns_setup_links_and_routes() creates. */
enum dev_mode {
	MODE_VETH,	/* "ip link add ... type veth" pairs */
	MODE_NETKIT,	/* netkit pairs created through create_netkit() */
};
118
/*
 * Input/output of netns_setup_links_and_routes(): the caller selects
 * dev_mode, the setup fills in the interface indexes of the four devices
 * (as returned by if_nametoindex() before they are moved into their
 * namespaces).
 */
struct netns_setup_result {
	enum dev_mode dev_mode;
	int ifindex_src;	/* "src", moved to NS_SRC */
	int ifindex_src_fwd;	/* "src_fwd", moved to NS_FWD */
	int ifindex_dst;	/* "dst", moved to NS_DST */
	int ifindex_dst_fwd;	/* "dst_fwd", moved to NS_FWD */
};
126
127static int get_ifaddr(const char *name, char *ifaddr)
128{
129	char path[PATH_MAX];
130	FILE *f;
131	int ret;
132
133	snprintf(path, PATH_MAX, "/sys/class/net/%s/address", name);
134	f = fopen(path, "r");
135	if (!ASSERT_OK_PTR(f, path))
136		return -1;
137
138	ret = fread(ifaddr, 1, IFADDR_STR_LEN, f);
139	if (!ASSERT_EQ(ret, IFADDR_STR_LEN, "fread ifaddr")) {
140		fclose(f);
141		return -1;
142	}
143	fclose(f);
144	return 0;
145}
146
/*
 * Create a netkit device pair through rtnetlink.
 *
 * @mode: value stored in the IFLA_NETKIT_MODE attribute
 * @prim: name of the primary device
 * @peer: name of the peer device
 *
 * Returns 0 on success, otherwise the error reported by rtnl_open() or
 * rtnl_talk() (both recorded as test failures).
 */
static int create_netkit(int mode, char *prim, char *peer)
{
	struct rtattr *linkinfo, *data, *peer_info;
	struct rtnl_handle rth = { .fd = -1 };
	const char *type = "netkit";
	struct {
		struct nlmsghdr n;
		struct ifinfomsg i;
		char buf[1024];
	} req = {};
	int err;

	err = rtnl_open(&rth, 0);
	if (!ASSERT_OK(err, "open_rtnetlink"))
		return err;

	/* RTM_NEWLINK request that must create a new link (CREATE | EXCL). */
	memset(&req, 0, sizeof(req));
	req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
	req.n.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
	req.n.nlmsg_type = RTM_NEWLINK;
	req.i.ifi_family = AF_UNSPEC;

	/* Attribute layout:
	 *   IFLA_IFNAME = prim
	 *   IFLA_LINKINFO
	 *     IFLA_INFO_KIND = "netkit"
	 *     IFLA_INFO_DATA
	 *       IFLA_NETKIT_MODE = mode
	 *       IFLA_NETKIT_PEER_INFO
	 *         <sizeof(struct ifinfomsg) zero bytes>
	 *         IFLA_IFNAME = peer
	 */
	addattr_l(&req.n, sizeof(req), IFLA_IFNAME, prim, strlen(prim));
	linkinfo = addattr_nest(&req.n, sizeof(req), IFLA_LINKINFO);
	addattr_l(&req.n, sizeof(req), IFLA_INFO_KIND, type, strlen(type));
	data = addattr_nest(&req.n, sizeof(req), IFLA_INFO_DATA);
	addattr32(&req.n, sizeof(req), IFLA_NETKIT_MODE, mode);
	peer_info = addattr_nest(&req.n, sizeof(req), IFLA_NETKIT_PEER_INFO);
	/* Skip over sizeof(struct ifinfomsg) bytes (already zeroed by the
	 * memset above) before the peer's attributes.
	 * NOTE(review): presumably the kernel expects an ifinfomsg header at
	 * the start of the peer info nest -- confirm against the netkit
	 * netlink parsing.
	 */
	req.n.nlmsg_len += sizeof(struct ifinfomsg);
	addattr_l(&req.n, sizeof(req), IFLA_IFNAME, peer, strlen(peer));
	addattr_nest_end(&req.n, peer_info);
	addattr_nest_end(&req.n, data);
	addattr_nest_end(&req.n, linkinfo);

	err = rtnl_talk(&rth, &req.n, NULL);
	ASSERT_OK(err, "talk_rtnetlink");
	rtnl_close(&rth);
	return err;
}
186
/*
 * Create the src<->src_fwd and dst<->dst_fwd device pairs (veth or netkit,
 * selected by result->dev_mode), record their ifindexes in @result, move
 * each device into its namespace and configure addresses, routes and (veth
 * mode only) static neighbour entries in NS_SRC, NS_FWD and NS_DST.
 *
 * The devices are created in the namespace the caller is currently in; the
 * namespaces themselves must already exist.
 *
 * Returns 0 on success, -1 on the first failing step (each failure is
 * recorded as a test failure by SYS()/ASSERT_*()).
 */
static int netns_setup_links_and_routes(struct netns_setup_result *result)
{
	struct nstoken *nstoken = NULL;
	/* One extra, zero-initialized byte: get_ifaddr() does not terminate. */
	char src_fwd_addr[IFADDR_STR_LEN+1] = {};
	char src_addr[IFADDR_STR_LEN + 1] = {};
	int err;

	if (result->dev_mode == MODE_VETH) {
		SYS(fail, "ip link add src type veth peer name src_fwd");
		SYS(fail, "ip link add dst type veth peer name dst_fwd");

		SYS(fail, "ip link set dst_fwd address " MAC_DST_FWD);
		SYS(fail, "ip link set dst address " MAC_DST);
	} else if (result->dev_mode == MODE_NETKIT) {
		err = create_netkit(NETKIT_L3, "src", "src_fwd");
		if (!ASSERT_OK(err, "create_ifindex_src"))
			goto fail;
		err = create_netkit(NETKIT_L3, "dst", "dst_fwd");
		if (!ASSERT_OK(err, "create_ifindex_dst"))
			goto fail;
	}

	/* Addresses and ifindexes are looked up by name here, before the
	 * devices are moved out of the current namespace below.
	 */
	if (get_ifaddr("src_fwd", src_fwd_addr))
		goto fail;

	if (get_ifaddr("src", src_addr))
		goto fail;

	result->ifindex_src = if_nametoindex("src");
	if (!ASSERT_GT(result->ifindex_src, 0, "ifindex_src"))
		goto fail;

	result->ifindex_src_fwd = if_nametoindex("src_fwd");
	if (!ASSERT_GT(result->ifindex_src_fwd, 0, "ifindex_src_fwd"))
		goto fail;

	result->ifindex_dst = if_nametoindex("dst");
	if (!ASSERT_GT(result->ifindex_dst, 0, "ifindex_dst"))
		goto fail;

	result->ifindex_dst_fwd = if_nametoindex("dst_fwd");
	if (!ASSERT_GT(result->ifindex_dst_fwd, 0, "ifindex_dst_fwd"))
		goto fail;

	SYS(fail, "ip link set src netns " NS_SRC);
	SYS(fail, "ip link set src_fwd netns " NS_FWD);
	SYS(fail, "ip link set dst_fwd netns " NS_FWD);
	SYS(fail, "ip link set dst netns " NS_DST);

	/** setup in 'src' namespace */
	nstoken = open_netns(NS_SRC);
	if (!ASSERT_OK_PTR(nstoken, "setns src"))
		goto fail;

	SYS(fail, "ip addr add " IP4_SRC "/32 dev src");
	SYS(fail, "ip addr add " IP6_SRC "/128 dev src nodad");
	SYS(fail, "ip link set dev src up");

	SYS(fail, "ip route add " IP4_DST "/32 dev src scope global");
	SYS(fail, "ip route add " IP4_NET "/16 dev src scope global");
	SYS(fail, "ip route add " IP6_DST "/128 dev src scope global");

	/* Static neighbour entries are only installed for veth devices. */
	if (result->dev_mode == MODE_VETH) {
		SYS(fail, "ip neigh add " IP4_DST " dev src lladdr %s",
		    src_fwd_addr);
		SYS(fail, "ip neigh add " IP6_DST " dev src lladdr %s",
		    src_fwd_addr);
	}

	close_netns(nstoken);

	/** setup in 'fwd' namespace */
	nstoken = open_netns(NS_FWD);
	if (!ASSERT_OK_PTR(nstoken, "setns fwd"))
		goto fail;

	/* The fwd netns automatically gets a v6 LL address / routes, but also
	 * needs v4 one in order to start ARP probing. IP4_NET route is added
	 * to the endpoints so that the ARP processing will reply.
	 */
	SYS(fail, "ip addr add " IP4_SLL "/32 dev src_fwd");
	SYS(fail, "ip addr add " IP4_DLL "/32 dev dst_fwd");
	SYS(fail, "ip link set dev src_fwd up");
	SYS(fail, "ip link set dev dst_fwd up");

	SYS(fail, "ip route add " IP4_SRC "/32 dev src_fwd scope global");
	SYS(fail, "ip route add " IP6_SRC "/128 dev src_fwd scope global");
	SYS(fail, "ip route add " IP4_DST "/32 dev dst_fwd scope global");
	SYS(fail, "ip route add " IP6_DST "/128 dev dst_fwd scope global");

	if (result->dev_mode == MODE_VETH) {
		SYS(fail, "ip neigh add " IP4_SRC " dev src_fwd lladdr %s", src_addr);
		SYS(fail, "ip neigh add " IP6_SRC " dev src_fwd lladdr %s", src_addr);
		SYS(fail, "ip neigh add " IP4_DST " dev dst_fwd lladdr %s", MAC_DST);
		SYS(fail, "ip neigh add " IP6_DST " dev dst_fwd lladdr %s", MAC_DST);
	}

	close_netns(nstoken);

	/** setup in 'dst' namespace */
	nstoken = open_netns(NS_DST);
	if (!ASSERT_OK_PTR(nstoken, "setns dst"))
		goto fail;

	SYS(fail, "ip addr add " IP4_DST "/32 dev dst");
	SYS(fail, "ip addr add " IP6_DST "/128 dev dst nodad");
	SYS(fail, "ip link set dev dst up");
	/* wait_netstamp_needed_key() binds a server to ::1 in this netns. */
	SYS(fail, "ip link set dev lo up");

	SYS(fail, "ip route add " IP4_SRC "/32 dev dst scope global");
	SYS(fail, "ip route add " IP4_NET "/16 dev dst scope global");
	SYS(fail, "ip route add " IP6_SRC "/128 dev dst scope global");

	if (result->dev_mode == MODE_VETH) {
		SYS(fail, "ip neigh add " IP4_SRC " dev dst lladdr " MAC_DST_FWD);
		SYS(fail, "ip neigh add " IP6_SRC " dev dst lladdr " MAC_DST_FWD);
	}

	close_netns(nstoken);

	return 0;
fail:
	/* nstoken is NULL unless a namespace is currently entered. */
	if (nstoken)
		close_netns(nstoken);
	return -1;
}
313
314static int qdisc_clsact_create(struct bpf_tc_hook *qdisc_hook, int ifindex)
315{
316	char err_str[128], ifname[16];
317	int err;
318
319	qdisc_hook->ifindex = ifindex;
320	qdisc_hook->attach_point = BPF_TC_INGRESS | BPF_TC_EGRESS;
321	err = bpf_tc_hook_create(qdisc_hook);
322	snprintf(err_str, sizeof(err_str),
323		 "qdisc add dev %s clsact",
324		 if_indextoname(qdisc_hook->ifindex, ifname) ? : "<unknown_iface>");
325	err_str[sizeof(err_str) - 1] = 0;
326	ASSERT_OK(err, err_str);
327
328	return err;
329}
330
331static int xgress_filter_add(struct bpf_tc_hook *qdisc_hook,
332			     enum bpf_tc_attach_point xgress,
333			     const struct bpf_program *prog, int priority)
334{
335	LIBBPF_OPTS(bpf_tc_opts, tc_attach);
336	char err_str[128], ifname[16];
337	int err;
338
339	qdisc_hook->attach_point = xgress;
340	tc_attach.prog_fd = bpf_program__fd(prog);
341	tc_attach.priority = priority;
342	err = bpf_tc_attach(qdisc_hook, &tc_attach);
343	snprintf(err_str, sizeof(err_str),
344		 "filter add dev %s %s prio %d bpf da %s",
345		 if_indextoname(qdisc_hook->ifindex, ifname) ? : "<unknown_iface>",
346		 xgress == BPF_TC_INGRESS ? "ingress" : "egress",
347		 priority, bpf_program__name(prog));
348	err_str[sizeof(err_str) - 1] = 0;
349	ASSERT_OK(err, err_str);
350
351	return err;
352}
353
/*
 * Call qdisc_clsact_create() and jump to the enclosing function's "fail"
 * label on error. Requires an "int err" variable and a "fail" label in the
 * calling scope; the error code is left in err.
 */
#define QDISC_CLSACT_CREATE(qdisc_hook, ifindex) ({		\
	if ((err = qdisc_clsact_create(qdisc_hook, ifindex)))	\
		goto fail;					\
})

/*
 * Call xgress_filter_add() and jump to the enclosing function's "fail"
 * label on error. Requires an "int err" variable and a "fail" label in the
 * calling scope; the error code is left in err.
 */
#define XGRESS_FILTER_ADD(qdisc_hook, xgress, prog, priority) ({		\
	if ((err = xgress_filter_add(qdisc_hook, xgress, prog, priority)))	\
		goto fail;							\
})
363
364static int netns_load_bpf(const struct bpf_program *src_prog,
365			  const struct bpf_program *dst_prog,
366			  const struct bpf_program *chk_prog,
367			  const struct netns_setup_result *setup_result)
368{
369	LIBBPF_OPTS(bpf_tc_hook, qdisc_src_fwd);
370	LIBBPF_OPTS(bpf_tc_hook, qdisc_dst_fwd);
371	int err;
372
373	/* tc qdisc add dev src_fwd clsact */
374	QDISC_CLSACT_CREATE(&qdisc_src_fwd, setup_result->ifindex_src_fwd);
375	/* tc filter add dev src_fwd ingress bpf da src_prog */
376	XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_INGRESS, src_prog, 0);
377	/* tc filter add dev src_fwd egress bpf da chk_prog */
378	XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_EGRESS, chk_prog, 0);
379
380	/* tc qdisc add dev dst_fwd clsact */
381	QDISC_CLSACT_CREATE(&qdisc_dst_fwd, setup_result->ifindex_dst_fwd);
382	/* tc filter add dev dst_fwd ingress bpf da dst_prog */
383	XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_INGRESS, dst_prog, 0);
384	/* tc filter add dev dst_fwd egress bpf da chk_prog */
385	XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_EGRESS, chk_prog, 0);
386
387	return 0;
388fail:
389	return -1;
390}
391
392static void test_tcp(int family, const char *addr, __u16 port)
393{
394	int listen_fd = -1, accept_fd = -1, client_fd = -1;
395	char buf[] = "testing testing";
396	int n;
397	struct nstoken *nstoken;
398
399	nstoken = open_netns(NS_DST);
400	if (!ASSERT_OK_PTR(nstoken, "setns dst"))
401		return;
402
403	listen_fd = start_server(family, SOCK_STREAM, addr, port, 0);
404	if (!ASSERT_GE(listen_fd, 0, "listen"))
405		goto done;
406
407	close_netns(nstoken);
408	nstoken = open_netns(NS_SRC);
409	if (!ASSERT_OK_PTR(nstoken, "setns src"))
410		goto done;
411
412	client_fd = connect_to_fd(listen_fd, TIMEOUT_MILLIS);
413	if (!ASSERT_GE(client_fd, 0, "connect_to_fd"))
414		goto done;
415
416	accept_fd = accept(listen_fd, NULL, NULL);
417	if (!ASSERT_GE(accept_fd, 0, "accept"))
418		goto done;
419
420	if (!ASSERT_OK(settimeo(accept_fd, TIMEOUT_MILLIS), "settimeo"))
421		goto done;
422
423	n = write(client_fd, buf, sizeof(buf));
424	if (!ASSERT_EQ(n, sizeof(buf), "send to server"))
425		goto done;
426
427	n = read(accept_fd, buf, sizeof(buf));
428	ASSERT_EQ(n, sizeof(buf), "recv from server");
429
430done:
431	if (nstoken)
432		close_netns(nstoken);
433	if (listen_fd >= 0)
434		close(listen_fd);
435	if (accept_fd >= 0)
436		close(accept_fd);
437	if (client_fd >= 0)
438		close(client_fd);
439}
440
441static int test_ping(int family, const char *addr)
442{
443	SYS(fail, "ip netns exec " NS_SRC " %s " PING_ARGS " %s > /dev/null", ping_command(family), addr);
444	return 0;
445fail:
446	return -1;
447}
448
449static void test_connectivity(void)
450{
451	test_tcp(AF_INET, IP4_DST, IP4_PORT);
452	test_ping(AF_INET, IP4_DST);
453	test_tcp(AF_INET6, IP6_DST, IP6_PORT);
454	test_ping(AF_INET6, IP6_DST);
455}
456
/*
 * Enable or disable IPv4 and IPv6 forwarding in the current network
 * namespace by writing "1"/"0" to the corresponding /proc/sys/net files.
 *
 * Returns 0 on success, or the write_file() error of the first write that
 * failed (recorded as a test failure). The assertion labels name the value
 * that was actually being written.
 */
static int set_forwarding(bool enable)
{
	const char *val = enable ? "1" : "0";
	int err;

	err = write_file("/proc/sys/net/ipv4/ip_forward", val);
	if (!ASSERT_OK(err, enable ? "set ipv4.ip_forward=1" :
				     "set ipv4.ip_forward=0"))
		return err;

	err = write_file("/proc/sys/net/ipv6/conf/all/forwarding", val);
	if (!ASSERT_OK(err, enable ? "set ipv6.forwarding=1" :
				     "set ipv6.forwarding=0"))
		return err;

	return 0;
}
471
472static int __rcv_tstamp(int fd, const char *expected, size_t s, __u64 *tstamp)
473{
474	struct __kernel_timespec pkt_ts = {};
475	char ctl[CMSG_SPACE(sizeof(pkt_ts))];
476	struct timespec now_ts;
477	struct msghdr msg = {};
478	__u64 now_ns, pkt_ns;
479	struct cmsghdr *cmsg;
480	struct iovec iov;
481	char data[32];
482	int ret;
483
484	iov.iov_base = data;
485	iov.iov_len = sizeof(data);
486	msg.msg_iov = &iov;
487	msg.msg_iovlen = 1;
488	msg.msg_control = &ctl;
489	msg.msg_controllen = sizeof(ctl);
490
491	ret = recvmsg(fd, &msg, 0);
492	if (!ASSERT_EQ(ret, s, "recvmsg"))
493		return -1;
494	ASSERT_STRNEQ(data, expected, s, "expected rcv data");
495
496	cmsg = CMSG_FIRSTHDR(&msg);
497	if (cmsg && cmsg->cmsg_level == SOL_SOCKET &&
498	    cmsg->cmsg_type == SO_TIMESTAMPNS_NEW)
499		memcpy(&pkt_ts, CMSG_DATA(cmsg), sizeof(pkt_ts));
500
501	pkt_ns = pkt_ts.tv_sec * NSEC_PER_SEC + pkt_ts.tv_nsec;
502	if (tstamp) {
503		/* caller will check the tstamp itself */
504		*tstamp = pkt_ns;
505		return 0;
506	}
507
508	ASSERT_NEQ(pkt_ns, 0, "pkt rcv tstamp");
509
510	ret = clock_gettime(CLOCK_REALTIME, &now_ts);
511	ASSERT_OK(ret, "clock_gettime");
512	now_ns = now_ts.tv_sec * NSEC_PER_SEC + now_ts.tv_nsec;
513
514	if (ASSERT_GE(now_ns, pkt_ns, "check rcv tstamp"))
515		ASSERT_LT(now_ns - pkt_ns, 5 * NSEC_PER_SEC,
516			  "check rcv tstamp");
517	return 0;
518}
519
520static void rcv_tstamp(int fd, const char *expected, size_t s)
521{
522	__rcv_tstamp(fd, expected, s, NULL);
523}
524
525static int wait_netstamp_needed_key(void)
526{
527	int opt = 1, srv_fd = -1, cli_fd = -1, nretries = 0, err, n;
528	char buf[] = "testing testing";
529	struct nstoken *nstoken;
530	__u64 tstamp = 0;
531
532	nstoken = open_netns(NS_DST);
533	if (!nstoken)
534		return -1;
535
536	srv_fd = start_server(AF_INET6, SOCK_DGRAM, "::1", 0, 0);
537	if (!ASSERT_GE(srv_fd, 0, "start_server"))
538		goto done;
539
540	err = setsockopt(srv_fd, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
541			 &opt, sizeof(opt));
542	if (!ASSERT_OK(err, "setsockopt(SO_TIMESTAMPNS_NEW)"))
543		goto done;
544
545	cli_fd = connect_to_fd(srv_fd, TIMEOUT_MILLIS);
546	if (!ASSERT_GE(cli_fd, 0, "connect_to_fd"))
547		goto done;
548
549again:
550	n = write(cli_fd, buf, sizeof(buf));
551	if (!ASSERT_EQ(n, sizeof(buf), "send to server"))
552		goto done;
553	err = __rcv_tstamp(srv_fd, buf, sizeof(buf), &tstamp);
554	if (!ASSERT_OK(err, "__rcv_tstamp"))
555		goto done;
556	if (!tstamp && nretries++ < 5) {
557		sleep(1);
558		printf("netstamp_needed_key retry#%d\n", nretries);
559		goto again;
560	}
561
562done:
563	if (!tstamp && srv_fd != -1) {
564		close(srv_fd);
565		srv_fd = -1;
566	}
567	if (cli_fd != -1)
568		close(cli_fd);
569	close_netns(nstoken);
570	return srv_fd;
571}
572
573static void snd_tstamp(int fd, char *b, size_t s)
574{
575	struct sock_txtime opt = { .clockid = CLOCK_TAI };
576	char ctl[CMSG_SPACE(sizeof(__u64))];
577	struct timespec now_ts;
578	struct msghdr msg = {};
579	struct cmsghdr *cmsg;
580	struct iovec iov;
581	__u64 now_ns;
582	int ret;
583
584	ret = clock_gettime(CLOCK_TAI, &now_ts);
585	ASSERT_OK(ret, "clock_get_time(CLOCK_TAI)");
586	now_ns = now_ts.tv_sec * NSEC_PER_SEC + now_ts.tv_nsec;
587
588	iov.iov_base = b;
589	iov.iov_len = s;
590	msg.msg_iov = &iov;
591	msg.msg_iovlen = 1;
592	msg.msg_control = &ctl;
593	msg.msg_controllen = sizeof(ctl);
594
595	cmsg = CMSG_FIRSTHDR(&msg);
596	cmsg->cmsg_level = SOL_SOCKET;
597	cmsg->cmsg_type = SCM_TXTIME;
598	cmsg->cmsg_len = CMSG_LEN(sizeof(now_ns));
599	*(__u64 *)CMSG_DATA(cmsg) = now_ns;
600
601	ret = setsockopt(fd, SOL_SOCKET, SO_TXTIME, &opt, sizeof(opt));
602	ASSERT_OK(ret, "setsockopt(SO_TXTIME)");
603
604	ret = sendmsg(fd, &msg, 0);
605	ASSERT_EQ(ret, s, "sendmsg");
606}
607
608static void test_inet_dtime(int family, int type, const char *addr, __u16 port)
609{
610	int opt = 1, accept_fd = -1, client_fd = -1, listen_fd, err;
611	char buf[] = "testing testing";
612	struct nstoken *nstoken;
613
614	nstoken = open_netns(NS_DST);
615	if (!ASSERT_OK_PTR(nstoken, "setns dst"))
616		return;
617	listen_fd = start_server(family, type, addr, port, 0);
618	close_netns(nstoken);
619
620	if (!ASSERT_GE(listen_fd, 0, "listen"))
621		return;
622
623	/* Ensure the kernel puts the (rcv) timestamp for all skb */
624	err = setsockopt(listen_fd, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
625			 &opt, sizeof(opt));
626	if (!ASSERT_OK(err, "setsockopt(SO_TIMESTAMPNS_NEW)"))
627		goto done;
628
629	if (type == SOCK_STREAM) {
630		/* Ensure the kernel set EDT when sending out rst/ack
631		 * from the kernel's ctl_sk.
632		 */
633		err = setsockopt(listen_fd, SOL_TCP, TCP_TX_DELAY, &opt,
634				 sizeof(opt));
635		if (!ASSERT_OK(err, "setsockopt(TCP_TX_DELAY)"))
636			goto done;
637	}
638
639	nstoken = open_netns(NS_SRC);
640	if (!ASSERT_OK_PTR(nstoken, "setns src"))
641		goto done;
642	client_fd = connect_to_fd(listen_fd, TIMEOUT_MILLIS);
643	close_netns(nstoken);
644
645	if (!ASSERT_GE(client_fd, 0, "connect_to_fd"))
646		goto done;
647
648	if (type == SOCK_STREAM) {
649		int n;
650
651		accept_fd = accept(listen_fd, NULL, NULL);
652		if (!ASSERT_GE(accept_fd, 0, "accept"))
653			goto done;
654
655		n = write(client_fd, buf, sizeof(buf));
656		if (!ASSERT_EQ(n, sizeof(buf), "send to server"))
657			goto done;
658		rcv_tstamp(accept_fd, buf, sizeof(buf));
659	} else {
660		snd_tstamp(client_fd, buf, sizeof(buf));
661		rcv_tstamp(listen_fd, buf, sizeof(buf));
662	}
663
664done:
665	close(listen_fd);
666	if (accept_fd != -1)
667		close(accept_fd);
668	if (client_fd != -1)
669		close(client_fd);
670}
671
672static int netns_load_dtime_bpf(struct test_tc_dtime *skel,
673				const struct netns_setup_result *setup_result)
674{
675	LIBBPF_OPTS(bpf_tc_hook, qdisc_src_fwd);
676	LIBBPF_OPTS(bpf_tc_hook, qdisc_dst_fwd);
677	LIBBPF_OPTS(bpf_tc_hook, qdisc_src);
678	LIBBPF_OPTS(bpf_tc_hook, qdisc_dst);
679	struct nstoken *nstoken;
680	int err;
681
682	/* setup ns_src tc progs */
683	nstoken = open_netns(NS_SRC);
684	if (!ASSERT_OK_PTR(nstoken, "setns " NS_SRC))
685		return -1;
686	/* tc qdisc add dev src clsact */
687	QDISC_CLSACT_CREATE(&qdisc_src, setup_result->ifindex_src);
688	/* tc filter add dev src ingress bpf da ingress_host */
689	XGRESS_FILTER_ADD(&qdisc_src, BPF_TC_INGRESS, skel->progs.ingress_host, 0);
690	/* tc filter add dev src egress bpf da egress_host */
691	XGRESS_FILTER_ADD(&qdisc_src, BPF_TC_EGRESS, skel->progs.egress_host, 0);
692	close_netns(nstoken);
693
694	/* setup ns_dst tc progs */
695	nstoken = open_netns(NS_DST);
696	if (!ASSERT_OK_PTR(nstoken, "setns " NS_DST))
697		return -1;
698	/* tc qdisc add dev dst clsact */
699	QDISC_CLSACT_CREATE(&qdisc_dst, setup_result->ifindex_dst);
700	/* tc filter add dev dst ingress bpf da ingress_host */
701	XGRESS_FILTER_ADD(&qdisc_dst, BPF_TC_INGRESS, skel->progs.ingress_host, 0);
702	/* tc filter add dev dst egress bpf da egress_host */
703	XGRESS_FILTER_ADD(&qdisc_dst, BPF_TC_EGRESS, skel->progs.egress_host, 0);
704	close_netns(nstoken);
705
706	/* setup ns_fwd tc progs */
707	nstoken = open_netns(NS_FWD);
708	if (!ASSERT_OK_PTR(nstoken, "setns " NS_FWD))
709		return -1;
710	/* tc qdisc add dev dst_fwd clsact */
711	QDISC_CLSACT_CREATE(&qdisc_dst_fwd, setup_result->ifindex_dst_fwd);
712	/* tc filter add dev dst_fwd ingress prio 100 bpf da ingress_fwdns_prio100 */
713	XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_INGRESS,
714			  skel->progs.ingress_fwdns_prio100, 100);
715	/* tc filter add dev dst_fwd ingress prio 101 bpf da ingress_fwdns_prio101 */
716	XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_INGRESS,
717			  skel->progs.ingress_fwdns_prio101, 101);
718	/* tc filter add dev dst_fwd egress prio 100 bpf da egress_fwdns_prio100 */
719	XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_EGRESS,
720			  skel->progs.egress_fwdns_prio100, 100);
721	/* tc filter add dev dst_fwd egress prio 101 bpf da egress_fwdns_prio101 */
722	XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_EGRESS,
723			  skel->progs.egress_fwdns_prio101, 101);
724
725	/* tc qdisc add dev src_fwd clsact */
726	QDISC_CLSACT_CREATE(&qdisc_src_fwd, setup_result->ifindex_src_fwd);
727	/* tc filter add dev src_fwd ingress prio 100 bpf da ingress_fwdns_prio100 */
728	XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_INGRESS,
729			  skel->progs.ingress_fwdns_prio100, 100);
730	/* tc filter add dev src_fwd ingress prio 101 bpf da ingress_fwdns_prio101 */
731	XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_INGRESS,
732			  skel->progs.ingress_fwdns_prio101, 101);
733	/* tc filter add dev src_fwd egress prio 100 bpf da egress_fwdns_prio100 */
734	XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_EGRESS,
735			  skel->progs.egress_fwdns_prio100, 100);
736	/* tc filter add dev src_fwd egress prio 101 bpf da egress_fwdns_prio101 */
737	XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_EGRESS,
738			  skel->progs.egress_fwdns_prio101, 101);
739	close_netns(nstoken);
740	return 0;
741
742fail:
743	close_netns(nstoken);
744	return err;
745}
746
/*
 * Second-level indexes into skel->bss->dtimes[test][] and
 * skel->bss->errs[test][]. The dtimes checks in the tests below only cover
 * the entries before SET_DTIME; the errs checks cover every entry up to
 * __MAX_CNT. Must stay in sync with cnt_names[].
 */
enum {
	INGRESS_FWDNS_P100,
	INGRESS_FWDNS_P101,
	EGRESS_FWDNS_P100,
	EGRESS_FWDNS_P101,
	INGRESS_ENDHOST,
	EGRESS_ENDHOST,
	SET_DTIME,
	__MAX_CNT,
};
757
/* Printable name for each counter index (one entry per value below
 * __MAX_CNT, in enum order); used to build assertion labels.
 */
const char *cnt_names[] = {
	"ingress_fwdns_p100",
	"ingress_fwdns_p101",
	"egress_fwdns_p100",
	"egress_fwdns_p101",
	"ingress_endhost",
	"egress_endhost",
	"set_dtime",
};
767
/*
 * Identifier of the dtime sub-test that is currently running. The value is
 * written to skel->bss->test, used as first-level index into
 * skel->bss->dtimes[] / errs[], and added to 50000 to pick the server port.
 * The *_RT_FWD variants are the runs made with kernel forwarding enabled.
 */
enum {
	TCP_IP6_CLEAR_DTIME,
	TCP_IP4,
	TCP_IP6,
	UDP_IP4,
	UDP_IP6,
	TCP_IP4_RT_FWD,
	TCP_IP6_RT_FWD,
	UDP_IP4_RT_FWD,
	UDP_IP6_RT_FWD,
	UKN_TEST,
	__NR_TESTS,
};
781
/* Printable name for each sub-test, in enum order. There is no entry for
 * UKN_TEST, so this array must not be indexed with it.
 */
const char *test_names[] = {
	"tcp ip6 clear dtime",
	"tcp ip4",
	"tcp ip6",
	"udp ip4",
	"udp ip6",
	"tcp ip4 rt fwd",
	"tcp ip6 rt fwd",
	"udp ip4 rt fwd",
	"udp ip6 rt fwd",
};
793
794static const char *dtime_cnt_str(int test, int cnt)
795{
796	static char name[64];
797
798	snprintf(name, sizeof(name), "%s %s", test_names[test], cnt_names[cnt]);
799
800	return name;
801}
802
803static const char *dtime_err_str(int test, int cnt)
804{
805	static char name[64];
806
807	snprintf(name, sizeof(name), "%s %s errs", test_names[test],
808		 cnt_names[cnt]);
809
810	return name;
811}
812
813static void test_tcp_clear_dtime(struct test_tc_dtime *skel)
814{
815	int i, t = TCP_IP6_CLEAR_DTIME;
816	__u32 *dtimes = skel->bss->dtimes[t];
817	__u32 *errs = skel->bss->errs[t];
818
819	skel->bss->test = t;
820	test_inet_dtime(AF_INET6, SOCK_STREAM, IP6_DST, 50000 + t);
821
822	ASSERT_EQ(dtimes[INGRESS_FWDNS_P100], 0,
823		  dtime_cnt_str(t, INGRESS_FWDNS_P100));
824	ASSERT_EQ(dtimes[INGRESS_FWDNS_P101], 0,
825		  dtime_cnt_str(t, INGRESS_FWDNS_P101));
826	ASSERT_GT(dtimes[EGRESS_FWDNS_P100], 0,
827		  dtime_cnt_str(t, EGRESS_FWDNS_P100));
828	ASSERT_EQ(dtimes[EGRESS_FWDNS_P101], 0,
829		  dtime_cnt_str(t, EGRESS_FWDNS_P101));
830	ASSERT_GT(dtimes[EGRESS_ENDHOST], 0,
831		  dtime_cnt_str(t, EGRESS_ENDHOST));
832	ASSERT_GT(dtimes[INGRESS_ENDHOST], 0,
833		  dtime_cnt_str(t, INGRESS_ENDHOST));
834
835	for (i = INGRESS_FWDNS_P100; i < __MAX_CNT; i++)
836		ASSERT_EQ(errs[i], 0, dtime_err_str(t, i));
837}
838
839static void test_tcp_dtime(struct test_tc_dtime *skel, int family, bool bpf_fwd)
840{
841	__u32 *dtimes, *errs;
842	const char *addr;
843	int i, t;
844
845	if (family == AF_INET) {
846		t = bpf_fwd ? TCP_IP4 : TCP_IP4_RT_FWD;
847		addr = IP4_DST;
848	} else {
849		t = bpf_fwd ? TCP_IP6 : TCP_IP6_RT_FWD;
850		addr = IP6_DST;
851	}
852
853	dtimes = skel->bss->dtimes[t];
854	errs = skel->bss->errs[t];
855
856	skel->bss->test = t;
857	test_inet_dtime(family, SOCK_STREAM, addr, 50000 + t);
858
859	/* fwdns_prio100 prog does not read delivery_time_type, so
860	 * kernel puts the (rcv) timetamp in __sk_buff->tstamp
861	 */
862	ASSERT_EQ(dtimes[INGRESS_FWDNS_P100], 0,
863		  dtime_cnt_str(t, INGRESS_FWDNS_P100));
864	for (i = INGRESS_FWDNS_P101; i < SET_DTIME; i++)
865		ASSERT_GT(dtimes[i], 0, dtime_cnt_str(t, i));
866
867	for (i = INGRESS_FWDNS_P100; i < __MAX_CNT; i++)
868		ASSERT_EQ(errs[i], 0, dtime_err_str(t, i));
869}
870
871static void test_udp_dtime(struct test_tc_dtime *skel, int family, bool bpf_fwd)
872{
873	__u32 *dtimes, *errs;
874	const char *addr;
875	int i, t;
876
877	if (family == AF_INET) {
878		t = bpf_fwd ? UDP_IP4 : UDP_IP4_RT_FWD;
879		addr = IP4_DST;
880	} else {
881		t = bpf_fwd ? UDP_IP6 : UDP_IP6_RT_FWD;
882		addr = IP6_DST;
883	}
884
885	dtimes = skel->bss->dtimes[t];
886	errs = skel->bss->errs[t];
887
888	skel->bss->test = t;
889	test_inet_dtime(family, SOCK_DGRAM, addr, 50000 + t);
890
891	ASSERT_EQ(dtimes[INGRESS_FWDNS_P100], 0,
892		  dtime_cnt_str(t, INGRESS_FWDNS_P100));
893	/* non mono delivery time is not forwarded */
894	ASSERT_EQ(dtimes[INGRESS_FWDNS_P101], 0,
895		  dtime_cnt_str(t, INGRESS_FWDNS_P101));
896	for (i = EGRESS_FWDNS_P100; i < SET_DTIME; i++)
897		ASSERT_GT(dtimes[i], 0, dtime_cnt_str(t, i));
898
899	for (i = INGRESS_FWDNS_P100; i < __MAX_CNT; i++)
900		ASSERT_EQ(errs[i], 0, dtime_err_str(t, i));
901}
902
903static void test_tc_redirect_dtime(struct netns_setup_result *setup_result)
904{
905	struct test_tc_dtime *skel;
906	struct nstoken *nstoken;
907	int hold_tstamp_fd, err;
908
909	/* Hold a sk with the SOCK_TIMESTAMP set to ensure there
910	 * is no delay in the kernel net_enable_timestamp().
911	 * This ensures the following tests must have
912	 * non zero rcv tstamp in the recvmsg().
913	 */
914	hold_tstamp_fd = wait_netstamp_needed_key();
915	if (!ASSERT_GE(hold_tstamp_fd, 0, "wait_netstamp_needed_key"))
916		return;
917
918	skel = test_tc_dtime__open();
919	if (!ASSERT_OK_PTR(skel, "test_tc_dtime__open"))
920		goto done;
921
922	skel->rodata->IFINDEX_SRC = setup_result->ifindex_src_fwd;
923	skel->rodata->IFINDEX_DST = setup_result->ifindex_dst_fwd;
924
925	err = test_tc_dtime__load(skel);
926	if (!ASSERT_OK(err, "test_tc_dtime__load"))
927		goto done;
928
929	if (netns_load_dtime_bpf(skel, setup_result))
930		goto done;
931
932	nstoken = open_netns(NS_FWD);
933	if (!ASSERT_OK_PTR(nstoken, "setns fwd"))
934		goto done;
935	err = set_forwarding(false);
936	close_netns(nstoken);
937	if (!ASSERT_OK(err, "disable forwarding"))
938		goto done;
939
940	test_tcp_clear_dtime(skel);
941
942	test_tcp_dtime(skel, AF_INET, true);
943	test_tcp_dtime(skel, AF_INET6, true);
944	test_udp_dtime(skel, AF_INET, true);
945	test_udp_dtime(skel, AF_INET6, true);
946
947	/* Test the kernel ip[6]_forward path instead
948	 * of bpf_redirect_neigh().
949	 */
950	nstoken = open_netns(NS_FWD);
951	if (!ASSERT_OK_PTR(nstoken, "setns fwd"))
952		goto done;
953	err = set_forwarding(true);
954	close_netns(nstoken);
955	if (!ASSERT_OK(err, "enable forwarding"))
956		goto done;
957
958	test_tcp_dtime(skel, AF_INET, false);
959	test_tcp_dtime(skel, AF_INET6, false);
960	test_udp_dtime(skel, AF_INET, false);
961	test_udp_dtime(skel, AF_INET6, false);
962
963done:
964	test_tc_dtime__destroy(skel);
965	close(hold_tstamp_fd);
966}
967
968static void test_tc_redirect_neigh_fib(struct netns_setup_result *setup_result)
969{
970	struct nstoken *nstoken = NULL;
971	struct test_tc_neigh_fib *skel = NULL;
972
973	nstoken = open_netns(NS_FWD);
974	if (!ASSERT_OK_PTR(nstoken, "setns fwd"))
975		return;
976
977	skel = test_tc_neigh_fib__open();
978	if (!ASSERT_OK_PTR(skel, "test_tc_neigh_fib__open"))
979		goto done;
980
981	if (!ASSERT_OK(test_tc_neigh_fib__load(skel), "test_tc_neigh_fib__load"))
982		goto done;
983
984	if (netns_load_bpf(skel->progs.tc_src, skel->progs.tc_dst,
985			   skel->progs.tc_chk, setup_result))
986		goto done;
987
988	/* bpf_fib_lookup() checks if forwarding is enabled */
989	if (!ASSERT_OK(set_forwarding(true), "enable forwarding"))
990		goto done;
991
992	test_connectivity();
993
994done:
995	if (skel)
996		test_tc_neigh_fib__destroy(skel);
997	close_netns(nstoken);
998}
999
1000static void test_tc_redirect_neigh(struct netns_setup_result *setup_result)
1001{
1002	struct nstoken *nstoken = NULL;
1003	struct test_tc_neigh *skel = NULL;
1004	int err;
1005
1006	nstoken = open_netns(NS_FWD);
1007	if (!ASSERT_OK_PTR(nstoken, "setns fwd"))
1008		return;
1009
1010	skel = test_tc_neigh__open();
1011	if (!ASSERT_OK_PTR(skel, "test_tc_neigh__open"))
1012		goto done;
1013
1014	skel->rodata->IFINDEX_SRC = setup_result->ifindex_src_fwd;
1015	skel->rodata->IFINDEX_DST = setup_result->ifindex_dst_fwd;
1016
1017	err = test_tc_neigh__load(skel);
1018	if (!ASSERT_OK(err, "test_tc_neigh__load"))
1019		goto done;
1020
1021	if (netns_load_bpf(skel->progs.tc_src, skel->progs.tc_dst,
1022			   skel->progs.tc_chk, setup_result))
1023		goto done;
1024
1025	if (!ASSERT_OK(set_forwarding(false), "disable forwarding"))
1026		goto done;
1027
1028	test_connectivity();
1029
1030done:
1031	if (skel)
1032		test_tc_neigh__destroy(skel);
1033	close_netns(nstoken);
1034}
1035
1036static void test_tc_redirect_peer(struct netns_setup_result *setup_result)
1037{
1038	struct nstoken *nstoken;
1039	struct test_tc_peer *skel;
1040	int err;
1041
1042	nstoken = open_netns(NS_FWD);
1043	if (!ASSERT_OK_PTR(nstoken, "setns fwd"))
1044		return;
1045
1046	skel = test_tc_peer__open();
1047	if (!ASSERT_OK_PTR(skel, "test_tc_peer__open"))
1048		goto done;
1049
1050	skel->rodata->IFINDEX_SRC = setup_result->ifindex_src_fwd;
1051	skel->rodata->IFINDEX_DST = setup_result->ifindex_dst_fwd;
1052
1053	err = test_tc_peer__load(skel);
1054	if (!ASSERT_OK(err, "test_tc_peer__load"))
1055		goto done;
1056
1057	if (netns_load_bpf(skel->progs.tc_src, skel->progs.tc_dst,
1058			   skel->progs.tc_chk, setup_result))
1059		goto done;
1060
1061	if (!ASSERT_OK(set_forwarding(false), "disable forwarding"))
1062		goto done;
1063
1064	test_connectivity();
1065
1066done:
1067	if (skel)
1068		test_tc_peer__destroy(skel);
1069	close_netns(nstoken);
1070}
1071
1072static int tun_open(char *name)
1073{
1074	struct ifreq ifr;
1075	int fd, err;
1076
1077	fd = open("/dev/net/tun", O_RDWR);
1078	if (!ASSERT_GE(fd, 0, "open /dev/net/tun"))
1079		return -1;
1080
1081	memset(&ifr, 0, sizeof(ifr));
1082
1083	ifr.ifr_flags = IFF_TUN | IFF_NO_PI;
1084	if (*name)
1085		strncpy(ifr.ifr_name, name, IFNAMSIZ);
1086
1087	err = ioctl(fd, TUNSETIFF, &ifr);
1088	if (!ASSERT_OK(err, "ioctl TUNSETIFF"))
1089		goto fail;
1090
1091	SYS(fail, "ip link set dev %s up", name);
1092
1093	return fd;
1094fail:
1095	close(fd);
1096	return -1;
1097}
1098
/* Direction in which tun_relay_loop() copies the packet it just picked up. */
enum {
	SRC_TO_TARGET,	/* 0: read from src_fd, write to target_fd */
	TARGET_TO_SRC,	/* 1: read from target_fd, write to src_fd */
};
1103
1104static int tun_relay_loop(int src_fd, int target_fd)
1105{
1106	fd_set rfds, wfds;
1107
1108	FD_ZERO(&rfds);
1109	FD_ZERO(&wfds);
1110
1111	for (;;) {
1112		char buf[1500];
1113		int direction, nread, nwrite;
1114
1115		FD_SET(src_fd, &rfds);
1116		FD_SET(target_fd, &rfds);
1117
1118		if (select(1 + MAX(src_fd, target_fd), &rfds, NULL, NULL, NULL) < 0) {
1119			log_err("select failed");
1120			return 1;
1121		}
1122
1123		direction = FD_ISSET(src_fd, &rfds) ? SRC_TO_TARGET : TARGET_TO_SRC;
1124
1125		nread = read(direction == SRC_TO_TARGET ? src_fd : target_fd, buf, sizeof(buf));
1126		if (nread < 0) {
1127			log_err("read failed");
1128			return 1;
1129		}
1130
1131		nwrite = write(direction == SRC_TO_TARGET ? target_fd : src_fd, buf, nread);
1132		if (nwrite != nread) {
1133			log_err("write failed");
1134			return 1;
1135		}
1136	}
1137}
1138
1139static void test_tc_redirect_peer_l3(struct netns_setup_result *setup_result)
1140{
1141	LIBBPF_OPTS(bpf_tc_hook, qdisc_tun_fwd);
1142	LIBBPF_OPTS(bpf_tc_hook, qdisc_dst_fwd);
1143	struct test_tc_peer *skel = NULL;
1144	struct nstoken *nstoken = NULL;
1145	int err;
1146	int tunnel_pid = -1;
1147	int src_fd, target_fd = -1;
1148	int ifindex;
1149
1150	/* Start a L3 TUN/TAP tunnel between the src and dst namespaces.
1151	 * This test is using TUN/TAP instead of e.g. IPIP or GRE tunnel as those
1152	 * expose the L2 headers encapsulating the IP packet to BPF and hence
1153	 * don't have skb in suitable state for this test. Alternative to TUN/TAP
1154	 * would be e.g. Wireguard which would appear as a pure L3 device to BPF,
1155	 * but that requires much more complicated setup.
1156	 */
1157	nstoken = open_netns(NS_SRC);
1158	if (!ASSERT_OK_PTR(nstoken, "setns " NS_SRC))
1159		return;
1160
1161	src_fd = tun_open("tun_src");
1162	if (!ASSERT_GE(src_fd, 0, "tun_open tun_src"))
1163		goto fail;
1164
1165	close_netns(nstoken);
1166
1167	nstoken = open_netns(NS_FWD);
1168	if (!ASSERT_OK_PTR(nstoken, "setns " NS_FWD))
1169		goto fail;
1170
1171	target_fd = tun_open("tun_fwd");
1172	if (!ASSERT_GE(target_fd, 0, "tun_open tun_fwd"))
1173		goto fail;
1174
1175	tunnel_pid = fork();
1176	if (!ASSERT_GE(tunnel_pid, 0, "fork tun_relay_loop"))
1177		goto fail;
1178
1179	if (tunnel_pid == 0)
1180		exit(tun_relay_loop(src_fd, target_fd));
1181
1182	skel = test_tc_peer__open();
1183	if (!ASSERT_OK_PTR(skel, "test_tc_peer__open"))
1184		goto fail;
1185
1186	ifindex = if_nametoindex("tun_fwd");
1187	if (!ASSERT_GT(ifindex, 0, "if_indextoname tun_fwd"))
1188		goto fail;
1189
1190	skel->rodata->IFINDEX_SRC = ifindex;
1191	skel->rodata->IFINDEX_DST = setup_result->ifindex_dst_fwd;
1192
1193	err = test_tc_peer__load(skel);
1194	if (!ASSERT_OK(err, "test_tc_peer__load"))
1195		goto fail;
1196
1197	/* Load "tc_src_l3" to the tun_fwd interface to redirect packets
1198	 * towards dst, and "tc_dst" to redirect packets
1199	 * and "tc_chk" on dst_fwd to drop non-redirected packets.
1200	 */
1201	/* tc qdisc add dev tun_fwd clsact */
1202	QDISC_CLSACT_CREATE(&qdisc_tun_fwd, ifindex);
1203	/* tc filter add dev tun_fwd ingress bpf da tc_src_l3 */
1204	XGRESS_FILTER_ADD(&qdisc_tun_fwd, BPF_TC_INGRESS, skel->progs.tc_src_l3, 0);
1205
1206	/* tc qdisc add dev dst_fwd clsact */
1207	QDISC_CLSACT_CREATE(&qdisc_dst_fwd, setup_result->ifindex_dst_fwd);
1208	/* tc filter add dev dst_fwd ingress bpf da tc_dst_l3 */
1209	XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_INGRESS, skel->progs.tc_dst_l3, 0);
1210	/* tc filter add dev dst_fwd egress bpf da tc_chk */
1211	XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_EGRESS, skel->progs.tc_chk, 0);
1212
1213	/* Setup route and neigh tables */
1214	SYS(fail, "ip -netns " NS_SRC " addr add dev tun_src " IP4_TUN_SRC "/24");
1215	SYS(fail, "ip -netns " NS_FWD " addr add dev tun_fwd " IP4_TUN_FWD "/24");
1216
1217	SYS(fail, "ip -netns " NS_SRC " addr add dev tun_src " IP6_TUN_SRC "/64 nodad");
1218	SYS(fail, "ip -netns " NS_FWD " addr add dev tun_fwd " IP6_TUN_FWD "/64 nodad");
1219
1220	SYS(fail, "ip -netns " NS_SRC " route del " IP4_DST "/32 dev src scope global");
1221	SYS(fail, "ip -netns " NS_SRC " route add " IP4_DST "/32 via " IP4_TUN_FWD
1222	    " dev tun_src scope global");
1223	SYS(fail, "ip -netns " NS_DST " route add " IP4_TUN_SRC "/32 dev dst scope global");
1224	SYS(fail, "ip -netns " NS_SRC " route del " IP6_DST "/128 dev src scope global");
1225	SYS(fail, "ip -netns " NS_SRC " route add " IP6_DST "/128 via " IP6_TUN_FWD
1226	    " dev tun_src scope global");
1227	SYS(fail, "ip -netns " NS_DST " route add " IP6_TUN_SRC "/128 dev dst scope global");
1228
1229	SYS(fail, "ip -netns " NS_DST " neigh add " IP4_TUN_SRC " dev dst lladdr " MAC_DST_FWD);
1230	SYS(fail, "ip -netns " NS_DST " neigh add " IP6_TUN_SRC " dev dst lladdr " MAC_DST_FWD);
1231
1232	if (!ASSERT_OK(set_forwarding(false), "disable forwarding"))
1233		goto fail;
1234
1235	test_connectivity();
1236
1237fail:
1238	if (tunnel_pid > 0) {
1239		kill(tunnel_pid, SIGTERM);
1240		waitpid(tunnel_pid, NULL, 0);
1241	}
1242	if (src_fd >= 0)
1243		close(src_fd);
1244	if (target_fd >= 0)
1245		close(target_fd);
1246	if (skel)
1247		test_tc_peer__destroy(skel);
1248	if (nstoken)
1249		close_netns(nstoken);
1250}
1251
/*
 * Run test_<name>() as the subtest "<name>" with .dev_mode set to <mode>.
 *
 * Nothing happens unless test__start_subtest() selects the subtest. The
 * namespaces are then created with netns_setup_namespaces("add"); the test
 * body only runs when netns_setup_links_and_routes() succeeds as well, and the
 * namespaces are deleted again whenever their creation succeeded.
 */
#define RUN_TEST(name, mode)								\
	({										\
		struct netns_setup_result setup_result = { .dev_mode = mode, };	\
		if (test__start_subtest(#name) &&					\
		    ASSERT_OK(netns_setup_namespaces("add"), "setup namespaces")) {	\
			if (ASSERT_OK(netns_setup_links_and_routes(&setup_result),	\
				      "setup links and routes"))			\
				test_ ## name(&setup_result);				\
			netns_setup_namespaces("delete");				\
		}									\
	})
1263
1264static void *test_tc_redirect_run_tests(void *arg)
1265{
1266	netns_setup_namespaces_nofail("delete");
1267
1268	RUN_TEST(tc_redirect_peer, MODE_VETH);
1269	RUN_TEST(tc_redirect_peer, MODE_NETKIT);
1270	RUN_TEST(tc_redirect_peer_l3, MODE_VETH);
1271	RUN_TEST(tc_redirect_peer_l3, MODE_NETKIT);
1272	RUN_TEST(tc_redirect_neigh, MODE_VETH);
1273	RUN_TEST(tc_redirect_neigh_fib, MODE_VETH);
1274	RUN_TEST(tc_redirect_dtime, MODE_VETH);
1275	return NULL;
1276}
1277
1278void test_tc_redirect(void)
1279{
1280	pthread_t test_thread;
1281	int err;
1282
1283	/* Run the tests in their own thread to isolate the namespace changes
1284	 * so they do not affect the environment of other tests.
1285	 * (specifically needed because of unshare(CLONE_NEWNS) in open_netns())
1286	 */
1287	err = pthread_create(&test_thread, NULL, &test_tc_redirect_run_tests, NULL);
1288	if (ASSERT_OK(err, "pthread_create"))
1289		ASSERT_OK(pthread_join(test_thread, NULL), "pthread_join");
1290}
1291