1// SPDX-License-Identifier: GPL-2.0-only
2/* Unstable Conntrack Helpers for XDP and TC-BPF hook
3 *
4 * These are called from the XDP and SCHED_CLS BPF programs. Note that it is
5 * allowed to break compatibility for these functions since the interface they
6 * are exposed through to BPF programs is explicitly unstable.
7 */
8
9#include <linux/bpf_verifier.h>
10#include <linux/bpf.h>
11#include <linux/btf.h>
12#include <linux/filter.h>
13#include <linux/mutex.h>
14#include <linux/types.h>
15#include <linux/btf_ids.h>
16#include <linux/net_namespace.h>
17#include <net/xdp.h>
18#include <net/netfilter/nf_conntrack_bpf.h>
19#include <net/netfilter/nf_conntrack_core.h>
20
/* bpf_ct_opts - Options for CT alloc and lookup helpers
 *
 * Members:
 * @netns_id   - Specify the network namespace for lookup
 *		 Values:
 *		   BPF_F_CURRENT_NETNS (-1)
 *		     Use namespace associated with ctx (xdp_md, __sk_buff)
 *		   [0, S32_MAX]
 *		     Network Namespace ID
 * @error      - Out parameter, set for any errors encountered
 *		 Values:
 *		   -EINVAL - Passed NULL for bpf_tuple pointer
 *		   -EINVAL - opts->reserved is not 0
 *		   -EINVAL - netns_id is less than -1
 *		   -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (12)
 *		   -EPROTO - l4proto isn't one of IPPROTO_TCP or IPPROTO_UDP
 *		   -ENONET - No network namespace found for netns_id
 *		   -ENOENT - Conntrack lookup could not find entry for tuple
 *		   -EAFNOSUPPORT - tuple__sz isn't one of sizeof(tuple->ipv4)
 *				   or sizeof(tuple->ipv6)
 * @l4proto    - Layer 4 protocol
 *		 Values:
 *		   IPPROTO_TCP, IPPROTO_UDP
 * @dir        - Connection tracking tuple direction. Out parameter of the
 *		 lookup helpers, which store the direction of the matched
 *		 tuple hash here; the alloc helpers do not read it.
 * @reserved   - Reserved member, will be reused for more options in future
 *		 Values:
 *		   0
 */
struct bpf_ct_opts {
	s32 netns_id;
	s32 error;
	u8 l4proto;
	u8 dir;
	u8 reserved[2];	/* both bytes must be 0, otherwise -EINVAL */
};
56
/* Size that callers must pass as opts__sz. The alloc and lookup helpers
 * reject any other value with -EINVAL.
 */
enum {
	NF_BPF_CT_OPTS_SZ = 12,
};
60
61static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple,
62				 u32 tuple_len, u8 protonum, u8 dir,
63				 struct nf_conntrack_tuple *tuple)
64{
65	union nf_inet_addr *src = dir ? &tuple->dst.u3 : &tuple->src.u3;
66	union nf_inet_addr *dst = dir ? &tuple->src.u3 : &tuple->dst.u3;
67	union nf_conntrack_man_proto *sport = dir ? (void *)&tuple->dst.u
68						  : &tuple->src.u;
69	union nf_conntrack_man_proto *dport = dir ? &tuple->src.u
70						  : (void *)&tuple->dst.u;
71
72	if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP))
73		return -EPROTO;
74
75	memset(tuple, 0, sizeof(*tuple));
76
77	switch (tuple_len) {
78	case sizeof(bpf_tuple->ipv4):
79		tuple->src.l3num = AF_INET;
80		src->ip = bpf_tuple->ipv4.saddr;
81		sport->tcp.port = bpf_tuple->ipv4.sport;
82		dst->ip = bpf_tuple->ipv4.daddr;
83		dport->tcp.port = bpf_tuple->ipv4.dport;
84		break;
85	case sizeof(bpf_tuple->ipv6):
86		tuple->src.l3num = AF_INET6;
87		memcpy(src->ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
88		sport->tcp.port = bpf_tuple->ipv6.sport;
89		memcpy(dst->ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
90		dport->tcp.port = bpf_tuple->ipv6.dport;
91		break;
92	default:
93		return -EAFNOSUPPORT;
94	}
95	tuple->dst.protonum = protonum;
96	tuple->dst.dir = dir;
97
98	return 0;
99}
100
101static struct nf_conn *
102__bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple,
103			u32 tuple_len, struct bpf_ct_opts *opts, u32 opts_len,
104			u32 timeout)
105{
106	struct nf_conntrack_tuple otuple, rtuple;
107	struct nf_conn *ct;
108	int err;
109
110	if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
111	    opts_len != NF_BPF_CT_OPTS_SZ)
112		return ERR_PTR(-EINVAL);
113
114	if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS))
115		return ERR_PTR(-EINVAL);
116
117	err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
118				    IP_CT_DIR_ORIGINAL, &otuple);
119	if (err < 0)
120		return ERR_PTR(err);
121
122	err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
123				    IP_CT_DIR_REPLY, &rtuple);
124	if (err < 0)
125		return ERR_PTR(err);
126
127	if (opts->netns_id >= 0) {
128		net = get_net_ns_by_id(net, opts->netns_id);
129		if (unlikely(!net))
130			return ERR_PTR(-ENONET);
131	}
132
133	ct = nf_conntrack_alloc(net, &nf_ct_zone_dflt, &otuple, &rtuple,
134				GFP_ATOMIC);
135	if (IS_ERR(ct))
136		goto out;
137
138	memset(&ct->proto, 0, sizeof(ct->proto));
139	__nf_ct_set_timeout(ct, timeout * HZ);
140
141out:
142	if (opts->netns_id >= 0)
143		put_net(net);
144
145	return ct;
146}
147
148static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
149					  struct bpf_sock_tuple *bpf_tuple,
150					  u32 tuple_len, struct bpf_ct_opts *opts,
151					  u32 opts_len)
152{
153	struct nf_conntrack_tuple_hash *hash;
154	struct nf_conntrack_tuple tuple;
155	struct nf_conn *ct;
156	int err;
157
158	if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
159	    opts_len != NF_BPF_CT_OPTS_SZ)
160		return ERR_PTR(-EINVAL);
161	if (unlikely(opts->l4proto != IPPROTO_TCP && opts->l4proto != IPPROTO_UDP))
162		return ERR_PTR(-EPROTO);
163	if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS))
164		return ERR_PTR(-EINVAL);
165
166	err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
167				    IP_CT_DIR_ORIGINAL, &tuple);
168	if (err < 0)
169		return ERR_PTR(err);
170
171	if (opts->netns_id >= 0) {
172		net = get_net_ns_by_id(net, opts->netns_id);
173		if (unlikely(!net))
174			return ERR_PTR(-ENONET);
175	}
176
177	hash = nf_conntrack_find_get(net, &nf_ct_zone_dflt, &tuple);
178	if (opts->netns_id >= 0)
179		put_net(net);
180	if (!hash)
181		return ERR_PTR(-ENOENT);
182
183	ct = nf_ct_tuplehash_to_ctrack(hash);
184	opts->dir = NF_CT_DIRECTION(hash);
185
186	return ct;
187}
188
/* BTF type IDs consulted by _nf_conntrack_btf_struct_access():
 * index 0 is struct nf_conn, index 1 is struct nf_conn___init.
 */
BTF_ID_LIST(btf_nf_conn_ids)
BTF_ID(struct, nf_conn)
BTF_ID(struct, nf_conn___init)
192
193/* Check writes into `struct nf_conn` */
194static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
195					   const struct bpf_reg_state *reg,
196					   int off, int size)
197{
198	const struct btf_type *ncit, *nct, *t;
199	size_t end;
200
201	ncit = btf_type_by_id(reg->btf, btf_nf_conn_ids[1]);
202	nct = btf_type_by_id(reg->btf, btf_nf_conn_ids[0]);
203	t = btf_type_by_id(reg->btf, reg->btf_id);
204	if (t != nct && t != ncit) {
205		bpf_log(log, "only read is supported\n");
206		return -EACCES;
207	}
208
209	/* `struct nf_conn` and `struct nf_conn___init` have the same layout
210	 * so we are safe to simply merge offset checks here
211	 */
212	switch (off) {
213#if defined(CONFIG_NF_CONNTRACK_MARK)
214	case offsetof(struct nf_conn, mark):
215		end = offsetofend(struct nf_conn, mark);
216		break;
217#endif
218	default:
219		bpf_log(log, "no write support to nf_conn at off %d\n", off);
220		return -EACCES;
221	}
222
223	if (off + size > end) {
224		bpf_log(log,
225			"write access at off %d with size %d beyond the member of nf_conn ended at %zu\n",
226			off, size, end);
227		return -EACCES;
228	}
229
230	return 0;
231}
232
233__bpf_kfunc_start_defs();
234
235/* bpf_xdp_ct_alloc - Allocate a new CT entry
236 *
237 * Parameters:
238 * @xdp_ctx	- Pointer to ctx (xdp_md) in XDP program
239 *		    Cannot be NULL
240 * @bpf_tuple	- Pointer to memory representing the tuple to look up
241 *		    Cannot be NULL
242 * @tuple__sz	- Length of the tuple structure
243 *		    Must be one of sizeof(bpf_tuple->ipv4) or
244 *		    sizeof(bpf_tuple->ipv6)
245 * @opts	- Additional options for allocation (documented above)
246 *		    Cannot be NULL
247 * @opts__sz	- Length of the bpf_ct_opts structure
248 *		    Must be NF_BPF_CT_OPTS_SZ (12)
249 */
250__bpf_kfunc struct nf_conn___init *
251bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
252		 u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
253{
254	struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
255	struct nf_conn *nfct;
256
257	nfct = __bpf_nf_ct_alloc_entry(dev_net(ctx->rxq->dev), bpf_tuple, tuple__sz,
258				       opts, opts__sz, 10);
259	if (IS_ERR(nfct)) {
260		if (opts)
261			opts->error = PTR_ERR(nfct);
262		return NULL;
263	}
264
265	return (struct nf_conn___init *)nfct;
266}
267
268/* bpf_xdp_ct_lookup - Lookup CT entry for the given tuple, and acquire a
269 *		       reference to it
270 *
271 * Parameters:
272 * @xdp_ctx	- Pointer to ctx (xdp_md) in XDP program
273 *		    Cannot be NULL
274 * @bpf_tuple	- Pointer to memory representing the tuple to look up
275 *		    Cannot be NULL
276 * @tuple__sz	- Length of the tuple structure
277 *		    Must be one of sizeof(bpf_tuple->ipv4) or
278 *		    sizeof(bpf_tuple->ipv6)
279 * @opts	- Additional options for lookup (documented above)
280 *		    Cannot be NULL
281 * @opts__sz	- Length of the bpf_ct_opts structure
282 *		    Must be NF_BPF_CT_OPTS_SZ (12)
283 */
284__bpf_kfunc struct nf_conn *
285bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
286		  u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
287{
288	struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
289	struct net *caller_net;
290	struct nf_conn *nfct;
291
292	caller_net = dev_net(ctx->rxq->dev);
293	nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz);
294	if (IS_ERR(nfct)) {
295		if (opts)
296			opts->error = PTR_ERR(nfct);
297		return NULL;
298	}
299	return nfct;
300}
301
302/* bpf_skb_ct_alloc - Allocate a new CT entry
303 *
304 * Parameters:
305 * @skb_ctx	- Pointer to ctx (__sk_buff) in TC program
306 *		    Cannot be NULL
307 * @bpf_tuple	- Pointer to memory representing the tuple to look up
308 *		    Cannot be NULL
309 * @tuple__sz	- Length of the tuple structure
310 *		    Must be one of sizeof(bpf_tuple->ipv4) or
311 *		    sizeof(bpf_tuple->ipv6)
312 * @opts	- Additional options for allocation (documented above)
313 *		    Cannot be NULL
314 * @opts__sz	- Length of the bpf_ct_opts structure
315 *		    Must be NF_BPF_CT_OPTS_SZ (12)
316 */
317__bpf_kfunc struct nf_conn___init *
318bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
319		 u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
320{
321	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
322	struct nf_conn *nfct;
323	struct net *net;
324
325	net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
326	nfct = __bpf_nf_ct_alloc_entry(net, bpf_tuple, tuple__sz, opts, opts__sz, 10);
327	if (IS_ERR(nfct)) {
328		if (opts)
329			opts->error = PTR_ERR(nfct);
330		return NULL;
331	}
332
333	return (struct nf_conn___init *)nfct;
334}
335
336/* bpf_skb_ct_lookup - Lookup CT entry for the given tuple, and acquire a
337 *		       reference to it
338 *
339 * Parameters:
340 * @skb_ctx	- Pointer to ctx (__sk_buff) in TC program
341 *		    Cannot be NULL
342 * @bpf_tuple	- Pointer to memory representing the tuple to look up
343 *		    Cannot be NULL
344 * @tuple__sz	- Length of the tuple structure
345 *		    Must be one of sizeof(bpf_tuple->ipv4) or
346 *		    sizeof(bpf_tuple->ipv6)
347 * @opts	- Additional options for lookup (documented above)
348 *		    Cannot be NULL
349 * @opts__sz	- Length of the bpf_ct_opts structure
350 *		    Must be NF_BPF_CT_OPTS_SZ (12)
351 */
352__bpf_kfunc struct nf_conn *
353bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
354		  u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
355{
356	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
357	struct net *caller_net;
358	struct nf_conn *nfct;
359
360	caller_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
361	nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz);
362	if (IS_ERR(nfct)) {
363		if (opts)
364			opts->error = PTR_ERR(nfct);
365		return NULL;
366	}
367	return nfct;
368}
369
370/* bpf_ct_insert_entry - Add the provided entry into a CT map
371 *
372 * This must be invoked for referenced PTR_TO_BTF_ID.
373 *
374 * @nfct	 - Pointer to referenced nf_conn___init object, obtained
375 *		   using bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
376 */
377__bpf_kfunc struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i)
378{
379	struct nf_conn *nfct = (struct nf_conn *)nfct_i;
380	int err;
381
382	if (!nf_ct_is_confirmed(nfct))
383		nfct->timeout += nfct_time_stamp;
384	nfct->status |= IPS_CONFIRMED;
385	err = nf_conntrack_hash_check_insert(nfct);
386	if (err < 0) {
387		nf_conntrack_free(nfct);
388		return NULL;
389	}
390	return nfct;
391}
392
/* bpf_ct_release - Release acquired nf_conn object
 *
 * This must be invoked for referenced PTR_TO_BTF_ID, and the verifier rejects
 * the program if any references remain in the program in all of the explored
 * states.
 *
 * Parameters:
 * @nfct	 - Pointer to referenced nf_conn object, obtained using
 *		   bpf_ct_insert_entry, bpf_xdp_ct_lookup or
 *		   bpf_skb_ct_lookup.
 */
__bpf_kfunc void bpf_ct_release(struct nf_conn *nfct)
{
	nf_ct_put(nfct);
}
407
408/* bpf_ct_set_timeout - Set timeout of allocated nf_conn
409 *
410 * Sets the default timeout of newly allocated nf_conn before insertion.
411 * This helper must be invoked for refcounted pointer to nf_conn___init.
412 *
413 * Parameters:
414 * @nfct	 - Pointer to referenced nf_conn object, obtained using
415 *                 bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
416 * @timeout      - Timeout in msecs.
417 */
418__bpf_kfunc void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout)
419{
420	__nf_ct_set_timeout((struct nf_conn *)nfct, msecs_to_jiffies(timeout));
421}
422
423/* bpf_ct_change_timeout - Change timeout of inserted nf_conn
424 *
425 * Change timeout associated of the inserted or looked up nf_conn.
426 * This helper must be invoked for refcounted pointer to nf_conn.
427 *
428 * Parameters:
429 * @nfct	 - Pointer to referenced nf_conn object, obtained using
430 *		   bpf_ct_insert_entry, bpf_xdp_ct_lookup, or bpf_skb_ct_lookup.
431 * @timeout      - New timeout in msecs.
432 */
433__bpf_kfunc int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout)
434{
435	return __nf_ct_change_timeout(nfct, msecs_to_jiffies(timeout));
436}
437
438/* bpf_ct_set_status - Set status field of allocated nf_conn
439 *
440 * Set the status field of the newly allocated nf_conn before insertion.
441 * This must be invoked for referenced PTR_TO_BTF_ID to nf_conn___init.
442 *
443 * Parameters:
444 * @nfct	 - Pointer to referenced nf_conn object, obtained using
445 *		   bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
446 * @status       - New status value.
447 */
448__bpf_kfunc int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status)
449{
450	return nf_ct_change_status_common((struct nf_conn *)nfct, status);
451}
452
/* bpf_ct_change_status - Change status of inserted nf_conn
 *
 * Change the status field of the provided connection tracking entry.
 * This must be invoked for referenced PTR_TO_BTF_ID to nf_conn.
 *
 * Parameters:
 * @nfct	 - Pointer to referenced nf_conn object, obtained using
 *		   bpf_ct_insert_entry, bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
 * @status       - New status value.
 *
 * Returns the result of nf_ct_change_status_common().
 */
__bpf_kfunc int bpf_ct_change_status(struct nf_conn *nfct, u32 status)
{
	return nf_ct_change_status_common(nfct, status);
}
467
468__bpf_kfunc_end_defs();
469
/* kfuncs exported by this file together with their verifier flags.
 *
 * The alloc and lookup kfuncs are flagged KF_ACQUIRE | KF_RET_NULL.
 * bpf_ct_insert_entry is additionally flagged KF_RELEASE, and bpf_ct_release
 * is flagged KF_RELEASE only. The timeout and status kfuncs are flagged
 * KF_TRUSTED_ARGS.
 */
BTF_KFUNCS_START(nf_ct_kfunc_set)
BTF_ID_FLAGS(func, bpf_xdp_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_xdp_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_skb_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_ct_insert_entry, KF_ACQUIRE | KF_RET_NULL | KF_RELEASE)
BTF_ID_FLAGS(func, bpf_ct_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(nf_ct_kfunc_set)
482
/* kfunc ID set owned by this module; register_nf_conntrack_bpf() registers
 * it for the XDP and SCHED_CLS program types.
 */
static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = {
	.owner = THIS_MODULE,
	.set   = &nf_ct_kfunc_set,
};
487
488int register_nf_conntrack_bpf(void)
489{
490	int ret;
491
492	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &nf_conntrack_kfunc_set);
493	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_kfunc_set);
494	if (!ret) {
495		mutex_lock(&nf_conn_btf_access_lock);
496		nfct_btf_struct_access = _nf_conntrack_btf_struct_access;
497		mutex_unlock(&nf_conn_btf_access_lock);
498	}
499
500	return ret;
501}
502
/* cleanup_nf_conntrack_bpf - Remove the nf_conn write-access checker
 *
 * Clears nfct_btf_struct_access under nf_conn_btf_access_lock, undoing the
 * assignment made by register_nf_conntrack_bpf().
 */
void cleanup_nf_conntrack_bpf(void)
{
	mutex_lock(&nf_conn_btf_access_lock);
	nfct_btf_struct_access = NULL;
	mutex_unlock(&nf_conn_btf_access_lock);
}
509