1/*-
2 * Copyright (c) 2014-2020 Mindaugas Rasiukevicius <rmind at noxt eu>
3 * Copyright (c) 2010-2014 The NetBSD Foundation, Inc.
4 * All rights reserved.
5 *
6 * This material is based upon work partially supported by The
7 * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
19 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
22 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 */
30
31/*
32 * NPF connection tracking for stateful filtering and translation.
33 *
34 * Overview
35 *
36 *	Packets can be incoming or outgoing with respect to an interface.
37 *	Connection direction is identified by the direction of its first
38 *	packet.  The meaning of incoming/outgoing packet in the context of
39 *	connection direction can be confusing.  Therefore, we will use the
40 *	terms "forwards stream" and "backwards stream", where packets in
41 *	the forwards stream mean the packets travelling in the direction
42 *	as the connection direction.
43 *
44 *	All connections have two keys and thus two entries:
45 *
46 *	- npf_conn_getforwkey(con)        -- for the forwards stream;
47 *	- npf_conn_getbackkey(con, alen)  -- for the backwards stream.
48 *
49 *	Note: the keys are stored in npf_conn_t::c_keys[], which is used
50 *	to allocate variable-length npf_conn_t structures based on whether
51 *	the IPv4 or IPv6 addresses are used.
52 *
53 *	The key is an n-tuple used to identify the connection flow: see the
54 *	npf_connkey.c source file for the description of the key layouts.
55 *	The key may be formed using translated values in a case of NAT.
56 *
57 *	Connections can serve two purposes: for the implicit passing and/or
58 *	to accommodate the dynamic NAT.  Connections for the former purpose
59 *	are created by the rules with "stateful" attribute and are used for
60 *	stateful filtering.  Such connections indicate that the packet of
61 *	the backwards stream should be passed without inspection of the
62 *	ruleset.  The other purpose is to associate a dynamic NAT mechanism
63 *	with a connection.  Such connections are created by the NAT policies
64 *	and they have a relationship with NAT translation structure via
65 *	npf_conn_t::c_nat.  A single connection can serve both purposes,
66 *	which is a common case.
67 *
68 * Connection life-cycle
69 *
70 *	Connections are established when a packet matches said rule or
71 *	NAT policy.  Both keys of the established connection are inserted
72 *	into the connection database.  A garbage collection thread
73 *	periodically scans all connections and depending on connection
74 *	properties (e.g. last activity time, protocol) removes connection
75 *	entries and expires the actual connections.
76 *
77 *	Each connection has a reference count.  The reference is acquired
78 *	on lookup and should be released by the caller.  It guarantees that
79 *	the connection will not be destroyed, although it may be expired.
80 *
81 * Synchronization
82 *
83 *	Connection database is accessed in a lock-free manner by the main
84 *	routines: npf_conn_inspect() and npf_conn_establish().  Since they
85 *	are always called from a software interrupt, the database is
86 *	protected using EBR.  The main place which can destroy a connection
87 *	is npf_conn_worker().  The database itself can be replaced and
88 *	destroyed in npf_conn_reload().
89 *
90 * ALG support
91 *
92 *	Application-level gateways (ALGs) can override generic connection
93 *	inspection (npf_alg_conn() call in npf_conn_inspect() function) by
94 *	performing their own lookup using different key.  Recursive call
95 *	to npf_conn_inspect() is not allowed.  The ALGs ought to use the
96 *	npf_conn_lookup() function for this purpose.
97 *
98 * Lock order
99 *
100 *	npf_t::config_lock ->
101 *		conn_lock ->
102 *			npf_conn_t::c_lock
103 */
104
105#ifdef _KERNEL
106#include <sys/cdefs.h>
107__KERNEL_RCSID(0, "$NetBSD: npf_conn.c,v 1.35 2023/01/22 18:39:35 riastradh Exp $");
108
109#include <sys/param.h>
110#include <sys/types.h>
111
112#include <netinet/in.h>
113#include <netinet/tcp.h>
114
115#include <sys/atomic.h>
116#include <sys/kmem.h>
117#include <sys/mutex.h>
118#include <net/pfil.h>
119#include <sys/pool.h>
120#include <sys/queue.h>
121#include <sys/systm.h>
122#endif
123
124#define __NPF_CONN_PRIVATE
125#include "npf_conn.h"
126#include "npf_impl.h"
127
128/* A helper to select the IPv4 or IPv6 connection cache. */
129#define	NPF_CONNCACHE(alen)	(((alen) >> 4) & 0x1)
130
131/*
132 * Connection flags: PFIL_IN and PFIL_OUT values are reserved for direction.
133 */
134CTASSERT(PFIL_ALL == (0x001 | 0x002));
135#define	CONN_ACTIVE	0x004	/* visible on inspection */
136#define	CONN_PASS	0x008	/* perform implicit passing */
137#define	CONN_EXPIRE	0x010	/* explicitly expire */
138#define	CONN_REMOVED	0x020	/* "forw/back" entries removed */
139
140enum { CONN_TRACKING_OFF, CONN_TRACKING_ON };
141
142static int	npf_conn_export(npf_t *, npf_conn_t *, nvlist_t *);
143
144/*
145 * npf_conn_sys{init,fini}: initialize/destroy connection tracking.
146 */
147
148void
149npf_conn_init(npf_t *npf)
150{
151	npf_conn_params_t *params = npf_param_allocgroup(npf,
152	    NPF_PARAMS_CONN, sizeof(npf_conn_params_t));
153	npf_param_t param_map[] = {
154		{
155			"state.key.interface",
156			&params->connkey_interface,
157			.default_val = 1, // true
158			.min = 0, .max = 1
159		},
160		{
161			"state.key.direction",
162			&params->connkey_direction,
163			.default_val = 1, // true
164			.min = 0, .max = 1
165		},
166	};
167	npf_param_register(npf, param_map, __arraycount(param_map));
168
169	npf->conn_cache[0] = pool_cache_init(
170	    offsetof(npf_conn_t, c_keys[NPF_CONNKEY_V4WORDS * 2]),
171	    0, 0, 0, "npfcn4pl", NULL, IPL_NET, NULL, NULL, NULL);
172	npf->conn_cache[1] = pool_cache_init(
173	    offsetof(npf_conn_t, c_keys[NPF_CONNKEY_V6WORDS * 2]),
174	    0, 0, 0, "npfcn6pl", NULL, IPL_NET, NULL, NULL, NULL);
175
176	mutex_init(&npf->conn_lock, MUTEX_DEFAULT, IPL_NONE);
177	atomic_store_relaxed(&npf->conn_tracking, CONN_TRACKING_OFF);
178	npf->conn_db = npf_conndb_create();
179	npf_conndb_sysinit(npf);
180
181	npf_worker_addfunc(npf, npf_conn_worker);
182}
183
/*
 * npf_conn_fini: destroy the connection tracking state.
 *
 * => The caller must have disabled tracking and flushed all connections.
 */
void
npf_conn_fini(npf_t *npf)
{
	const size_t len = sizeof(npf_conn_params_t);

	/* Note: the caller should have flushed the connections. */
	KASSERT(atomic_load_relaxed(&npf->conn_tracking) == CONN_TRACKING_OFF);

	/* Destroy the database and release the backing pool caches. */
	npf_conndb_destroy(npf->conn_db);
	pool_cache_destroy(npf->conn_cache[0]);
	pool_cache_destroy(npf->conn_cache[1]);
	mutex_destroy(&npf->conn_lock);

	npf_param_freegroup(npf, NPF_PARAMS_CONN, len);
	npf_conndb_sysfini(npf);
}
200
201/*
202 * npf_conn_load: perform the load by flushing the current connection
203 * database and replacing it with the new one or just destroying.
204 *
205 * => The caller must disable the connection tracking and ensure that
206 *    there are no connection database lookups or references in-flight.
207 */
208void
209npf_conn_load(npf_t *npf, npf_conndb_t *ndb, bool track)
210{
211	npf_conndb_t *odb = NULL;
212
213	KASSERT(npf_config_locked_p(npf));
214
215	/*
216	 * The connection database is in the quiescent state.
217	 * Prevent G/C thread from running and install a new database.
218	 */
219	mutex_enter(&npf->conn_lock);
220	if (ndb) {
221		KASSERT(atomic_load_relaxed(&npf->conn_tracking)
222		    == CONN_TRACKING_OFF);
223		odb = atomic_load_relaxed(&npf->conn_db);
224		atomic_store_release(&npf->conn_db, ndb);
225	}
226	if (track) {
227		/* After this point lookups start flying in. */
228		membar_producer();
229		atomic_store_relaxed(&npf->conn_tracking, CONN_TRACKING_ON);
230	}
231	mutex_exit(&npf->conn_lock);
232
233	if (odb) {
234		/*
235		 * Flush all, no sync since the caller did it for us.
236		 * Also, release the pool cache memory.
237		 */
238		npf_conndb_gc(npf, odb, true, false);
239		npf_conndb_destroy(odb);
240		pool_cache_invalidate(npf->conn_cache[0]);
241		pool_cache_invalidate(npf->conn_cache[1]);
242	}
243}
244
245/*
246 * npf_conn_tracking: enable/disable connection tracking.
247 */
248void
249npf_conn_tracking(npf_t *npf, bool track)
250{
251	KASSERT(npf_config_locked_p(npf));
252	atomic_store_relaxed(&npf->conn_tracking,
253	    track ? CONN_TRACKING_ON : CONN_TRACKING_OFF);
254}
255
256static inline bool
257npf_conn_trackable_p(const npf_cache_t *npc)
258{
259	const npf_t *npf = npc->npc_ctx;
260
261	/*
262	 * Check if connection tracking is on.  Also, if layer 3 and 4 are
263	 * not cached - protocol is not supported or packet is invalid.
264	 */
265	if (atomic_load_relaxed(&npf->conn_tracking) != CONN_TRACKING_ON) {
266		return false;
267	}
268	if (!npf_iscached(npc, NPC_IP46) || !npf_iscached(npc, NPC_LAYER4)) {
269		return false;
270	}
271	return true;
272}
273
274static inline void
275conn_update_atime(npf_conn_t *con)
276{
277	struct timespec tsnow;
278
279	getnanouptime(&tsnow);
280	atomic_store_relaxed(&con->c_atime, tsnow.tv_sec);
281}
282
283/*
284 * npf_conn_check: check that:
285 *
286 *	- the connection is active;
287 *
288 *	- the packet is travelling in the right direction with the respect
289 *	  to the connection direction (if interface-id is not zero);
290 *
291 *	- the packet is travelling on the same interface as the
292 *	  connection interface (if interface-id is not zero).
293 */
294static bool
295npf_conn_check(const npf_conn_t *con, const nbuf_t *nbuf,
296    const unsigned di, const npf_flow_t flow)
297{
298	const uint32_t flags = atomic_load_relaxed(&con->c_flags);
299	const unsigned ifid = atomic_load_relaxed(&con->c_ifid);
300	bool active;
301
302	active = (flags & (CONN_ACTIVE | CONN_EXPIRE)) == CONN_ACTIVE;
303	if (__predict_false(!active)) {
304		return false;
305	}
306	if (ifid && nbuf) {
307		const bool match = (flags & PFIL_ALL) == di;
308		npf_flow_t pflow = match ? NPF_FLOW_FORW : NPF_FLOW_BACK;
309
310		if (__predict_false(flow != pflow)) {
311			return false;
312		}
313		if (__predict_false(ifid != nbuf->nb_ifid)) {
314			return false;
315		}
316	}
317	return true;
318}
319
320/*
321 * npf_conn_lookup: lookup if there is an established connection.
322 *
323 * => If found, we will hold a reference for the caller.
324 */
325npf_conn_t *
326npf_conn_lookup(const npf_cache_t *npc, const unsigned di, npf_flow_t *flow)
327{
328	npf_t *npf = npc->npc_ctx;
329	const nbuf_t *nbuf = npc->npc_nbuf;
330	npf_conn_t *con;
331	npf_connkey_t key;
332
333	/* Construct a key and lookup for a connection in the store. */
334	if (!npf_conn_conkey(npc, &key, di, NPF_FLOW_FORW)) {
335		return NULL;
336	}
337	con = npf_conndb_lookup(npf, &key, flow);
338	if (con == NULL) {
339		return NULL;
340	}
341	KASSERT(npc->npc_proto == atomic_load_relaxed(&con->c_proto));
342
343	/* Extra checks for the connection and packet. */
344	if (!npf_conn_check(con, nbuf, di, *flow)) {
345		atomic_dec_uint(&con->c_refcnt);
346		return NULL;
347	}
348
349	/* Update the last activity time. */
350	conn_update_atime(con);
351	return con;
352}
353
354/*
355 * npf_conn_inspect: lookup a connection and inspecting the protocol data.
356 *
357 * => If found, we will hold a reference for the caller.
358 */
359npf_conn_t *
360npf_conn_inspect(npf_cache_t *npc, const unsigned di, int *error)
361{
362	nbuf_t *nbuf = npc->npc_nbuf;
363	npf_flow_t flow;
364	npf_conn_t *con;
365	bool ok;
366
367	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
368	if (!npf_conn_trackable_p(npc)) {
369		return NULL;
370	}
371
372	/* Query ALG which may lookup connection for us. */
373	if ((con = npf_alg_conn(npc, di)) != NULL) {
374		/* Note: reference is held. */
375		return con;
376	}
377	if (nbuf_head_mbuf(nbuf) == NULL) {
378		*error = ENOMEM;
379		return NULL;
380	}
381	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
382
383	/* The main lookup of the connection (acquires a reference). */
384	if ((con = npf_conn_lookup(npc, di, &flow)) == NULL) {
385		return NULL;
386	}
387
388	/* Inspect the protocol data and handle state changes. */
389	mutex_enter(&con->c_lock);
390	ok = npf_state_inspect(npc, &con->c_state, flow);
391	mutex_exit(&con->c_lock);
392
393	/* If invalid state: let the rules deal with it. */
394	if (__predict_false(!ok)) {
395		npf_conn_release(con);
396		npf_stats_inc(npc->npc_ctx, NPF_STAT_INVALID_STATE);
397		return NULL;
398	}
399#if 0
400	/*
401	 * TODO -- determine when this might be wanted/used.
402	 *
403	 * Note: skipping the connection lookup and ruleset inspection
404	 * on other interfaces will also bypass dynamic NAT.
405	 */
406	if (atomic_load_relaxed(&con->c_flags) & CONN_GPASS) {
407		/*
408		 * Note: if tagging fails, then give this packet a chance
409		 * to go through a regular ruleset.
410		 */
411		(void)nbuf_add_tag(nbuf, NPF_NTAG_PASS);
412	}
413#endif
414	return con;
415}
416
417/*
418 * npf_conn_establish: create a new connection, insert into the global list.
419 *
420 * => Connection is created with the reference held for the caller.
421 * => Connection will be activated on the first reference release.
422 */
423npf_conn_t *
424npf_conn_establish(npf_cache_t *npc, const unsigned di, bool global)
425{
426	npf_t *npf = npc->npc_ctx;
427	const unsigned alen = npc->npc_alen;
428	const unsigned idx = NPF_CONNCACHE(alen);
429	const nbuf_t *nbuf = npc->npc_nbuf;
430	npf_connkey_t *fw, *bk;
431	npf_conndb_t *conn_db;
432	npf_conn_t *con;
433	int error = 0;
434
435	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
436
437	if (!npf_conn_trackable_p(npc)) {
438		return NULL;
439	}
440
441	/* Allocate and initialize the new connection. */
442	con = pool_cache_get(npf->conn_cache[idx], PR_NOWAIT);
443	if (__predict_false(!con)) {
444		npf_worker_signal(npf);
445		return NULL;
446	}
447	NPF_PRINTF(("NPF: create conn %p\n", con));
448	npf_stats_inc(npf, NPF_STAT_CONN_CREATE);
449
450	mutex_init(&con->c_lock, MUTEX_DEFAULT, IPL_SOFTNET);
451	atomic_store_relaxed(&con->c_flags, di & PFIL_ALL);
452	atomic_store_relaxed(&con->c_refcnt, 0);
453	con->c_rproc = NULL;
454	con->c_nat = NULL;
455
456	con->c_proto = npc->npc_proto;
457	CTASSERT(sizeof(con->c_proto) >= sizeof(npc->npc_proto));
458	con->c_alen = alen;
459
460	/* Initialize the protocol state. */
461	if (!npf_state_init(npc, &con->c_state)) {
462		npf_conn_destroy(npf, con);
463		return NULL;
464	}
465	KASSERT(npf_iscached(npc, NPC_IP46));
466
467	fw = npf_conn_getforwkey(con);
468	bk = npf_conn_getbackkey(con, alen);
469
470	/*
471	 * Construct "forwards" and "backwards" keys.  Also, set the
472	 * interface ID for this connection (unless it is global).
473	 */
474	if (!npf_conn_conkey(npc, fw, di, NPF_FLOW_FORW) ||
475	    !npf_conn_conkey(npc, bk, di ^ PFIL_ALL, NPF_FLOW_BACK)) {
476		npf_conn_destroy(npf, con);
477		return NULL;
478	}
479	con->c_ifid = global ? nbuf->nb_ifid : 0;
480
481	/*
482	 * Set last activity time for a new connection and acquire
483	 * a reference for the caller before we make it visible.
484	 */
485	conn_update_atime(con);
486	atomic_store_relaxed(&con->c_refcnt, 1);
487
488	/*
489	 * Insert both keys (entries representing directions) of the
490	 * connection.  At this point it becomes visible, but we activate
491	 * the connection later.
492	 */
493	mutex_enter(&con->c_lock);
494	conn_db = atomic_load_consume(&npf->conn_db);
495	if (!npf_conndb_insert(conn_db, fw, con, NPF_FLOW_FORW)) {
496		error = EISCONN;
497		goto err;
498	}
499	if (!npf_conndb_insert(conn_db, bk, con, NPF_FLOW_BACK)) {
500		npf_conn_t *ret __diagused;
501		ret = npf_conndb_remove(conn_db, fw);
502		KASSERT(ret == con);
503		error = EISCONN;
504		goto err;
505	}
506err:
507	/*
508	 * If we have hit the duplicate: mark the connection as expired
509	 * and let the G/C thread to take care of it.  We cannot do it
510	 * here since there might be references acquired already.
511	 */
512	if (error) {
513		atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
514		atomic_dec_uint(&con->c_refcnt);
515		npf_stats_inc(npf, NPF_STAT_RACE_CONN);
516	} else {
517		NPF_PRINTF(("NPF: establish conn %p\n", con));
518	}
519
520	/* Finally, insert into the connection list. */
521	npf_conndb_enqueue(conn_db, con);
522	mutex_exit(&con->c_lock);
523
524	return error ? NULL : con;
525}
526
/*
 * npf_conn_destroy: release all resources associated with the connection
 * and return its memory to the pool cache.
 *
 * => The connection must have no references and must already be removed
 *    from the connection database.
 */
void
npf_conn_destroy(npf_t *npf, npf_conn_t *con)
{
	/* Select the IPv4 or IPv6 pool based on the key address length. */
	const unsigned idx __unused = NPF_CONNCACHE(con->c_alen);

	KASSERT(atomic_load_relaxed(&con->c_refcnt) == 0);

	if (con->c_nat) {
		/* Release any NAT structures. */
		npf_nat_destroy(con, con->c_nat);
	}
	if (con->c_rproc) {
		/* Release the rule procedure. */
		npf_rproc_release(con->c_rproc);
	}

	/* Destroy the state. */
	npf_state_destroy(&con->c_state);
	mutex_destroy(&con->c_lock);

	/* Free the structure, increase the counter. */
	pool_cache_put(npf->conn_cache[idx], con);
	npf_stats_inc(npf, NPF_STAT_CONN_DESTROY);
	NPF_PRINTF(("NPF: conn %p destroyed\n", con));
}
552
553/*
554 * npf_conn_setnat: associate NAT entry with the connection, update and
555 * re-insert connection entry using the translation values.
556 *
557 * => The caller must be holding a reference.
558 */
559int
560npf_conn_setnat(const npf_cache_t *npc, npf_conn_t *con,
561    npf_nat_t *nt, unsigned ntype)
562{
563	static const unsigned nat_type_which[] = {
564		/* See the description in npf_nat_which(). */
565		[NPF_NATOUT] = NPF_DST,
566		[NPF_NATIN] = NPF_SRC,
567	};
568	npf_t *npf = npc->npc_ctx;
569	npf_conn_t *ret __diagused;
570	npf_conndb_t *conn_db;
571	npf_connkey_t *bk;
572	npf_addr_t *taddr;
573	in_port_t tport;
574	uint32_t flags;
575
576	KASSERT(atomic_load_relaxed(&con->c_refcnt) > 0);
577
578	npf_nat_gettrans(nt, &taddr, &tport);
579	KASSERT(ntype == NPF_NATOUT || ntype == NPF_NATIN);
580
581	/* Acquire the lock and check for the races. */
582	mutex_enter(&con->c_lock);
583	flags = atomic_load_relaxed(&con->c_flags);
584	if (__predict_false(flags & CONN_EXPIRE)) {
585		/* The connection got expired. */
586		mutex_exit(&con->c_lock);
587		return EINVAL;
588	}
589	KASSERT((flags & CONN_REMOVED) == 0);
590
591	if (__predict_false(con->c_nat != NULL)) {
592		/* Race with a duplicate packet. */
593		mutex_exit(&con->c_lock);
594		npf_stats_inc(npc->npc_ctx, NPF_STAT_RACE_NAT);
595		return EISCONN;
596	}
597
598	/* Remove the "backwards" key. */
599	conn_db = atomic_load_consume(&npf->conn_db);
600	bk = npf_conn_getbackkey(con, con->c_alen);
601	ret = npf_conndb_remove(conn_db, bk);
602	KASSERT(ret == con);
603
604	/* Set the source/destination IDs to the translation values. */
605	npf_conn_adjkey(bk, taddr, tport, nat_type_which[ntype]);
606
607	/* Finally, re-insert the "backwards" key. */
608	if (!npf_conndb_insert(conn_db, bk, con, NPF_FLOW_BACK)) {
609		/*
610		 * Race: we have hit the duplicate, remove the "forwards"
611		 * key and expire our connection; it is no longer valid.
612		 */
613		npf_connkey_t *fw = npf_conn_getforwkey(con);
614		ret = npf_conndb_remove(conn_db, fw);
615		KASSERT(ret == con);
616
617		atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
618		mutex_exit(&con->c_lock);
619
620		npf_stats_inc(npc->npc_ctx, NPF_STAT_RACE_NAT);
621		return EISCONN;
622	}
623
624	/* Associate the NAT entry and release the lock. */
625	con->c_nat = nt;
626	mutex_exit(&con->c_lock);
627	return 0;
628}
629
630/*
631 * npf_conn_expire: explicitly mark connection as expired.
632 *
633 * => Must be called with: a) reference held  b) the relevant lock held.
634 *    The relevant lock should prevent from connection destruction, e.g.
635 *    npf_t::conn_lock or npf_natpolicy_t::n_lock.
636 */
637void
638npf_conn_expire(npf_conn_t *con)
639{
640	atomic_or_uint(&con->c_flags, CONN_EXPIRE);
641}
642
643/*
644 * npf_conn_pass: return true if connection is "pass" one, otherwise false.
645 */
646bool
647npf_conn_pass(const npf_conn_t *con, npf_match_info_t *mi, npf_rproc_t **rp)
648{
649	KASSERT(atomic_load_relaxed(&con->c_refcnt) > 0);
650	if (__predict_true(atomic_load_relaxed(&con->c_flags) & CONN_PASS)) {
651		mi->mi_retfl = atomic_load_relaxed(&con->c_retfl);
652		mi->mi_rid = con->c_rid;
653		*rp = con->c_rproc;
654		return true;
655	}
656	return false;
657}
658
659/*
660 * npf_conn_setpass: mark connection as a "pass" one and associate the
661 * rule procedure with it.
662 */
663void
664npf_conn_setpass(npf_conn_t *con, const npf_match_info_t *mi, npf_rproc_t *rp)
665{
666	KASSERT((atomic_load_relaxed(&con->c_flags) & CONN_ACTIVE) == 0);
667	KASSERT(atomic_load_relaxed(&con->c_refcnt) > 0);
668	KASSERT(con->c_rproc == NULL);
669
670	/*
671	 * No need for atomic since the connection is not yet active.
672	 * If rproc is set, the caller transfers its reference to us,
673	 * which will be released on npf_conn_destroy().
674	 */
675	atomic_or_uint(&con->c_flags, CONN_PASS);
676	con->c_rproc = rp;
677	if (rp) {
678		con->c_rid = mi->mi_rid;
679		con->c_retfl = mi->mi_retfl;
680	}
681}
682
683/*
684 * npf_conn_release: release a reference, which might allow G/C thread
685 * to destroy this connection.
686 */
687void
688npf_conn_release(npf_conn_t *con)
689{
690	const unsigned flags = atomic_load_relaxed(&con->c_flags);
691
692	if ((flags & (CONN_ACTIVE | CONN_EXPIRE)) == 0) {
693		/* Activate: after this, connection is globally visible. */
694		atomic_or_uint(&con->c_flags, CONN_ACTIVE);
695	}
696	KASSERT(atomic_load_relaxed(&con->c_refcnt) > 0);
697	atomic_dec_uint(&con->c_refcnt);
698}
699
700/*
701 * npf_conn_getnat: return the associated NAT entry, if any.
702 */
703npf_nat_t *
704npf_conn_getnat(const npf_conn_t *con)
705{
706	return con->c_nat;
707}
708
709/*
710 * npf_conn_expired: criterion to check if connection is expired.
711 */
712bool
713npf_conn_expired(npf_t *npf, const npf_conn_t *con, uint64_t tsnow)
714{
715	const unsigned flags = atomic_load_relaxed(&con->c_flags);
716	const int etime = npf_state_etime(npf, &con->c_state, con->c_proto);
717	int elapsed;
718
719	if (__predict_false(flags & CONN_EXPIRE)) {
720		/* Explicitly marked to be expired. */
721		return true;
722	}
723
724	/*
725	 * Note: another thread may update 'atime' and it might
726	 * become greater than 'now'.
727	 */
728	elapsed = (int64_t)tsnow - atomic_load_relaxed(&con->c_atime);
729	return elapsed > etime;
730}
731
732/*
733 * npf_conn_remove: unlink the connection and mark as expired.
734 */
735void
736npf_conn_remove(npf_conndb_t *cd, npf_conn_t *con)
737{
738	/* Remove both entries of the connection. */
739	mutex_enter(&con->c_lock);
740	if ((atomic_load_relaxed(&con->c_flags) & CONN_REMOVED) == 0) {
741		npf_connkey_t *fw, *bk;
742		npf_conn_t *ret __diagused;
743
744		fw = npf_conn_getforwkey(con);
745		ret = npf_conndb_remove(cd, fw);
746		KASSERT(ret == con);
747
748		bk = npf_conn_getbackkey(con, NPF_CONNKEY_ALEN(fw));
749		ret = npf_conndb_remove(cd, bk);
750		KASSERT(ret == con);
751	}
752
753	/* Flag the removal and expiration. */
754	atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
755	mutex_exit(&con->c_lock);
756}
757
758/*
759 * npf_conn_worker: G/C to run from a worker thread or via npfk_gc().
760 */
761void
762npf_conn_worker(npf_t *npf)
763{
764	npf_conndb_t *conn_db = atomic_load_consume(&npf->conn_db);
765	npf_conndb_gc(npf, conn_db, false, true);
766}
767
768/*
769 * npf_conndb_export: construct a list of connections prepared for saving.
770 * Note: this is expected to be an expensive operation.
771 */
772int
773npf_conndb_export(npf_t *npf, nvlist_t *nvl)
774{
775	npf_conn_t *head, *con;
776	npf_conndb_t *conn_db;
777
778	/*
779	 * Note: acquire conn_lock to prevent from the database
780	 * destruction and G/C thread.
781	 */
782	mutex_enter(&npf->conn_lock);
783	if (atomic_load_relaxed(&npf->conn_tracking) != CONN_TRACKING_ON) {
784		mutex_exit(&npf->conn_lock);
785		return 0;
786	}
787	conn_db = atomic_load_relaxed(&npf->conn_db);
788	head = npf_conndb_getlist(conn_db);
789	con = head;
790	while (con) {
791		nvlist_t *con_nvl;
792
793		con_nvl = nvlist_create(0);
794		if (npf_conn_export(npf, con, con_nvl) == 0) {
795			nvlist_append_nvlist_array(nvl, "conn-list", con_nvl);
796		}
797		nvlist_destroy(con_nvl);
798
799		if ((con = npf_conndb_getnext(conn_db, con)) == head) {
800			break;
801		}
802	}
803	mutex_exit(&npf->conn_lock);
804	return 0;
805}
806
807/*
808 * npf_conn_export: serialize a single connection.
809 */
810static int
811npf_conn_export(npf_t *npf, npf_conn_t *con, nvlist_t *nvl)
812{
813	nvlist_t *knvl;
814	npf_connkey_t *fw, *bk;
815	unsigned flags, alen;
816
817	flags = atomic_load_relaxed(&con->c_flags);
818	if ((flags & (CONN_ACTIVE|CONN_EXPIRE)) != CONN_ACTIVE) {
819		return ESRCH;
820	}
821	nvlist_add_number(nvl, "flags", flags);
822	nvlist_add_number(nvl, "proto", con->c_proto);
823	if (con->c_ifid) {
824		char ifname[IFNAMSIZ];
825		npf_ifmap_copyname(npf, con->c_ifid, ifname, sizeof(ifname));
826		nvlist_add_string(nvl, "ifname", ifname);
827	}
828	nvlist_add_binary(nvl, "state", &con->c_state, sizeof(npf_state_t));
829
830	fw = npf_conn_getforwkey(con);
831	alen = NPF_CONNKEY_ALEN(fw);
832	KASSERT(alen == con->c_alen);
833	bk = npf_conn_getbackkey(con, alen);
834
835	knvl = npf_connkey_export(npf, fw);
836	nvlist_move_nvlist(nvl, "forw-key", knvl);
837
838	knvl = npf_connkey_export(npf, bk);
839	nvlist_move_nvlist(nvl, "back-key", knvl);
840
841	/* Let the address length be based on on first key. */
842	nvlist_add_number(nvl, "alen", alen);
843
844	if (con->c_nat) {
845		npf_nat_export(npf, con->c_nat, nvl);
846	}
847	return 0;
848}
849
850/*
851 * npf_conn_import: fully reconstruct a single connection from a
852 * nvlist and insert into the given database.
853 */
854int
855npf_conn_import(npf_t *npf, npf_conndb_t *cd, const nvlist_t *cdict,
856    npf_ruleset_t *natlist)
857{
858	npf_conn_t *con;
859	npf_connkey_t *fw, *bk;
860	const nvlist_t *nat, *conkey;
861	unsigned flags, alen, idx;
862	const char *ifname;
863	const void *state;
864	size_t len;
865
866	/*
867	 * To determine the length of the connection, which depends
868	 * on the address length in the connection keys.
869	 */
870	alen = dnvlist_get_number(cdict, "alen", 0);
871	idx = NPF_CONNCACHE(alen);
872
873	/* Allocate a connection and initialize it (clear first). */
874	con = pool_cache_get(npf->conn_cache[idx], PR_WAITOK);
875	memset(con, 0, sizeof(npf_conn_t));
876	mutex_init(&con->c_lock, MUTEX_DEFAULT, IPL_SOFTNET);
877	npf_stats_inc(npf, NPF_STAT_CONN_CREATE);
878
879	con->c_proto = dnvlist_get_number(cdict, "proto", 0);
880	flags = dnvlist_get_number(cdict, "flags", 0);
881	flags &= PFIL_ALL | CONN_ACTIVE | CONN_PASS;
882	atomic_store_relaxed(&con->c_flags, flags);
883	conn_update_atime(con);
884
885	ifname = dnvlist_get_string(cdict, "ifname", NULL);
886	if (ifname && (con->c_ifid = npf_ifmap_register(npf, ifname)) == 0) {
887		goto err;
888	}
889
890	state = dnvlist_get_binary(cdict, "state", &len, NULL, 0);
891	if (!state || len != sizeof(npf_state_t)) {
892		goto err;
893	}
894	memcpy(&con->c_state, state, sizeof(npf_state_t));
895
896	/* Reconstruct NAT association, if any. */
897	if ((nat = dnvlist_get_nvlist(cdict, "nat", NULL)) != NULL &&
898	    (con->c_nat = npf_nat_import(npf, nat, natlist, con)) == NULL) {
899		goto err;
900	}
901
902	/*
903	 * Fetch and copy the keys for each direction.
904	 */
905	fw = npf_conn_getforwkey(con);
906	conkey = dnvlist_get_nvlist(cdict, "forw-key", NULL);
907	if (conkey == NULL || !npf_connkey_import(npf, conkey, fw)) {
908		goto err;
909	}
910	bk = npf_conn_getbackkey(con, NPF_CONNKEY_ALEN(fw));
911	conkey = dnvlist_get_nvlist(cdict, "back-key", NULL);
912	if (conkey == NULL || !npf_connkey_import(npf, conkey, bk)) {
913		goto err;
914	}
915
916	/* Guard against the contradicting address lengths. */
917	if (NPF_CONNKEY_ALEN(fw) != alen || NPF_CONNKEY_ALEN(bk) != alen) {
918		goto err;
919	}
920
921	/* Insert the entries and the connection itself. */
922	if (!npf_conndb_insert(cd, fw, con, NPF_FLOW_FORW)) {
923		goto err;
924	}
925	if (!npf_conndb_insert(cd, bk, con, NPF_FLOW_BACK)) {
926		npf_conndb_remove(cd, fw);
927		goto err;
928	}
929
930	NPF_PRINTF(("NPF: imported conn %p\n", con));
931	npf_conndb_enqueue(cd, con);
932	return 0;
933err:
934	npf_conn_destroy(npf, con);
935	return EINVAL;
936}
937
938/*
939 * npf_conn_find: lookup a connection in the list of connections
940 */
941int
942npf_conn_find(npf_t *npf, const nvlist_t *req, nvlist_t *resp)
943{
944	const nvlist_t *key_nv;
945	npf_conn_t *con;
946	npf_connkey_t key;
947	npf_flow_t flow;
948	int error;
949
950	key_nv = dnvlist_get_nvlist(req, "key", NULL);
951	if (!key_nv || !npf_connkey_import(npf, key_nv, &key)) {
952		return EINVAL;
953	}
954	con = npf_conndb_lookup(npf, &key, &flow);
955	if (con == NULL) {
956		return ESRCH;
957	}
958	if (!npf_conn_check(con, NULL, 0, NPF_FLOW_FORW)) {
959		atomic_dec_uint(&con->c_refcnt);
960		return ESRCH;
961	}
962	error = npf_conn_export(npf, con, resp);
963	nvlist_add_number(resp, "flow", flow);
964	atomic_dec_uint(&con->c_refcnt);
965	return error;
966}
967
968#if defined(DDB) || defined(_NPF_TESTING)
969
/*
 * npf_conn_print: dump the connection details to the console; used
 * from DDB and the testing framework only.
 */
void
npf_conn_print(npf_conn_t *con)
{
	const npf_connkey_t *fw = npf_conn_getforwkey(con);
	const npf_connkey_t *bk = npf_conn_getbackkey(con, NPF_CONNKEY_ALEN(fw));
	const unsigned flags = atomic_load_relaxed(&con->c_flags);
	const unsigned proto = con->c_proto;
	struct timespec tspnow;

	getnanouptime(&tspnow);
	/* tsdiff is the idle time; etime is the expiration threshold. */
	printf("%p:\n\tproto %d flags 0x%x tsdiff %ld etime %d\n", con,
	    proto, flags, (long)(tspnow.tv_sec - con->c_atime),
	    npf_state_etime(npf_getkernctx(), &con->c_state, proto));
	npf_connkey_print(fw);
	npf_connkey_print(bk);
	npf_state_dump(&con->c_state);
	if (con->c_nat) {
		npf_nat_dump(con->c_nat);
	}
}
990
991#endif
992