ip_fw_dynamic.c revision 332401
1/*-
2 * Copyright (c) 2017-2018 Yandex LLC
3 * Copyright (c) 2017-2018 Andrey V. Elsukov <ae@FreeBSD.org>
4 * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD: stable/11/sys/netpfil/ipfw/ip_fw_dynamic.c 332401 2018-04-11 10:36:20Z ae $");
30
31#include "opt_inet.h"
32#include "opt_inet6.h"
33#include "opt_ipfw.h"
34#ifndef INET
35#error IPFIREWALL requires INET.
36#endif /* INET */
37
38#include <sys/param.h>
39#include <sys/systm.h>
40#include <sys/hash.h>
41#include <sys/mbuf.h>
42#include <sys/kernel.h>
43#include <sys/lock.h>
44#include <sys/pcpu.h>
45#include <sys/queue.h>
46#include <sys/rmlock.h>
47#include <sys/smp.h>
48#include <sys/socket.h>
49#include <sys/sysctl.h>
50#include <sys/syslog.h>
51#include <net/ethernet.h>
52#include <net/if.h>
53#include <net/if_var.h>
54#include <net/pfil.h>
55#include <net/vnet.h>
56
57#include <netinet/in.h>
58#include <netinet/ip.h>
59#include <netinet/ip_var.h>
60#include <netinet/ip_fw.h>
61#include <netinet/tcp_var.h>
62#include <netinet/udp.h>
63
64#include <netinet/ip6.h>	/* IN6_ARE_ADDR_EQUAL */
65#ifdef INET6
66#include <netinet6/in6_var.h>
67#include <netinet6/ip6_var.h>
68#include <netinet6/scope6_var.h>
69#endif
70
71#include <netpfil/ipfw/ip_fw_private.h>
72
73#include <machine/in_cksum.h>	/* XXX for in_cksum */
74
75#ifdef MAC
76#include <security/mac/mac_framework.h>
77#endif
78#include <ck_queue.h>
79
80/*
81 * Description of dynamic states.
82 *
83 * Dynamic states are stored in lists accessed through a hash tables
84 * whose size is curr_dyn_buckets. This value can be modified through
85 * the sysctl variable dyn_buckets.
86 *
87 * Currently there are four tables: dyn_ipv4, dyn_ipv6, dyn_ipv4_parent,
88 * and dyn_ipv6_parent.
89 *
 * When a packet is received, its address fields are hashed, then matched
91 * against the entries in the corresponding list by addr_type.
92 * Dynamic states can be used for different purposes:
93 *  + stateful rules;
94 *  + enforcing limits on the number of sessions;
95 *  + in-kernel NAT (not implemented yet)
96 *
97 * The lifetime of dynamic states is regulated by dyn_*_lifetime,
98 * measured in seconds and depending on the flags.
99 *
100 * The total number of dynamic states is equal to UMA zone items count.
 * The max number of dynamic states is dyn_max. When we reach
 * the maximum number of states we do not create any more. This is
 * done to avoid consuming too much memory, but also too much
104 * time when searching on each packet (ideally, we should try instead
105 * to put a limit on the length of the list on each bucket...).
106 *
107 * Each state holds a pointer to the parent ipfw rule so we know what
108 * action to perform. Dynamic rules are removed when the parent rule is
109 * deleted.
110 *
111 * There are some limitations with dynamic rules -- we do not
112 * obey the 'randomized match', and we do not do multiple
113 * passes through the firewall. XXX check the latter!!!
114 */
115
116/* By default use jenkins hash function */
117#define	IPFIREWALL_JENKINSHASH
118
119#define	DYN_COUNTER_INC(d, dir, pktlen)	do {	\
120	(d)->pcnt_ ## dir++;			\
121	(d)->bcnt_ ## dir += pktlen;		\
122	} while (0)
123
/*
 * Per-flow dynamic state data shared by IPv4 and IPv6 states.
 * Counter and TCP-tracking fields are updated from the packet path
 * with ck_pr_* atomic stores (see dyn_update_tcp_state()).
 */
struct dyn_data {
	void		*parent;	/* pointer to parent rule */
	uint32_t	chain_id;	/* cached ruleset id */
	uint32_t	f_pos;		/* cached rule index */

	uint32_t	hashval;	/* hash value used for hash resize */
	uint16_t	fibnum;		/* fib used to send keepalives */
	uint8_t		_pad[3];
	uint8_t		set;		/* parent rule set number */
	uint16_t	rulenum;	/* parent rule number */
	uint32_t	ruleid;		/* parent rule id */

	uint32_t	state;		/* TCP session state and flags */
	uint32_t	ack_fwd;	/* most recent ACKs in forward */
	uint32_t	ack_rev;	/* and reverse direction (used */
					/* to generate keepalives) */
	uint32_t	sync;		/* synchronization time */
	uint32_t	expire;		/* expire time */

	/* DYN_COUNTER_INC() bumps pcnt by one packet, bcnt by pktlen. */
	uint64_t	pcnt_fwd;	/* packets counter in forward */
	uint64_t	bcnt_fwd;	/* bytes counter in forward */
	uint64_t	pcnt_rev;	/* packets counter in reverse */
	uint64_t	bcnt_rev;	/* bytes counter in reverse */
};
148
149#define	DPARENT_COUNT_DEC(p)	do {			\
150	MPASS(p->count > 0);				\
151	ck_pr_dec_32(&(p)->count);			\
152} while (0)
153#define	DPARENT_COUNT_INC(p)	ck_pr_inc_32(&(p)->count)
154#define	DPARENT_COUNT(p)	ck_pr_load_32(&(p)->count)
/*
 * Parent state used by limit rules: holds the number of child states
 * linked to it.  'count' is read and modified with atomic ck_pr_*
 * operations via the DPARENT_COUNT* macros above.
 */
struct dyn_parent {
	void		*parent;	/* pointer to parent rule */
	uint32_t	count;		/* number of linked states */
	uint8_t		_pad;
	uint8_t		set;		/* parent rule set number */
	uint16_t	rulenum;	/* parent rule number */
	uint32_t	ruleid;		/* parent rule id */
	uint32_t	hashval;	/* hash value used for hash resize */
	uint32_t	expire;		/* expire time */
};
165
/*
 * IPv4 dynamic state.  Linked into a per-bucket CK list ('entry',
 * traversed by lockless readers) and, once unlinked, into the
 * expired list ('expired', protected by dyn_expire_lock).
 * The union holds either per-flow data or the parent (limit) state.
 */
struct dyn_ipv4_state {
	uint8_t		type;		/* State type */
	uint8_t		proto;		/* UL Protocol */
	uint16_t	kidx;		/* named object index */
	uint16_t	sport, dport;	/* ULP source and destination ports */
	in_addr_t	src, dst;	/* IPv4 source and destination */

	union {
		struct dyn_data	*data;
		struct dyn_parent *limit;
	};
	CK_SLIST_ENTRY(dyn_ipv4_state)	entry;
	SLIST_ENTRY(dyn_ipv4_state)	expired;
};
180CK_SLIST_HEAD(dyn_ipv4ck_slist, dyn_ipv4_state);
181static VNET_DEFINE(struct dyn_ipv4ck_slist *, dyn_ipv4);
182static VNET_DEFINE(struct dyn_ipv4ck_slist *, dyn_ipv4_parent);
183
184SLIST_HEAD(dyn_ipv4_slist, dyn_ipv4_state);
185static VNET_DEFINE(struct dyn_ipv4_slist, dyn_expired_ipv4);
186#define	V_dyn_ipv4			VNET(dyn_ipv4)
187#define	V_dyn_ipv4_parent		VNET(dyn_ipv4_parent)
188#define	V_dyn_expired_ipv4		VNET(dyn_expired_ipv4)
189
190#ifdef INET6
/*
 * IPv6 dynamic state.  Mirrors struct dyn_ipv4_state with IPv6
 * addresses plus the scope zone id for link-local flows.
 */
struct dyn_ipv6_state {
	uint8_t		type;		/* State type */
	uint8_t		proto;		/* UL Protocol */
	uint16_t	kidx;		/* named object index */
	uint16_t	sport, dport;	/* ULP source and destination ports */
	struct in6_addr	src, dst;	/* IPv6 source and destination */
	uint32_t	zoneid;		/* IPv6 scope zone id */
	union {
		struct dyn_data	*data;
		struct dyn_parent *limit;
	};
	CK_SLIST_ENTRY(dyn_ipv6_state)	entry;
	SLIST_ENTRY(dyn_ipv6_state)	expired;
};
205CK_SLIST_HEAD(dyn_ipv6ck_slist, dyn_ipv6_state);
206static VNET_DEFINE(struct dyn_ipv6ck_slist *, dyn_ipv6);
207static VNET_DEFINE(struct dyn_ipv6ck_slist *, dyn_ipv6_parent);
208
209SLIST_HEAD(dyn_ipv6_slist, dyn_ipv6_state);
210static VNET_DEFINE(struct dyn_ipv6_slist, dyn_expired_ipv6);
211#define	V_dyn_ipv6			VNET(dyn_ipv6)
212#define	V_dyn_ipv6_parent		VNET(dyn_ipv6_parent)
213#define	V_dyn_expired_ipv6		VNET(dyn_expired_ipv6)
214#endif /* INET6 */
215
216/*
217 * Per-CPU pointer indicates that specified state is currently in use
218 * and must not be reclaimed by expiration callout.
219 */
220static void **dyn_hp_cache;
221static DPCPU_DEFINE(void *, dyn_hp);
222#define	DYNSTATE_GET(cpu)	ck_pr_load_ptr(DPCPU_ID_PTR((cpu), dyn_hp))
223#define	DYNSTATE_PROTECT(v)	ck_pr_store_ptr(DPCPU_PTR(dyn_hp), (v))
224#define	DYNSTATE_RELEASE()	DYNSTATE_PROTECT(NULL)
225#define	DYNSTATE_CRITICAL_ENTER()	critical_enter()
226#define	DYNSTATE_CRITICAL_EXIT()	do {	\
227	DYNSTATE_RELEASE();			\
228	critical_exit();			\
229} while (0);
230
231/*
232 * We keep two version numbers, one is updated when new entry added to
233 * the list. Second is updated when an entry deleted from the list.
234 * Versions are updated under bucket lock.
235 *
236 * Bucket "add" version number is used to know, that in the time between
237 * state lookup (i.e. ipfw_dyn_lookup_state()) and the followed state
238 * creation (i.e. ipfw_dyn_install_state()) another concurrent thread did
239 * not install some state in this bucket. Using this info we can avoid
240 * additional state lookup, because we are sure that we will not install
241 * the state twice.
242 *
243 * Also doing the tracking of bucket "del" version during lookup we can
244 * be sure, that state entry was not unlinked and freed in time between
245 * we read the state pointer and protect it with hazard pointer.
246 *
247 * An entry unlinked from CK list keeps unchanged until it is freed.
248 * Unlinked entries are linked into expired lists using "expired" field.
249 */
250
251/*
252 * dyn_expire_lock is used to protect access to dyn_expired_xxx lists.
253 * dyn_bucket_lock is used to get write access to lists in specific bucket.
254 * Currently one dyn_bucket_lock is used for all ipv4, ipv4_parent, ipv6,
255 * and ipv6_parent lists.
256 */
257static VNET_DEFINE(struct mtx, dyn_expire_lock);
258static VNET_DEFINE(struct mtx *, dyn_bucket_lock);
259#define	V_dyn_expire_lock		VNET(dyn_expire_lock)
260#define	V_dyn_bucket_lock		VNET(dyn_bucket_lock)
261
262/*
263 * Bucket's add/delete generation versions.
264 */
265static VNET_DEFINE(uint32_t *, dyn_ipv4_add);
266static VNET_DEFINE(uint32_t *, dyn_ipv4_del);
267static VNET_DEFINE(uint32_t *, dyn_ipv4_parent_add);
268static VNET_DEFINE(uint32_t *, dyn_ipv4_parent_del);
269#define	V_dyn_ipv4_add			VNET(dyn_ipv4_add)
270#define	V_dyn_ipv4_del			VNET(dyn_ipv4_del)
271#define	V_dyn_ipv4_parent_add		VNET(dyn_ipv4_parent_add)
272#define	V_dyn_ipv4_parent_del		VNET(dyn_ipv4_parent_del)
273
274#ifdef INET6
275static VNET_DEFINE(uint32_t *, dyn_ipv6_add);
276static VNET_DEFINE(uint32_t *, dyn_ipv6_del);
277static VNET_DEFINE(uint32_t *, dyn_ipv6_parent_add);
278static VNET_DEFINE(uint32_t *, dyn_ipv6_parent_del);
279#define	V_dyn_ipv6_add			VNET(dyn_ipv6_add)
280#define	V_dyn_ipv6_del			VNET(dyn_ipv6_del)
281#define	V_dyn_ipv6_parent_add		VNET(dyn_ipv6_parent_add)
282#define	V_dyn_ipv6_parent_del		VNET(dyn_ipv6_parent_del)
283#endif /* INET6 */
284
/*
 * Map hash value 'h' into a bucket index; 'b' must be a power of two.
 * 'b' is fully parenthesized so expression arguments (e.g. a ternary)
 * expand correctly.
 */
#define	DYN_BUCKET(h, b)		((h) & ((b) - 1))
#define	DYN_BUCKET_VERSION(b, v)	ck_pr_load_32(&V_dyn_ ## v[(b)])
#define	DYN_BUCKET_VERSION_BUMP(b, v)	ck_pr_inc_32(&V_dyn_ ## v[(b)])
288
289#define	DYN_BUCKET_LOCK_INIT(lock, b)		\
290    mtx_init(&lock[(b)], "IPFW dynamic bucket", NULL, MTX_DEF)
291#define	DYN_BUCKET_LOCK_DESTROY(lock, b)	mtx_destroy(&lock[(b)])
292#define	DYN_BUCKET_LOCK(b)	mtx_lock(&V_dyn_bucket_lock[(b)])
293#define	DYN_BUCKET_UNLOCK(b)	mtx_unlock(&V_dyn_bucket_lock[(b)])
294#define	DYN_BUCKET_ASSERT(b)	mtx_assert(&V_dyn_bucket_lock[(b)], MA_OWNED)
295
296#define	DYN_EXPIRED_LOCK_INIT()		\
297    mtx_init(&V_dyn_expire_lock, "IPFW expired states list", NULL, MTX_DEF)
298#define	DYN_EXPIRED_LOCK_DESTROY()	mtx_destroy(&V_dyn_expire_lock)
299#define	DYN_EXPIRED_LOCK()		mtx_lock(&V_dyn_expire_lock)
300#define	DYN_EXPIRED_UNLOCK()		mtx_unlock(&V_dyn_expire_lock)
301
302static VNET_DEFINE(uint32_t, dyn_buckets_max);
303static VNET_DEFINE(uint32_t, curr_dyn_buckets);
304static VNET_DEFINE(struct callout, dyn_timeout);
305#define	V_dyn_buckets_max		VNET(dyn_buckets_max)
306#define	V_curr_dyn_buckets		VNET(curr_dyn_buckets)
307#define	V_dyn_timeout			VNET(dyn_timeout)
308
309/* Maximum length of states chain in a bucket */
310static VNET_DEFINE(uint32_t, curr_max_length);
311#define	V_curr_max_length		VNET(curr_max_length)
312
313static VNET_DEFINE(uma_zone_t, dyn_data_zone);
314static VNET_DEFINE(uma_zone_t, dyn_parent_zone);
315static VNET_DEFINE(uma_zone_t, dyn_ipv4_zone);
316#ifdef INET6
317static VNET_DEFINE(uma_zone_t, dyn_ipv6_zone);
318#define	V_dyn_ipv6_zone			VNET(dyn_ipv6_zone)
319#endif /* INET6 */
320#define	V_dyn_data_zone			VNET(dyn_data_zone)
321#define	V_dyn_parent_zone		VNET(dyn_parent_zone)
322#define	V_dyn_ipv4_zone			VNET(dyn_ipv4_zone)
323
324/*
325 * Timeouts for various events in handing dynamic rules.
326 */
327static VNET_DEFINE(uint32_t, dyn_ack_lifetime);
328static VNET_DEFINE(uint32_t, dyn_syn_lifetime);
329static VNET_DEFINE(uint32_t, dyn_fin_lifetime);
330static VNET_DEFINE(uint32_t, dyn_rst_lifetime);
331static VNET_DEFINE(uint32_t, dyn_udp_lifetime);
332static VNET_DEFINE(uint32_t, dyn_short_lifetime);
333
334#define	V_dyn_ack_lifetime		VNET(dyn_ack_lifetime)
335#define	V_dyn_syn_lifetime		VNET(dyn_syn_lifetime)
336#define	V_dyn_fin_lifetime		VNET(dyn_fin_lifetime)
337#define	V_dyn_rst_lifetime		VNET(dyn_rst_lifetime)
338#define	V_dyn_udp_lifetime		VNET(dyn_udp_lifetime)
339#define	V_dyn_short_lifetime		VNET(dyn_short_lifetime)
340
341/*
342 * Keepalives are sent if dyn_keepalive is set. They are sent every
343 * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
344 * seconds of lifetime of a rule.
345 * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower
346 * than dyn_keepalive_period.
347 */
348#define	DYN_KEEPALIVE_MAXQ		512
349static VNET_DEFINE(uint32_t, dyn_keepalive_interval);
350static VNET_DEFINE(uint32_t, dyn_keepalive_period);
351static VNET_DEFINE(uint32_t, dyn_keepalive);
352static VNET_DEFINE(time_t, dyn_keepalive_last);
353
354#define	V_dyn_keepalive_interval	VNET(dyn_keepalive_interval)
355#define	V_dyn_keepalive_period		VNET(dyn_keepalive_period)
356#define	V_dyn_keepalive			VNET(dyn_keepalive)
357#define	V_dyn_keepalive_last		VNET(dyn_keepalive_last)
358
359static VNET_DEFINE(uint32_t, dyn_max);		/* max # of dynamic states */
360static VNET_DEFINE(uint32_t, dyn_count);	/* number of states */
361static VNET_DEFINE(uint32_t, dyn_parent_max);	/* max # of parent states */
362static VNET_DEFINE(uint32_t, dyn_parent_count);	/* number of parent states */
363#define	V_dyn_max			VNET(dyn_max)
364#define	V_dyn_count			VNET(dyn_count)
365#define	V_dyn_parent_max		VNET(dyn_parent_max)
366#define	V_dyn_parent_count		VNET(dyn_parent_count)
367
368#define	DYN_COUNT_DEC(name)	do {			\
369	MPASS((V_ ## name) > 0);			\
370	ck_pr_dec_32(&(V_ ## name));			\
371} while (0)
372#define	DYN_COUNT_INC(name)	ck_pr_inc_32(&(V_ ## name))
373#define	DYN_COUNT(name)		ck_pr_load_32(&(V_ ## name))
374
375static time_t last_log;	/* Log ratelimiting */
376
377/*
378 * Get/set maximum number of dynamic states in given VNET instance.
379 */
380static int
381sysctl_dyn_max(SYSCTL_HANDLER_ARGS)
382{
383	uint32_t nstates;
384	int error;
385
386	nstates = V_dyn_max;
387	error = sysctl_handle_32(oidp, &nstates, 0, req);
388	/* Read operation or some error */
389	if ((error != 0) || (req->newptr == NULL))
390		return (error);
391
392	V_dyn_max = nstates;
393	uma_zone_set_max(V_dyn_data_zone, V_dyn_max);
394	return (0);
395}
396
397static int
398sysctl_dyn_parent_max(SYSCTL_HANDLER_ARGS)
399{
400	uint32_t nstates;
401	int error;
402
403	nstates = V_dyn_parent_max;
404	error = sysctl_handle_32(oidp, &nstates, 0, req);
405	/* Read operation or some error */
406	if ((error != 0) || (req->newptr == NULL))
407		return (error);
408
409	V_dyn_parent_max = nstates;
410	uma_zone_set_max(V_dyn_parent_zone, V_dyn_parent_max);
411	return (0);
412}
413
414static int
415sysctl_dyn_buckets(SYSCTL_HANDLER_ARGS)
416{
417	uint32_t nbuckets;
418	int error;
419
420	nbuckets = V_dyn_buckets_max;
421	error = sysctl_handle_32(oidp, &nbuckets, 0, req);
422	/* Read operation or some error */
423	if ((error != 0) || (req->newptr == NULL))
424		return (error);
425
426	if (nbuckets > 256)
427		V_dyn_buckets_max = 1 << fls(nbuckets - 1);
428	else
429		return (EINVAL);
430	return (0);
431}
432
433SYSCTL_DECL(_net_inet_ip_fw);
434
435SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_count,
436    CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(dyn_count), 0,
437    "Current number of dynamic states.");
438SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_parent_count,
439    CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(dyn_parent_count), 0,
440    "Current number of parent states. ");
441SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets,
442    CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0,
443    "Current number of buckets for states hash table.");
444SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, curr_max_length,
445    CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(curr_max_length), 0,
446    "Current maximum length of states chains in hash buckets.");
447SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_buckets,
448    CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RW, 0, 0, sysctl_dyn_buckets,
449    "IU", "Max number of buckets for dynamic states hash table.");
450SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max,
451    CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RW, 0, 0, sysctl_dyn_max,
452    "IU", "Max number of dynamic states.");
453SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_parent_max,
454    CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RW, 0, 0, sysctl_dyn_parent_max,
455    "IU", "Max number of parent dynamic states.");
456SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime,
457    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0,
458    "Lifetime of dynamic states for TCP ACK.");
459SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime,
460    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0,
461    "Lifetime of dynamic states for TCP SYN.");
462SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime,
463    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0,
464    "Lifetime of dynamic states for TCP FIN.");
465SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime,
466    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0,
467    "Lifetime of dynamic states for TCP RST.");
468SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime,
469    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0,
470    "Lifetime of dynamic states for UDP.");
471SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime,
472    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0,
473    "Lifetime of dynamic states for other situations.");
474SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_keepalive,
475    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0,
476    "Enable keepalives for dynamic states.");
477
478#ifdef IPFIREWALL_DYNDEBUG
479#define	DYN_DEBUG(fmt, ...)	do {			\
480	printf("%s: " fmt "\n", __func__, __VA_ARGS__);	\
481} while (0)
482#else
483#define	DYN_DEBUG(fmt, ...)
484#endif /* !IPFIREWALL_DYNDEBUG */
485
486#ifdef INET6
487/* Functions to work with IPv6 states */
488static struct dyn_ipv6_state *dyn_lookup_ipv6_state(
489    const struct ipfw_flow_id *, uint32_t, const void *,
490    struct ipfw_dyn_info *, int);
491static int dyn_lookup_ipv6_state_locked(const struct ipfw_flow_id *,
492    uint32_t, const void *, int, const void *, uint32_t, uint16_t, uint32_t,
493    uint16_t);
494static struct dyn_ipv6_state *dyn_alloc_ipv6_state(
495    const struct ipfw_flow_id *, uint32_t, uint16_t, uint8_t);
496static int dyn_add_ipv6_state(void *, uint32_t, uint16_t, uint8_t,
497    const struct ipfw_flow_id *, uint32_t, const void *, int, uint32_t,
498    struct ipfw_dyn_info *, uint16_t, uint16_t, uint8_t);
499static void dyn_export_ipv6_state(const struct dyn_ipv6_state *,
500    ipfw_dyn_rule *);
501
502static uint32_t dyn_getscopeid(const struct ip_fw_args *);
503static void dyn_make_keepalive_ipv6(struct mbuf *, const struct in6_addr *,
504    const struct in6_addr *, uint32_t, uint32_t, uint32_t, uint16_t,
505    uint16_t);
506static void dyn_enqueue_keepalive_ipv6(struct mbufq *,
507    const struct dyn_ipv6_state *);
508static void dyn_send_keepalive_ipv6(struct ip_fw_chain *);
509
510static struct dyn_ipv6_state *dyn_lookup_ipv6_parent(
511    const struct ipfw_flow_id *, uint32_t, const void *, uint32_t, uint16_t,
512    uint32_t);
513static struct dyn_ipv6_state *dyn_lookup_ipv6_parent_locked(
514    const struct ipfw_flow_id *, uint32_t, const void *, uint32_t, uint16_t,
515    uint32_t);
516static struct dyn_ipv6_state *dyn_add_ipv6_parent(void *, uint32_t, uint16_t,
517    uint8_t, const struct ipfw_flow_id *, uint32_t, uint32_t, uint32_t,
518    uint16_t);
519#endif /* INET6 */
520
521/* Functions to work with limit states */
522static void *dyn_get_parent_state(const struct ipfw_flow_id *, uint32_t,
523    struct ip_fw *, uint32_t, uint32_t, uint16_t);
524static struct dyn_ipv4_state *dyn_lookup_ipv4_parent(
525    const struct ipfw_flow_id *, const void *, uint32_t, uint16_t, uint32_t);
526static struct dyn_ipv4_state *dyn_lookup_ipv4_parent_locked(
527    const struct ipfw_flow_id *, const void *, uint32_t, uint16_t, uint32_t);
528static struct dyn_parent *dyn_alloc_parent(void *, uint32_t, uint16_t,
529    uint8_t, uint32_t);
530static struct dyn_ipv4_state *dyn_add_ipv4_parent(void *, uint32_t, uint16_t,
531    uint8_t, const struct ipfw_flow_id *, uint32_t, uint32_t, uint16_t);
532
533static void dyn_tick(void *);
534static void dyn_expire_states(struct ip_fw_chain *, ipfw_range_tlv *);
535static void dyn_free_states(struct ip_fw_chain *);
536static void dyn_export_parent(const struct dyn_parent *, uint16_t,
537    ipfw_dyn_rule *);
538static void dyn_export_data(const struct dyn_data *, uint16_t, uint8_t,
539    ipfw_dyn_rule *);
540static uint32_t dyn_update_tcp_state(struct dyn_data *,
541    const struct ipfw_flow_id *, const struct tcphdr *, int);
542static void dyn_update_proto_state(struct dyn_data *,
543    const struct ipfw_flow_id *, const void *, int, int);
544
545/* Functions to work with IPv4 states */
546struct dyn_ipv4_state *dyn_lookup_ipv4_state(const struct ipfw_flow_id *,
547    const void *, struct ipfw_dyn_info *, int);
548static int dyn_lookup_ipv4_state_locked(const struct ipfw_flow_id *,
549    const void *, int, const void *, uint32_t, uint16_t, uint32_t, uint16_t);
550static struct dyn_ipv4_state *dyn_alloc_ipv4_state(
551    const struct ipfw_flow_id *, uint16_t, uint8_t);
552static int dyn_add_ipv4_state(void *, uint32_t, uint16_t, uint8_t,
553    const struct ipfw_flow_id *, const void *, int, uint32_t,
554    struct ipfw_dyn_info *, uint16_t, uint16_t, uint8_t);
555static void dyn_export_ipv4_state(const struct dyn_ipv4_state *,
556    ipfw_dyn_rule *);
557
558/*
559 * Named states support.
560 */
static char *default_state_name = "default";
/*
 * Named object wrapper binding a kernel object index to the
 * user-visible state name.
 */
struct dyn_state_obj {
	struct named_object	no;
	char			name[64];
};
566
567#define	DYN_STATE_OBJ(ch, cmd)	\
568    ((struct dyn_state_obj *)SRV_OBJECT(ch, (cmd)->arg1))
569/*
570 * Classifier callback.
571 * Return 0 if opcode contains object that should be referenced
572 * or rewritten.
573 */
574static int
575dyn_classify(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
576{
577
578	DYN_DEBUG("opcode %d, arg1 %d", cmd->opcode, cmd->arg1);
579	/* Don't rewrite "check-state any" */
580	if (cmd->arg1 == 0 &&
581	    cmd->opcode == O_CHECK_STATE)
582		return (1);
583
584	*puidx = cmd->arg1;
585	*ptype = 0;
586	return (0);
587}
588
/*
 * Update callback: store the resolved kernel object index into the
 * opcode argument.
 */
static void
dyn_update(ipfw_insn *cmd, uint16_t idx)
{

	cmd->arg1 = idx;
	DYN_DEBUG("opcode %d, arg1 %d", cmd->opcode, cmd->arg1);
}
596
597static int
598dyn_findbyname(struct ip_fw_chain *ch, struct tid_info *ti,
599    struct named_object **pno)
600{
601	ipfw_obj_ntlv *ntlv;
602	const char *name;
603
604	DYN_DEBUG("uidx %d", ti->uidx);
605	if (ti->uidx != 0) {
606		if (ti->tlvs == NULL)
607			return (EINVAL);
608		/* Search ntlv in the buffer provided by user */
609		ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx,
610		    IPFW_TLV_STATE_NAME);
611		if (ntlv == NULL)
612			return (EINVAL);
613		name = ntlv->name;
614	} else
615		name = default_state_name;
616	/*
617	 * Search named object with corresponding name.
618	 * Since states objects are global - ignore the set value
619	 * and use zero instead.
620	 */
621	*pno = ipfw_objhash_lookup_name_type(CHAIN_TO_SRV(ch), 0,
622	    IPFW_TLV_STATE_NAME, name);
623	/*
624	 * We always return success here.
625	 * The caller will check *pno and mark object as unresolved,
626	 * then it will automatically create "default" object.
627	 */
628	return (0);
629}
630
631static struct named_object *
632dyn_findbykidx(struct ip_fw_chain *ch, uint16_t idx)
633{
634
635	DYN_DEBUG("kidx %d", idx);
636	return (ipfw_objhash_lookup_kidx(CHAIN_TO_SRV(ch), idx));
637}
638
/*
 * Create callback: create (or reference) the named state object for
 * the given name and return its kernel index via 'pkidx'.
 *
 * The object is preallocated before taking the UH write lock because
 * malloc(M_WAITOK) may sleep.  If another thread created the same
 * object meanwhile, the preallocated copy is freed and the existing
 * object's refcount is bumped instead.
 */
static int
dyn_create(struct ip_fw_chain *ch, struct tid_info *ti,
    uint16_t *pkidx)
{
	struct namedobj_instance *ni;
	struct dyn_state_obj *obj;
	struct named_object *no;
	ipfw_obj_ntlv *ntlv;
	char *name;

	DYN_DEBUG("uidx %d", ti->uidx);
	if (ti->uidx != 0) {
		if (ti->tlvs == NULL)
			return (EINVAL);
		/* Non-default names come from the user TLV buffer. */
		ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx,
		    IPFW_TLV_STATE_NAME);
		if (ntlv == NULL)
			return (EINVAL);
		name = ntlv->name;
	} else
		name = default_state_name;

	ni = CHAIN_TO_SRV(ch);
	obj = malloc(sizeof(*obj), M_IPFW, M_WAITOK | M_ZERO);
	obj->no.name = obj->name;
	obj->no.etlv = IPFW_TLV_STATE_NAME;
	strlcpy(obj->name, name, sizeof(obj->name));

	IPFW_UH_WLOCK(ch);
	/* Recheck under the lock: someone may have beaten us to it. */
	no = ipfw_objhash_lookup_name_type(ni, 0,
	    IPFW_TLV_STATE_NAME, name);
	if (no != NULL) {
		/*
		 * Object is already created.
		 * Just return its kidx and bump refcount.
		 */
		*pkidx = no->kidx;
		no->refcnt++;
		IPFW_UH_WUNLOCK(ch);
		free(obj, M_IPFW);
		DYN_DEBUG("\tfound kidx %d", *pkidx);
		return (0);
	}
	if (ipfw_objhash_alloc_idx(ni, &obj->no.kidx) != 0) {
		DYN_DEBUG("\talloc_idx failed for %s", name);
		IPFW_UH_WUNLOCK(ch);
		free(obj, M_IPFW);
		return (ENOSPC);
	}
	ipfw_objhash_add(ni, &obj->no);
	SRV_OBJECT(ch, obj->no.kidx) = obj;
	obj->no.refcnt++;
	*pkidx = obj->no.kidx;
	IPFW_UH_WUNLOCK(ch);
	DYN_DEBUG("\tcreated kidx %d", *pkidx);
	return (0);
}
696
697static void
698dyn_destroy(struct ip_fw_chain *ch, struct named_object *no)
699{
700	struct dyn_state_obj *obj;
701
702	IPFW_UH_WLOCK_ASSERT(ch);
703
704	KASSERT(no->refcnt == 1,
705	    ("Destroying object '%s' (type %u, idx %u) with refcnt %u",
706	    no->name, no->etlv, no->kidx, no->refcnt));
707	DYN_DEBUG("kidx %d", no->kidx);
708	obj = SRV_OBJECT(ch, no->kidx);
709	SRV_OBJECT(ch, no->kidx) = NULL;
710	ipfw_objhash_del(CHAIN_TO_SRV(ch), no);
711	ipfw_objhash_free_idx(CHAIN_TO_SRV(ch), no->kidx);
712
713	free(obj, M_IPFW);
714}
715
/*
 * Rewrite handlers for every opcode that may reference a named state
 * object through arg1.  All four opcodes share the same callbacks.
 */
static struct opcode_obj_rewrite dyn_opcodes[] = {
	{
		O_KEEP_STATE, IPFW_TLV_STATE_NAME,
		dyn_classify, dyn_update,
		dyn_findbyname, dyn_findbykidx,
		dyn_create, dyn_destroy
	},
	{
		O_CHECK_STATE, IPFW_TLV_STATE_NAME,
		dyn_classify, dyn_update,
		dyn_findbyname, dyn_findbykidx,
		dyn_create, dyn_destroy
	},
	{
		O_PROBE_STATE, IPFW_TLV_STATE_NAME,
		dyn_classify, dyn_update,
		dyn_findbyname, dyn_findbykidx,
		dyn_create, dyn_destroy
	},
	{
		O_LIMIT, IPFW_TLV_STATE_NAME,
		dyn_classify, dyn_update,
		dyn_findbyname, dyn_findbykidx,
		dyn_create, dyn_destroy
	},
};
742
743/*
744 * IMPORTANT: the hash function for dynamic rules must be commutative
745 * in source and destination (ip,port), because rules are bidirectional
746 * and we want to find both in the same bucket.
747 */
748#ifndef IPFIREWALL_JENKINSHASH
/*
 * Fallback flow hash (only built when IPFIREWALL_JENKINSHASH is not
 * defined): XOR of addresses and ports.  XOR is commutative in
 * src/dst, so both directions of a flow map to the same bucket.
 */
static __inline uint32_t
hash_packet(const struct ipfw_flow_id *id)
{
	uint32_t i;

#ifdef INET6
	if (IS_IP6_FLOW_ID(id))
		/* Only the low 64 bits of each address are mixed in. */
		i = ntohl((id->dst_ip6.__u6_addr.__u6_addr32[2]) ^
		    (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^
		    (id->src_ip6.__u6_addr.__u6_addr32[2]) ^
		    (id->src_ip6.__u6_addr.__u6_addr32[3]));
	else
#endif /* INET6 */
	i = (id->dst_ip) ^ (id->src_ip);
	i ^= (id->dst_port) ^ (id->src_port);
	return (i);
}
766
/*
 * Fallback parent-state hash: mix the parent rule pointer value into
 * the flow hash.
 */
static __inline uint32_t
hash_parent(const struct ipfw_flow_id *id, const void *rule)
{

	return (hash_packet(id) ^ ((uintptr_t)rule));
}
773
774#else /* IPFIREWALL_JENKINSHASH */
775
776static VNET_DEFINE(uint32_t, dyn_hashseed);
777#define	V_dyn_hashseed		VNET(dyn_hashseed)
778
779static __inline int
780addrcmp4(const struct ipfw_flow_id *id)
781{
782
783	if (id->src_ip < id->dst_ip)
784		return (0);
785	if (id->src_ip > id->dst_ip)
786		return (1);
787	if (id->src_port <= id->dst_port)
788		return (0);
789	return (1);
790}
791
792#ifdef INET6
793static __inline int
794addrcmp6(const struct ipfw_flow_id *id)
795{
796	int ret;
797
798	ret = memcmp(&id->src_ip6, &id->dst_ip6, sizeof(struct in6_addr));
799	if (ret < 0)
800		return (0);
801	if (ret > 0)
802		return (1);
803	if (id->src_port <= id->dst_port)
804		return (0);
805	return (1);
806}
807
808static __inline uint32_t
809hash_packet6(const struct ipfw_flow_id *id)
810{
811	struct tuple6 {
812		struct in6_addr	addr[2];
813		uint16_t	port[2];
814	} t6;
815
816	if (addrcmp6(id) == 0) {
817		t6.addr[0] = id->src_ip6;
818		t6.addr[1] = id->dst_ip6;
819		t6.port[0] = id->src_port;
820		t6.port[1] = id->dst_port;
821	} else {
822		t6.addr[0] = id->dst_ip6;
823		t6.addr[1] = id->src_ip6;
824		t6.port[0] = id->dst_port;
825		t6.port[1] = id->src_port;
826	}
827	return (jenkins_hash32((const uint32_t *)&t6,
828	    sizeof(t6) / sizeof(uint32_t), V_dyn_hashseed));
829}
830#endif
831
832static __inline uint32_t
833hash_packet(const struct ipfw_flow_id *id)
834{
835	struct tuple4 {
836		in_addr_t	addr[2];
837		uint16_t	port[2];
838	} t4;
839
840	if (IS_IP4_FLOW_ID(id)) {
841		/* All fields are in host byte order */
842		if (addrcmp4(id) == 0) {
843			t4.addr[0] = id->src_ip;
844			t4.addr[1] = id->dst_ip;
845			t4.port[0] = id->src_port;
846			t4.port[1] = id->dst_port;
847		} else {
848			t4.addr[0] = id->dst_ip;
849			t4.addr[1] = id->src_ip;
850			t4.port[0] = id->dst_port;
851			t4.port[1] = id->src_port;
852		}
853		return (jenkins_hash32((const uint32_t *)&t4,
854		    sizeof(t4) / sizeof(uint32_t), V_dyn_hashseed));
855	} else
856#ifdef INET6
857	if (IS_IP6_FLOW_ID(id))
858		return (hash_packet6(id));
859#endif
860	return (0);
861}
862
/*
 * Parent-state hash: Jenkins hash over the bytes of the rule pointer
 * value (&rule, i.e. sizeof(rule)/4 32-bit words), seeded with the
 * flow hash, so equal flows under different parent rules spread
 * across buckets.
 */
static __inline uint32_t
hash_parent(const struct ipfw_flow_id *id, const void *rule)
{

	return (jenkins_hash32((const uint32_t *)&rule,
	    sizeof(rule) / sizeof(uint32_t), hash_packet(id)));
}
870#endif /* IPFIREWALL_JENKINSHASH */
871
872/*
873 * Print customizable flow id description via log(9) facility.
874 */
875static void
876print_dyn_rule_flags(const struct ipfw_flow_id *id, int dyn_type,
877    int log_flags, char *prefix, char *postfix)
878{
879	struct in_addr da;
880#ifdef INET6
881	char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN];
882#else
883	char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
884#endif
885
886#ifdef INET6
887	if (IS_IP6_FLOW_ID(id)) {
888		ip6_sprintf(src, &id->src_ip6);
889		ip6_sprintf(dst, &id->dst_ip6);
890	} else
891#endif
892	{
893		da.s_addr = htonl(id->src_ip);
894		inet_ntop(AF_INET, &da, src, sizeof(src));
895		da.s_addr = htonl(id->dst_ip);
896		inet_ntop(AF_INET, &da, dst, sizeof(dst));
897	}
898	log(log_flags, "ipfw: %s type %d %s %d -> %s %d, %d %s\n",
899	    prefix, dyn_type, src, id->src_port, dst,
900	    id->dst_port, V_dyn_count, postfix);
901}
902
/* Convenience wrapper: log flow descriptions with LOG_DEBUG severity. */
#define	print_dyn_rule(id, dtype, prefix, postfix)	\
	print_dyn_rule_flags(id, dtype, LOG_DEBUG, prefix, postfix)

/*
 * Wraparound-safe comparisons for time_uptime based expiration times
 * and for TCP sequence numbers (32-bit modular arithmetic).
 */
#define	TIME_LEQ(a,b)	((int)((a)-(b)) <= 0)
#define	TIME_LE(a,b)	((int)((a)-(b)) < 0)
#define	_SEQ_GE(a,b)	((int)((a)-(b)) >= 0)
/*
 * Layout of the TCP tracking bits kept in dyn_data->state: the low
 * byte holds TCP flags seen in the forward direction, the next byte
 * the flags seen in the reverse direction, and ACK_FWD/ACK_REV record
 * that a valid ACK was observed in the respective direction.
 */
#define	BOTH_SYN	(TH_SYN | (TH_SYN << 8))
#define	BOTH_FIN	(TH_FIN | (TH_FIN << 8))
#define	TCP_FLAGS	(TH_FLAGS | (TH_FLAGS << 8))
#define	ACK_FWD		0x00010000	/* fwd ack seen */
#define	ACK_REV		0x00020000	/* rev ack seen */
#define	ACK_BOTH	(ACK_FWD | ACK_REV)
915
/*
 * Track the TCP handshake/teardown progress of a dynamic state and
 * return the new expiration time for it.
 *
 *  data - dynamic state being updated (shared, updated with ck_pr ops);
 *  pkt  - flow id of the current packet;
 *  tcp  - TCP header of the packet, may be NULL;
 *  dir  - MATCH_FORWARD or MATCH_REVERSE relative to the stored tuple.
 *
 * Called from dyn_update_proto_state() inside the critical section.
 * data->state may be modified concurrently by other threads, therefore
 * all writes to it use atomic OR/XOR and only set (never clear) bits.
 */
static uint32_t
dyn_update_tcp_state(struct dyn_data *data, const struct ipfw_flow_id *pkt,
    const struct tcphdr *tcp, int dir)
{
	uint32_t ack, expire;
	uint32_t state, old;
	uint8_t th_flags;

	expire = data->expire;
	old = state = data->state;
	/* Merge this packet's flags into the per-direction byte. */
	th_flags = pkt->_flags & (TH_FIN | TH_SYN | TH_RST);
	state |= (dir == MATCH_FORWARD) ? th_flags: (th_flags << 8);
	switch (state & TCP_FLAGS) {
	case TH_SYN:			/* opening */
		expire = time_uptime + V_dyn_syn_lifetime;
		break;

	case BOTH_SYN:			/* move to established */
	case BOTH_SYN | TH_FIN:		/* one side tries to close */
	case BOTH_SYN | (TH_FIN << 8):
		if (tcp == NULL)
			break;
		/* Remember the highest ACK seen in each direction. */
		ack = ntohl(tcp->th_ack);
		if (dir == MATCH_FORWARD) {
			if (data->ack_fwd == 0 ||
			    _SEQ_GE(ack, data->ack_fwd)) {
				state |= ACK_FWD;
				if (data->ack_fwd != ack)
					ck_pr_store_32(&data->ack_fwd, ack);
			}
		} else {
			if (data->ack_rev == 0 ||
			    _SEQ_GE(ack, data->ack_rev)) {
				state |= ACK_REV;
				if (data->ack_rev != ack)
					ck_pr_store_32(&data->ack_rev, ack);
			}
		}
		if ((state & ACK_BOTH) == ACK_BOTH) {
			/*
			 * Set expire time to V_dyn_ack_lifetime only if
			 * we got ACKs for both directions.
			 * We use XOR here to avoid possible state
			 * overwriting in concurrent thread.
			 */
			expire = time_uptime + V_dyn_ack_lifetime;
			ck_pr_xor_32(&data->state, ACK_BOTH);
		} else if ((data->state & ACK_BOTH) != (state & ACK_BOTH))
			ck_pr_or_32(&data->state, state & ACK_BOTH);
		break;

	case BOTH_SYN | BOTH_FIN:	/* both sides closed */
		/* Keep the lifetime below the keepalive period. */
		if (V_dyn_fin_lifetime >= V_dyn_keepalive_period)
			V_dyn_fin_lifetime = V_dyn_keepalive_period - 1;
		expire = time_uptime + V_dyn_fin_lifetime;
		break;

	default:
		/* Some reset or unexpected flag combination. */
		if (V_dyn_rst_lifetime >= V_dyn_keepalive_period)
			V_dyn_rst_lifetime = V_dyn_keepalive_period - 1;
		expire = time_uptime + V_dyn_rst_lifetime;
	}
	/* Save TCP state if it was changed */
	if ((state & TCP_FLAGS) != (old & TCP_FLAGS))
		ck_pr_or_32(&data->state, state & TCP_FLAGS);
	return (expire);
}
983
984/*
985 * Update ULP specific state.
986 * For TCP we keep sequence numbers and flags. For other protocols
987 * currently we update only expire time. Packets and bytes counters
988 * are also updated here.
989 */
static void
dyn_update_proto_state(struct dyn_data *data, const struct ipfw_flow_id *pkt,
    const void *ulp, int pktlen, int dir)
{
	uint32_t expire;

	/* NOTE: we are in critical section here. */
	switch (pkt->proto) {
	case IPPROTO_UDP:
	case IPPROTO_UDPLITE:
		expire = time_uptime + V_dyn_udp_lifetime;
		break;
	case IPPROTO_TCP:
		/* For TCP ulp points to the TCP header (may be NULL). */
		expire = dyn_update_tcp_state(data, pkt, ulp, dir);
		break;
	default:
		expire = time_uptime + V_dyn_short_lifetime;
	}
	/*
	 * Expiration timer has the per-second granularity, no need to update
	 * it every time when state is matched.
	 */
	if (data->expire != expire)
		ck_pr_store_32(&data->expire, expire);

	/* Account the packet in the direction it was seen. */
	if (dir == MATCH_FORWARD)
		DYN_COUNTER_INC(data, fwd, pktlen);
	else
		DYN_COUNTER_INC(data, rev, pktlen);
}
1020
1021/*
1022 * Lookup IPv4 state.
1023 * Must be called in critical section.
1024 */
struct dyn_ipv4_state *
dyn_lookup_ipv4_state(const struct ipfw_flow_id *pkt, const void *ulp,
    struct ipfw_dyn_info *info, int pktlen)
{
	struct dyn_ipv4_state *s;
	uint32_t version, bucket;

	bucket = DYN_BUCKET(info->hashval, V_curr_dyn_buckets);
	/* Remember 'add' version so the caller can detect insertions. */
	info->version = DYN_BUCKET_VERSION(bucket, ipv4_add);
restart:
	/*
	 * Lockless walk: a hazard pointer protects the current entry,
	 * and the bucket's 'del' version is re-checked after each
	 * DYNSTATE_PROTECT() — if a concurrent deletion bumped it, the
	 * list may have been modified under us and we restart.
	 */
	version = DYN_BUCKET_VERSION(bucket, ipv4_del);
	CK_SLIST_FOREACH(s, &V_dyn_ipv4[bucket], entry) {
		DYNSTATE_PROTECT(s);
		if (version != DYN_BUCKET_VERSION(bucket, ipv4_del))
			goto restart;
		if (s->proto != pkt->proto)
			continue;
		if (info->kidx != 0 && s->kidx != info->kidx)
			continue;
		/* Try to match the tuple in both directions. */
		if (s->sport == pkt->src_port && s->dport == pkt->dst_port &&
		    s->src == pkt->src_ip && s->dst == pkt->dst_ip) {
			info->direction = MATCH_FORWARD;
			break;
		}
		if (s->sport == pkt->dst_port && s->dport == pkt->src_port &&
		    s->src == pkt->dst_ip && s->dst == pkt->src_ip) {
			info->direction = MATCH_REVERSE;
			break;
		}
	}

	/* On match update expiration and counters. */
	if (s != NULL)
		dyn_update_proto_state(s->data, pkt, ulp, pktlen,
		    info->direction);
	return (s);
}
1061
1062/*
1063 * Lookup IPv4 state.
1064 * Simplifed version is used to check that matching state doesn't exist.
1065 */
/*
 * Lockless re-check helper used by dyn_add_ipv4_state() while the
 * bucket lock is held.  Returns non-zero if a matching state already
 * exists (and, for pktlen != 0, updates its protocol state).
 */
static int
dyn_lookup_ipv4_state_locked(const struct ipfw_flow_id *pkt,
    const void *ulp, int pktlen, const void *parent, uint32_t ruleid,
    uint16_t rulenum, uint32_t bucket, uint16_t kidx)
{
	struct dyn_ipv4_state *s;
	int dir;

	dir = MATCH_NONE;
	DYN_BUCKET_ASSERT(bucket);
	CK_SLIST_FOREACH(s, &V_dyn_ipv4[bucket], entry) {
		if (s->proto != pkt->proto ||
		    s->kidx != kidx)
			continue;
		/*
		 * XXXAE: Install synchronized state only when there are
		 *	  no matching states.
		 */
		if (pktlen != 0 && (
		    s->data->parent != parent ||
		    s->data->ruleid != ruleid ||
		    s->data->rulenum != rulenum))
			continue;
		if (s->sport == pkt->src_port &&
		    s->dport == pkt->dst_port &&
		    s->src == pkt->src_ip && s->dst == pkt->dst_ip) {
			dir = MATCH_FORWARD;
			break;
		}
		if (s->sport == pkt->dst_port && s->dport == pkt->src_port &&
		    s->src == pkt->dst_ip && s->dst == pkt->src_ip) {
			dir = MATCH_REVERSE;
			break;
		}
	}
	if (s != NULL)
		dyn_update_proto_state(s->data, pkt, ulp, pktlen, dir);
	return (s != NULL);
}
1105
/*
 * Lockless lookup of an IPv4 O_LIMIT_PARENT state for the given
 * masked flow id and parent rule.  On match the parent's expiration
 * time is refreshed.  Must be called in the critical section; the
 * returned pointer is protected by a hazard pointer.
 */
struct dyn_ipv4_state *
dyn_lookup_ipv4_parent(const struct ipfw_flow_id *pkt, const void *rule,
    uint32_t ruleid, uint16_t rulenum, uint32_t hashval)
{
	struct dyn_ipv4_state *s;
	uint32_t version, bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
restart:
	/* Restart the walk if a concurrent deletion modified the list. */
	version = DYN_BUCKET_VERSION(bucket, ipv4_parent_del);
	CK_SLIST_FOREACH(s, &V_dyn_ipv4_parent[bucket], entry) {
		DYNSTATE_PROTECT(s);
		if (version != DYN_BUCKET_VERSION(bucket, ipv4_parent_del))
			goto restart;
		/*
		 * NOTE: we do not need to check kidx, because parent rule
		 * can not create states with different kidx.
		 * And parent rule always created for forward direction.
		 */
		if (s->limit->parent == rule &&
		    s->limit->ruleid == ruleid &&
		    s->limit->rulenum == rulenum &&
		    s->proto == pkt->proto &&
		    s->sport == pkt->src_port &&
		    s->dport == pkt->dst_port &&
		    s->src == pkt->src_ip && s->dst == pkt->dst_ip) {
			/* Refresh expiration, avoiding redundant stores. */
			if (s->limit->expire != time_uptime +
			    V_dyn_short_lifetime)
				ck_pr_store_32(&s->limit->expire,
				    time_uptime + V_dyn_short_lifetime);
			break;
		}
	}
	return (s);
}
1141
1142static struct dyn_ipv4_state *
1143dyn_lookup_ipv4_parent_locked(const struct ipfw_flow_id *pkt,
1144    const void *rule, uint32_t ruleid, uint16_t rulenum, uint32_t bucket)
1145{
1146	struct dyn_ipv4_state *s;
1147
1148	DYN_BUCKET_ASSERT(bucket);
1149	CK_SLIST_FOREACH(s, &V_dyn_ipv4_parent[bucket], entry) {
1150		if (s->limit->parent == rule &&
1151		    s->limit->ruleid == ruleid &&
1152		    s->limit->rulenum == rulenum &&
1153		    s->proto == pkt->proto &&
1154		    s->sport == pkt->src_port &&
1155		    s->dport == pkt->dst_port &&
1156		    s->src == pkt->src_ip && s->dst == pkt->dst_ip)
1157			break;
1158	}
1159	return (s);
1160}
1161
1162
1163#ifdef INET6
1164static uint32_t
1165dyn_getscopeid(const struct ip_fw_args *args)
1166{
1167
1168	/*
1169	 * If source or destination address is an scopeid address, we need
1170	 * determine the scope zone id to resolve address scope ambiguity.
1171	 */
1172	if (IN6_IS_ADDR_LINKLOCAL(&args->f_id.src_ip6) ||
1173	    IN6_IS_ADDR_LINKLOCAL(&args->f_id.dst_ip6)) {
1174		MPASS(args->oif != NULL ||
1175		    args->m->m_pkthdr.rcvif != NULL);
1176		return (in6_getscopezone(args->oif != NULL ? args->oif:
1177		    args->m->m_pkthdr.rcvif, IPV6_ADDR_SCOPE_LINKLOCAL));
1178	}
1179	return (0);
1180}
1181
1182/*
1183 * Lookup IPv6 state.
1184 * Must be called in critical section.
1185 */
static struct dyn_ipv6_state *
dyn_lookup_ipv6_state(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    const void *ulp, struct ipfw_dyn_info *info, int pktlen)
{
	struct dyn_ipv6_state *s;
	uint32_t version, bucket;

	bucket = DYN_BUCKET(info->hashval, V_curr_dyn_buckets);
	/* Remember 'add' version so the caller can detect insertions. */
	info->version = DYN_BUCKET_VERSION(bucket, ipv6_add);
restart:
	/*
	 * Lockless walk protected by a hazard pointer; restart when a
	 * concurrent deletion bumps the bucket's 'del' version.
	 */
	version = DYN_BUCKET_VERSION(bucket, ipv6_del);
	CK_SLIST_FOREACH(s, &V_dyn_ipv6[bucket], entry) {
		DYNSTATE_PROTECT(s);
		if (version != DYN_BUCKET_VERSION(bucket, ipv6_del))
			goto restart;
		if (s->proto != pkt->proto || s->zoneid != zoneid)
			continue;
		if (info->kidx != 0 && s->kidx != info->kidx)
			continue;
		/* Try to match the tuple in both directions. */
		if (s->sport == pkt->src_port && s->dport == pkt->dst_port &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6)) {
			info->direction = MATCH_FORWARD;
			break;
		}
		if (s->sport == pkt->dst_port && s->dport == pkt->src_port &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->dst_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->src_ip6)) {
			info->direction = MATCH_REVERSE;
			break;
		}
	}
	/* On match update expiration and counters. */
	if (s != NULL)
		dyn_update_proto_state(s->data, pkt, ulp, pktlen,
		    info->direction);
	return (s);
}
1223
1224/*
1225 * Lookup IPv6 state.
1226 * Simplifed version is used to check that matching state doesn't exist.
1227 */
/*
 * Re-check helper used by dyn_add_ipv6_state() while the bucket lock
 * is held.  Returns non-zero if a matching state already exists (and,
 * for pktlen != 0, updates its protocol state).
 */
static int
dyn_lookup_ipv6_state_locked(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    const void *ulp, int pktlen, const void *parent, uint32_t ruleid,
    uint16_t rulenum, uint32_t bucket, uint16_t kidx)
{
	struct dyn_ipv6_state *s;
	int dir;

	dir = MATCH_NONE;
	DYN_BUCKET_ASSERT(bucket);
	CK_SLIST_FOREACH(s, &V_dyn_ipv6[bucket], entry) {
		if (s->proto != pkt->proto || s->kidx != kidx ||
		    s->zoneid != zoneid)
			continue;
		/*
		 * XXXAE: Install synchronized state only when there are
		 *	  no matching states.
		 */
		if (pktlen != 0 && (
		    s->data->parent != parent ||
		    s->data->ruleid != ruleid ||
		    s->data->rulenum != rulenum))
			continue;
		if (s->sport == pkt->src_port && s->dport == pkt->dst_port &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6)) {
			dir = MATCH_FORWARD;
			break;
		}
		if (s->sport == pkt->dst_port && s->dport == pkt->src_port &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->dst_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->src_ip6)) {
			dir = MATCH_REVERSE;
			break;
		}
	}
	if (s != NULL)
		dyn_update_proto_state(s->data, pkt, ulp, pktlen, dir);
	return (s != NULL);
}
1268
/*
 * Lockless lookup of an IPv6 O_LIMIT_PARENT state for the given
 * masked flow id, zone id and parent rule.  On match the parent's
 * expiration time is refreshed.  Must be called in the critical
 * section; the returned pointer is protected by a hazard pointer.
 */
static struct dyn_ipv6_state *
dyn_lookup_ipv6_parent(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    const void *rule, uint32_t ruleid, uint16_t rulenum, uint32_t hashval)
{
	struct dyn_ipv6_state *s;
	uint32_t version, bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
restart:
	/* Restart the walk if a concurrent deletion modified the list. */
	version = DYN_BUCKET_VERSION(bucket, ipv6_parent_del);
	CK_SLIST_FOREACH(s, &V_dyn_ipv6_parent[bucket], entry) {
		DYNSTATE_PROTECT(s);
		if (version != DYN_BUCKET_VERSION(bucket, ipv6_parent_del))
			goto restart;
		/*
		 * NOTE: we do not need to check kidx, because parent rule
		 * can not create states with different kidx.
		 * Also parent rule always created for forward direction.
		 */
		if (s->limit->parent == rule &&
		    s->limit->ruleid == ruleid &&
		    s->limit->rulenum == rulenum &&
		    s->proto == pkt->proto &&
		    s->sport == pkt->src_port &&
		    s->dport == pkt->dst_port && s->zoneid == zoneid &&
		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) &&
		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6)) {
			/* Refresh expiration, avoiding redundant stores. */
			if (s->limit->expire != time_uptime +
			    V_dyn_short_lifetime)
				ck_pr_store_32(&s->limit->expire,
				    time_uptime + V_dyn_short_lifetime);
			break;
		}
	}
	return (s);
}
1305
1306static struct dyn_ipv6_state *
1307dyn_lookup_ipv6_parent_locked(const struct ipfw_flow_id *pkt, uint32_t zoneid,
1308    const void *rule, uint32_t ruleid, uint16_t rulenum, uint32_t bucket)
1309{
1310	struct dyn_ipv6_state *s;
1311
1312	DYN_BUCKET_ASSERT(bucket);
1313	CK_SLIST_FOREACH(s, &V_dyn_ipv6_parent[bucket], entry) {
1314		if (s->limit->parent == rule &&
1315		    s->limit->ruleid == ruleid &&
1316		    s->limit->rulenum == rulenum &&
1317		    s->proto == pkt->proto &&
1318		    s->sport == pkt->src_port &&
1319		    s->dport == pkt->dst_port && s->zoneid == zoneid &&
1320		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) &&
1321		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6))
1322			break;
1323	}
1324	return (s);
1325}
1326
1327#endif /* INET6 */
1328
1329/*
1330 * Lookup dynamic state.
1331 *  pkt - filled by ipfw_chk() ipfw_flow_id;
1332 *  ulp - determined by ipfw_chk() upper level protocol header;
1333 *  dyn_info - info about matched state to return back;
1334 * Returns pointer to state's parent rule and dyn_info. If there is
1335 * no state, NULL is returned.
1336 * On match ipfw_dyn_lookup() updates state's counters.
1337 */
1338struct ip_fw *
1339ipfw_dyn_lookup_state(const struct ip_fw_args *args, const void *ulp,
1340    int pktlen, const ipfw_insn *cmd, struct ipfw_dyn_info *info)
1341{
1342	struct dyn_data *data;
1343	struct ip_fw *rule;
1344
1345	IPFW_RLOCK_ASSERT(&V_layer3_chain);
1346
1347	data = NULL;
1348	rule = NULL;
1349	info->kidx = cmd->arg1;
1350	info->direction = MATCH_NONE;
1351	info->hashval = hash_packet(&args->f_id);
1352
1353	DYNSTATE_CRITICAL_ENTER();
1354	if (IS_IP4_FLOW_ID(&args->f_id)) {
1355		struct dyn_ipv4_state *s;
1356
1357		s = dyn_lookup_ipv4_state(&args->f_id, ulp, info, pktlen);
1358		if (s != NULL) {
1359			/*
1360			 * Dynamic states are created using the same 5-tuple,
1361			 * so it is assumed, that parent rule for O_LIMIT
1362			 * state has the same address family.
1363			 */
1364			data = s->data;
1365			if (s->type == O_LIMIT) {
1366				s = data->parent;
1367				rule = s->limit->parent;
1368			} else
1369				rule = data->parent;
1370		}
1371	}
1372#ifdef INET6
1373	else if (IS_IP6_FLOW_ID(&args->f_id)) {
1374		struct dyn_ipv6_state *s;
1375
1376		s = dyn_lookup_ipv6_state(&args->f_id, dyn_getscopeid(args),
1377		    ulp, info, pktlen);
1378		if (s != NULL) {
1379			data = s->data;
1380			if (s->type == O_LIMIT) {
1381				s = data->parent;
1382				rule = s->limit->parent;
1383			} else
1384				rule = data->parent;
1385		}
1386	}
1387#endif
1388	if (data != NULL) {
1389		/*
1390		 * If cached chain id is the same, we can avoid rule index
1391		 * lookup. Otherwise do lookup and update chain_id and f_pos.
1392		 * It is safe even if there is concurrent thread that want
1393		 * update the same state, because chain->id can be changed
1394		 * only under IPFW_WLOCK().
1395		 */
1396		if (data->chain_id != V_layer3_chain.id) {
1397			data->f_pos = ipfw_find_rule(&V_layer3_chain,
1398			    data->rulenum, data->ruleid);
1399			/*
1400			 * Check that found state has not orphaned.
1401			 * When chain->id being changed the parent
1402			 * rule can be deleted. If found rule doesn't
1403			 * match the parent pointer, consider this
1404			 * result as MATCH_NONE and return NULL.
1405			 *
1406			 * This will lead to creation of new similar state
1407			 * that will be added into head of this bucket.
1408			 * And the state that we currently have matched
1409			 * should be deleted by dyn_expire_states().
1410			 */
1411			if (V_layer3_chain.map[data->f_pos] == rule)
1412				data->chain_id = V_layer3_chain.id;
1413			else {
1414				rule = NULL;
1415				info->direction = MATCH_NONE;
1416				DYN_DEBUG("rule %p  [%u, %u] is considered "
1417				    "invalid in data %p", rule, data->ruleid,
1418				    data->rulenum, data);
1419			}
1420		}
1421		info->f_pos = data->f_pos;
1422	}
1423	DYNSTATE_CRITICAL_EXIT();
1424#if 0
1425	/*
1426	 * Return MATCH_NONE if parent rule is in disabled set.
1427	 * This will lead to creation of new similar state that
1428	 * will be added into head of this bucket.
1429	 *
1430	 * XXXAE: we need to be able update state's set when parent
1431	 *	  rule set is changed.
1432	 */
1433	if (rule != NULL && (V_set_disable & (1 << rule->set))) {
1434		rule = NULL;
1435		info->direction = MATCH_NONE;
1436	}
1437#endif
1438	return (rule);
1439}
1440
1441static struct dyn_parent *
1442dyn_alloc_parent(void *parent, uint32_t ruleid, uint16_t rulenum,
1443    uint8_t set, uint32_t hashval)
1444{
1445	struct dyn_parent *limit;
1446
1447	limit = uma_zalloc(V_dyn_parent_zone, M_NOWAIT | M_ZERO);
1448	if (limit == NULL) {
1449		if (last_log != time_uptime) {
1450			last_log = time_uptime;
1451			log(LOG_DEBUG,
1452			    "ipfw: Cannot allocate parent dynamic state, "
1453			    "consider increasing "
1454			    "net.inet.ip.fw.dyn_parent_max\n");
1455		}
1456		return (NULL);
1457	}
1458
1459	limit->parent = parent;
1460	limit->ruleid = ruleid;
1461	limit->rulenum = rulenum;
1462	limit->set = set;
1463	limit->hashval = hashval;
1464	limit->expire = time_uptime + V_dyn_short_lifetime;
1465	return (limit);
1466}
1467
1468static struct dyn_data *
1469dyn_alloc_dyndata(void *parent, uint32_t ruleid, uint16_t rulenum,
1470    uint8_t set, const struct ipfw_flow_id *pkt, const void *ulp, int pktlen,
1471    uint32_t hashval, uint16_t fibnum)
1472{
1473	struct dyn_data *data;
1474
1475	data = uma_zalloc(V_dyn_data_zone, M_NOWAIT | M_ZERO);
1476	if (data == NULL) {
1477		if (last_log != time_uptime) {
1478			last_log = time_uptime;
1479			log(LOG_DEBUG,
1480			    "ipfw: Cannot allocate dynamic state, "
1481			    "consider increasing net.inet.ip.fw.dyn_max\n");
1482		}
1483		return (NULL);
1484	}
1485
1486	data->parent = parent;
1487	data->ruleid = ruleid;
1488	data->rulenum = rulenum;
1489	data->set = set;
1490	data->fibnum = fibnum;
1491	data->hashval = hashval;
1492	data->expire = time_uptime + V_dyn_syn_lifetime;
1493	dyn_update_proto_state(data, pkt, ulp, pktlen, MATCH_FORWARD);
1494	return (data);
1495}
1496
1497static struct dyn_ipv4_state *
1498dyn_alloc_ipv4_state(const struct ipfw_flow_id *pkt, uint16_t kidx,
1499    uint8_t type)
1500{
1501	struct dyn_ipv4_state *s;
1502
1503	s = uma_zalloc(V_dyn_ipv4_zone, M_NOWAIT | M_ZERO);
1504	if (s == NULL)
1505		return (NULL);
1506
1507	s->type = type;
1508	s->kidx = kidx;
1509	s->proto = pkt->proto;
1510	s->sport = pkt->src_port;
1511	s->dport = pkt->dst_port;
1512	s->src = pkt->src_ip;
1513	s->dst = pkt->dst_ip;
1514	return (s);
1515}
1516
1517/*
1518 * Add IPv4 parent state.
1519 * Returns pointer to parent state. When it is not NULL we are in
1520 * critical section and pointer protected by hazard pointer.
1521 * When some error occurs, it returns NULL and exit from critical section
1522 * is not needed.
1523 */
static struct dyn_ipv4_state *
dyn_add_ipv4_parent(void *rule, uint32_t ruleid, uint16_t rulenum,
    uint8_t set, const struct ipfw_flow_id *pkt, uint32_t hashval,
    uint32_t version, uint16_t kidx)
{
	struct dyn_ipv4_state *s;
	struct dyn_parent *limit;
	uint32_t bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
	DYN_BUCKET_LOCK(bucket);
	if (version != DYN_BUCKET_VERSION(bucket, ipv4_parent_add)) {
		/*
		 * Bucket version has been changed since last lookup,
		 * do lookup again to be sure that state does not exist.
		 */
		s = dyn_lookup_ipv4_parent_locked(pkt, rule, ruleid,
		    rulenum, bucket);
		if (s != NULL) {
			/*
			 * Simultaneous thread has already created this
			 * state. Just return it.
			 */
			DYNSTATE_CRITICAL_ENTER();
			DYNSTATE_PROTECT(s);
			DYN_BUCKET_UNLOCK(bucket);
			return (s);
		}
	}

	/* Allocate the limit descriptor first, then the state itself. */
	limit = dyn_alloc_parent(rule, ruleid, rulenum, set, hashval);
	if (limit == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		return (NULL);
	}

	s = dyn_alloc_ipv4_state(pkt, kidx, O_LIMIT_PARENT);
	if (s == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		uma_zfree(V_dyn_parent_zone, limit);
		return (NULL);
	}

	/*
	 * Link and publish the new state, then enter the critical
	 * section and protect it *before* dropping the bucket lock so
	 * the returned pointer can not be freed under the caller.
	 */
	s->limit = limit;
	CK_SLIST_INSERT_HEAD(&V_dyn_ipv4_parent[bucket], s, entry);
	DYN_COUNT_INC(dyn_parent_count);
	DYN_BUCKET_VERSION_BUMP(bucket, ipv4_parent_add);
	DYNSTATE_CRITICAL_ENTER();
	DYNSTATE_PROTECT(s);
	DYN_BUCKET_UNLOCK(bucket);
	return (s);
}
1576
1577static int
1578dyn_add_ipv4_state(void *parent, uint32_t ruleid, uint16_t rulenum,
1579    uint8_t set, const struct ipfw_flow_id *pkt, const void *ulp, int pktlen,
1580    uint32_t hashval, struct ipfw_dyn_info *info, uint16_t fibnum,
1581    uint16_t kidx, uint8_t type)
1582{
1583	struct dyn_ipv4_state *s;
1584	void *data;
1585	uint32_t bucket;
1586
1587	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
1588	DYN_BUCKET_LOCK(bucket);
1589	if (info->direction == MATCH_UNKNOWN ||
1590	    info->kidx != kidx ||
1591	    info->hashval != hashval ||
1592	    info->version != DYN_BUCKET_VERSION(bucket, ipv4_add)) {
1593		/*
1594		 * Bucket version has been changed since last lookup,
1595		 * do lookup again to be sure that state does not exist.
1596		 */
1597		if (dyn_lookup_ipv4_state_locked(pkt, ulp, pktlen, parent,
1598		    ruleid, rulenum, bucket, kidx) != 0) {
1599			DYN_BUCKET_UNLOCK(bucket);
1600			return (EEXIST);
1601		}
1602	}
1603
1604	data = dyn_alloc_dyndata(parent, ruleid, rulenum, set, pkt, ulp,
1605	    pktlen, hashval, fibnum);
1606	if (data == NULL) {
1607		DYN_BUCKET_UNLOCK(bucket);
1608		return (ENOMEM);
1609	}
1610
1611	s = dyn_alloc_ipv4_state(pkt, kidx, type);
1612	if (s == NULL) {
1613		DYN_BUCKET_UNLOCK(bucket);
1614		uma_zfree(V_dyn_data_zone, data);
1615		return (ENOMEM);
1616	}
1617
1618	s->data = data;
1619	CK_SLIST_INSERT_HEAD(&V_dyn_ipv4[bucket], s, entry);
1620	DYN_COUNT_INC(dyn_count);
1621	DYN_BUCKET_VERSION_BUMP(bucket, ipv4_add);
1622	DYN_BUCKET_UNLOCK(bucket);
1623	return (0);
1624}
1625
1626#ifdef INET6
1627static struct dyn_ipv6_state *
1628dyn_alloc_ipv6_state(const struct ipfw_flow_id *pkt, uint32_t zoneid,
1629    uint16_t kidx, uint8_t type)
1630{
1631	struct dyn_ipv6_state *s;
1632
1633	s = uma_zalloc(V_dyn_ipv6_zone, M_NOWAIT | M_ZERO);
1634	if (s == NULL)
1635		return (NULL);
1636
1637	s->type = type;
1638	s->kidx = kidx;
1639	s->zoneid = zoneid;
1640	s->proto = pkt->proto;
1641	s->sport = pkt->src_port;
1642	s->dport = pkt->dst_port;
1643	s->src = pkt->src_ip6;
1644	s->dst = pkt->dst_ip6;
1645	return (s);
1646}
1647
1648/*
1649 * Add IPv6 parent state.
1650 * Returns pointer to parent state. When it is not NULL we are in
1651 * critical section and pointer protected by hazard pointer.
1652 * When some error occurs, it return NULL and exit from critical section
1653 * is not needed.
1654 */
static struct dyn_ipv6_state *
dyn_add_ipv6_parent(void *rule, uint32_t ruleid, uint16_t rulenum,
    uint8_t set, const struct ipfw_flow_id *pkt, uint32_t zoneid,
    uint32_t hashval, uint32_t version, uint16_t kidx)
{
	struct dyn_ipv6_state *s;
	struct dyn_parent *limit;
	uint32_t bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
	DYN_BUCKET_LOCK(bucket);
	if (version != DYN_BUCKET_VERSION(bucket, ipv6_parent_add)) {
		/*
		 * Bucket version has been changed since last lookup,
		 * do lookup again to be sure that state does not exist.
		 */
		s = dyn_lookup_ipv6_parent_locked(pkt, zoneid, rule, ruleid,
		    rulenum, bucket);
		if (s != NULL) {
			/*
			 * Simultaneous thread has already created this
			 * state. Just return it.
			 */
			DYNSTATE_CRITICAL_ENTER();
			DYNSTATE_PROTECT(s);
			DYN_BUCKET_UNLOCK(bucket);
			return (s);
		}
	}

	/* Allocate the limit descriptor first, then the state itself. */
	limit = dyn_alloc_parent(rule, ruleid, rulenum, set, hashval);
	if (limit == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		return (NULL);
	}

	s = dyn_alloc_ipv6_state(pkt, zoneid, kidx, O_LIMIT_PARENT);
	if (s == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		uma_zfree(V_dyn_parent_zone, limit);
		return (NULL);
	}

	/*
	 * Link and publish the new state, then enter the critical
	 * section and protect it *before* dropping the bucket lock so
	 * the returned pointer can not be freed under the caller.
	 */
	s->limit = limit;
	CK_SLIST_INSERT_HEAD(&V_dyn_ipv6_parent[bucket], s, entry);
	DYN_COUNT_INC(dyn_parent_count);
	DYN_BUCKET_VERSION_BUMP(bucket, ipv6_parent_add);
	DYNSTATE_CRITICAL_ENTER();
	DYNSTATE_PROTECT(s);
	DYN_BUCKET_UNLOCK(bucket);
	return (s);
}
1707
/*
 * Install a new IPv6 dynamic state in the bucket corresponding to
 * 'hashval'.  Returns 0 on success, EEXIST when a concurrent thread
 * already installed an equivalent state, or ENOMEM on allocation
 * failure.
 */
static int
dyn_add_ipv6_state(void *parent, uint32_t ruleid, uint16_t rulenum,
    uint8_t set, const struct ipfw_flow_id *pkt, uint32_t zoneid,
    const void *ulp, int pktlen, uint32_t hashval, struct ipfw_dyn_info *info,
    uint16_t fibnum, uint16_t kidx, uint8_t type)
{
	struct dyn_ipv6_state *s;
	struct dyn_data *data;
	uint32_t bucket;

	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
	DYN_BUCKET_LOCK(bucket);
	if (info->direction == MATCH_UNKNOWN ||
	    info->kidx != kidx ||
	    info->hashval != hashval ||
	    info->version != DYN_BUCKET_VERSION(bucket, ipv6_add)) {
		/*
		 * Bucket version has been changed since last lookup,
		 * do lookup again to be sure that state does not exist.
		 */
		if (dyn_lookup_ipv6_state_locked(pkt, zoneid, ulp, pktlen,
		    parent, ruleid, rulenum, bucket, kidx) != 0) {
			DYN_BUCKET_UNLOCK(bucket);
			return (EEXIST);
		}
	}

	data = dyn_alloc_dyndata(parent, ruleid, rulenum, set, pkt, ulp,
	    pktlen, hashval, fibnum);
	if (data == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		return (ENOMEM);
	}

	s = dyn_alloc_ipv6_state(pkt, zoneid, kidx, type);
	if (s == NULL) {
		DYN_BUCKET_UNLOCK(bucket);
		uma_zfree(V_dyn_data_zone, data);
		return (ENOMEM);
	}

	/* Publish the state and bump the bucket's 'add' version. */
	s->data = data;
	CK_SLIST_INSERT_HEAD(&V_dyn_ipv6[bucket], s, entry);
	DYN_COUNT_INC(dyn_count);
	DYN_BUCKET_VERSION_BUMP(bucket, ipv6_add);
	DYN_BUCKET_UNLOCK(bucket);
	return (0);
}
1756#endif /* INET6 */
1757
/*
 * Find or create the O_LIMIT_PARENT state for the masked flow id
 * 'pkt' and enforce the session limit.  On success returns an opaque
 * pointer to the parent state (dyn_ipv4_state or dyn_ipv6_state) with
 * its session count already incremented; returns NULL when the limit
 * is reached, allocation fails, or the address family is unsupported.
 */
static void *
dyn_get_parent_state(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    struct ip_fw *rule, uint32_t hashval, uint32_t limit, uint16_t kidx)
{
	char sbuf[24];
	struct dyn_parent *p;
	void *ret;
	uint32_t bucket, version;

	p = NULL;
	ret = NULL;
	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
	DYNSTATE_CRITICAL_ENTER();
	if (IS_IP4_FLOW_ID(pkt)) {
		struct dyn_ipv4_state *s;

		/* Sample the 'add' version before the lockless lookup. */
		version = DYN_BUCKET_VERSION(bucket, ipv4_parent_add);
		s = dyn_lookup_ipv4_parent(pkt, rule, rule->id,
		    rule->rulenum, bucket);
		if (s == NULL) {
			/*
			 * Exit from critical section because dyn_add_parent()
			 * will acquire bucket lock.
			 */
			DYNSTATE_CRITICAL_EXIT();

			s = dyn_add_ipv4_parent(rule, rule->id,
			    rule->rulenum, rule->set, pkt, hashval,
			    version, kidx);
			if (s == NULL)
				return (NULL);
			/* Now we are in critical section again. */
		}
		ret = s;
		p = s->limit;
	}
#ifdef INET6
	else if (IS_IP6_FLOW_ID(pkt)) {
		struct dyn_ipv6_state *s;

		/* Sample the 'add' version before the lockless lookup. */
		version = DYN_BUCKET_VERSION(bucket, ipv6_parent_add);
		s = dyn_lookup_ipv6_parent(pkt, zoneid, rule, rule->id,
		    rule->rulenum, bucket);
		if (s == NULL) {
			/*
			 * Exit from critical section because dyn_add_parent()
			 * can acquire bucket mutex.
			 */
			DYNSTATE_CRITICAL_EXIT();

			s = dyn_add_ipv6_parent(rule, rule->id,
			    rule->rulenum, rule->set, pkt, zoneid, hashval,
			    version, kidx);
			if (s == NULL)
				return (NULL);
			/* Now we are in critical section again. */
		}
		ret = s;
		p = s->limit;
	}
#endif
	else {
		DYNSTATE_CRITICAL_EXIT();
		return (NULL);
	}

	/* Check the limit */
	if (DPARENT_COUNT(p) >= limit) {
		DYNSTATE_CRITICAL_EXIT();
		if (V_fw_verbose && last_log != time_uptime) {
			last_log = time_uptime;
			snprintf(sbuf, sizeof(sbuf), "%u drop session",
			    rule->rulenum);
			print_dyn_rule_flags(pkt, O_LIMIT,
			    LOG_SECURITY | LOG_DEBUG, sbuf,
			    "too many entries");
		}
		return (NULL);
	}

	/* Take new session into account. */
	DPARENT_COUNT_INC(p);
	/*
	 * We must exit from critical section because the following code
	 * can acquire bucket mutex.
	 * We rely on the 'count' field. The state will not expire
	 * until it has some child states, i.e. 'count' field is not zero.
	 * Return state pointer, it will be used by child states as parent.
	 */
	DYNSTATE_CRITICAL_EXIT();
	return (ret);
}
1850
/*
 * Create a new dynamic state for the flow described by 'pkt'.
 * For O_LIMIT the masked flow id is built from 'limit_mask', the
 * parent state is found/created and its session count incremented
 * before the child state is installed; on child installation failure
 * that count is reverted.  Returns 0 on success (including the EEXIST
 * race), or an errno value on failure.
 */
static int
dyn_install_state(const struct ipfw_flow_id *pkt, uint32_t zoneid,
    uint16_t fibnum, const void *ulp, int pktlen, void *rule,
    uint32_t ruleid, uint16_t rulenum, uint8_t set,
    struct ipfw_dyn_info *info, uint32_t limit, uint16_t limit_mask,
    uint16_t kidx, uint8_t type)
{
	struct ipfw_flow_id id;
	uint32_t hashval, parent_hashval;
	int ret;

	MPASS(type == O_LIMIT || type == O_KEEP_STATE);

	if (type == O_LIMIT) {
		/* Create masked flow id and calculate bucket */
		id.addr_type = pkt->addr_type;
		id.proto = pkt->proto;
		id.fib = fibnum; /* unused */
		id.src_port = (limit_mask & DYN_SRC_PORT) ?
		    pkt->src_port: 0;
		id.dst_port = (limit_mask & DYN_DST_PORT) ?
		    pkt->dst_port: 0;
		if (IS_IP4_FLOW_ID(pkt)) {
			id.src_ip = (limit_mask & DYN_SRC_ADDR) ?
			    pkt->src_ip: 0;
			id.dst_ip = (limit_mask & DYN_DST_ADDR) ?
			    pkt->dst_ip: 0;
		}
#ifdef INET6
		else if (IS_IP6_FLOW_ID(pkt)) {
			if (limit_mask & DYN_SRC_ADDR)
				id.src_ip6 = pkt->src_ip6;
			else
				memset(&id.src_ip6, 0, sizeof(id.src_ip6));
			if (limit_mask & DYN_DST_ADDR)
				id.dst_ip6 = pkt->dst_ip6;
			else
				memset(&id.dst_ip6, 0, sizeof(id.dst_ip6));
		}
#endif
		else
			return (EAFNOSUPPORT);

		parent_hashval = hash_parent(&id, rule);
		rule = dyn_get_parent_state(&id, zoneid, rule, parent_hashval,
		    limit, kidx);
		if (rule == NULL) {
#if 0
			if (V_fw_verbose && last_log != time_uptime) {
				last_log = time_uptime;
				snprintf(sbuf, sizeof(sbuf),
				    "%u drop session", rule->rulenum);
			print_dyn_rule_flags(pkt, O_LIMIT,
			    LOG_SECURITY | LOG_DEBUG, sbuf,
			    "too many entries");
			}
#endif
			return (EACCES);
		}
		/*
		 * Limit is not reached, create new state.
		 * Now rule points to parent state.
		 */
	}

	hashval = hash_packet(pkt);
	if (IS_IP4_FLOW_ID(pkt))
		ret = dyn_add_ipv4_state(rule, ruleid, rulenum, set, pkt,
		    ulp, pktlen, hashval, info, fibnum, kidx, type);
#ifdef INET6
	else if (IS_IP6_FLOW_ID(pkt))
		ret = dyn_add_ipv6_state(rule, ruleid, rulenum, set, pkt,
		    zoneid, ulp, pktlen, hashval, info, fibnum, kidx, type);
#endif /* INET6 */
	else
		ret = EAFNOSUPPORT;

	if (type == O_LIMIT) {
		if (ret != 0) {
			/*
			 * We failed to create child state for O_LIMIT
			 * opcode. Since we already counted it in the parent,
			 * we must revert counter back. The 'rule' points to
			 * parent state, use it to get dyn_parent.
			 *
			 * XXXAE: it should be safe to use 'rule' pointer
			 * without extra lookup, parent state is referenced
			 * and should not be freed.
			 */
			if (IS_IP4_FLOW_ID(&id))
				DPARENT_COUNT_DEC(
				    ((struct dyn_ipv4_state *)rule)->limit);
#ifdef INET6
			else if (IS_IP6_FLOW_ID(&id))
				DPARENT_COUNT_DEC(
				    ((struct dyn_ipv6_state *)rule)->limit);
#endif
		}
	}
	/*
	 * EEXIST means that simultaneous thread has created this
	 * state. Consider this as success.
	 *
	 * XXXAE: should we invalidate 'info' content here?
	 */
	if (ret == EEXIST)
		return (0);
	return (ret);
}
1960
1961/*
1962 * Install dynamic state.
1963 *  chain - ipfw's instance;
1964 *  rule - the parent rule that installs the state;
1965 *  cmd - opcode that installs the state;
1966 *  args - ipfw arguments;
1967 *  ulp - upper level protocol header;
1968 *  pktlen - packet length;
1969 *  info - dynamic state lookup info;
1970 *  tablearg - tablearg id.
1971 *
1972 * Returns non-zero value (failure) if state is not installed because
1973 * of errors or because session limitations are enforced.
1974 */
1975int
1976ipfw_dyn_install_state(struct ip_fw_chain *chain, struct ip_fw *rule,
1977    const ipfw_insn_limit *cmd, const struct ip_fw_args *args,
1978    const void *ulp, int pktlen, struct ipfw_dyn_info *info,
1979    uint32_t tablearg)
1980{
1981	uint32_t limit;
1982	uint16_t limit_mask;
1983
1984	if (cmd->o.opcode == O_LIMIT) {
1985		limit = IP_FW_ARG_TABLEARG(chain, cmd->conn_limit, limit);
1986		limit_mask = cmd->limit_mask;
1987	} else {
1988		limit = 0;
1989		limit_mask = 0;
1990	}
1991	return (dyn_install_state(&args->f_id,
1992#ifdef INET6
1993	    IS_IP6_FLOW_ID(&args->f_id) ? dyn_getscopeid(args):
1994#endif
1995	    0, M_GETFIB(args->m), ulp, pktlen, rule, rule->id, rule->rulenum,
1996	    rule->set, info, limit, limit_mask, cmd->o.arg1, cmd->o.opcode));
1997}
1998
1999/*
2000 * Free safe to remove state entries from expired lists.
2001 */
static void
dyn_free_states(struct ip_fw_chain *chain)
{
	struct dyn_ipv4_state *s4, *s4n;
#ifdef INET6
	struct dyn_ipv6_state *s6, *s6n;
#endif
	int cached_count, i;

	/*
	 * We keep pointers to objects that are in use on each CPU
	 * in the per-cpu dyn_hp pointer. When object is going to be
	 * removed, first of it is unlinked from the corresponding
	 * list. This leads to changing of dyn_bucket_xxx_delver version.
	 * Unlinked objects is placed into corresponding dyn_expired_xxx
	 * list. Reader that is going to dereference object pointer checks
	 * dyn_bucket_xxx_delver version before and after storing pointer
	 * into dyn_hp. If version is the same, the object is protected
	 * from freeing and it is safe to dereference. Othervise reader
	 * tries to iterate list again from the beginning, but this object
	 * now unlinked and thus will not be accessible.
	 *
	 * Copy dyn_hp pointers for each CPU into dyn_hp_cache array.
	 * It does not matter that some pointer can be changed in
	 * time while we are copying. We need to check, that objects
	 * removed in the previous pass are not in use. And if dyn_hp
	 * pointer does not contain it in the time when we are copying,
	 * it will not appear there, because it is already unlinked.
	 * And for new pointers we will not free objects that will be
	 * unlinked in this pass.
	 */
	cached_count = 0;
	CPU_FOREACH(i) {
		dyn_hp_cache[cached_count] = DYNSTATE_GET(i);
		if (dyn_hp_cache[cached_count] != NULL)
			cached_count++;
	}

	/*
	 * Free expired states that are safe to free.
	 * Check each entry from previous pass in the dyn_expired_xxx
	 * list, if pointer to the object is in the dyn_hp_cache array,
	 * keep it until next pass. Otherwise it is safe to free the
	 * object.
	 *
	 * XXXAE: optimize this to use SLIST_REMOVE_AFTER.
	 */
	/*
	 * Note: O_LIMIT_PARENT states are additionally kept on the
	 * expired list while their child reference count is non-zero;
	 * the per-type union member (limit vs. data) is released to
	 * the matching UMA zone before the state itself.
	 */
#define	DYN_FREE_STATES(s, next, name)		do {			\
	s = SLIST_FIRST(&V_dyn_expired_ ## name);			\
	while (s != NULL) {						\
		next = SLIST_NEXT(s, expired);				\
		for (i = 0; i < cached_count; i++)			\
			if (dyn_hp_cache[i] == s)			\
				break;					\
		if (i == cached_count) {				\
			if (s->type == O_LIMIT_PARENT &&		\
			    s->limit->count != 0) {			\
				s = next;				\
				continue;				\
			}						\
			SLIST_REMOVE(&V_dyn_expired_ ## name,		\
			    s, dyn_ ## name ## _state, expired);	\
			if (s->type == O_LIMIT_PARENT)			\
				uma_zfree(V_dyn_parent_zone, s->limit);	\
			else						\
				uma_zfree(V_dyn_data_zone, s->data);	\
			uma_zfree(V_dyn_ ## name ## _zone, s);		\
		}							\
		s = next;						\
	}								\
} while (0)

	/*
	 * Protect access to expired lists with DYN_EXPIRED_LOCK.
	 * Userland can invoke ipfw_expire_dyn_states() to delete
	 * specific states, this will lead to modification of expired
	 * lists.
	 *
	 * XXXAE: do we need DYN_EXPIRED_LOCK? We can just use
	 *	  IPFW_UH_WLOCK to protect access to these lists.
	 */
	DYN_EXPIRED_LOCK();
	DYN_FREE_STATES(s4, s4n, ipv4);
#ifdef INET6
	DYN_FREE_STATES(s6, s6n, ipv6);
#endif
	DYN_EXPIRED_UNLOCK();
#undef DYN_FREE_STATES
}
2091
2092/*
2093 * Returns 1 when state is matched by specified range, otherwise returns 0.
2094 */
2095static int
2096dyn_match_range(uint16_t rulenum, uint8_t set, const ipfw_range_tlv *rt)
2097{
2098
2099	MPASS(rt != NULL);
2100	/* flush all states */
2101	if (rt->flags & IPFW_RCFLAG_ALL)
2102		return (1);
2103	if ((rt->flags & IPFW_RCFLAG_SET) != 0 && set != rt->set)
2104		return (0);
2105	if ((rt->flags & IPFW_RCFLAG_RANGE) != 0 &&
2106	    (rulenum < rt->start_rule || rulenum > rt->end_rule))
2107		return (0);
2108	return (1);
2109}
2110
2111static int
2112dyn_match_ipv4_state(struct dyn_ipv4_state *s, const ipfw_range_tlv *rt)
2113{
2114
2115	if (s->type == O_LIMIT_PARENT)
2116		return (dyn_match_range(s->limit->rulenum,
2117		    s->limit->set, rt));
2118
2119	if (s->type == O_LIMIT)
2120		return (dyn_match_range(s->data->rulenum, s->data->set, rt));
2121
2122	if (dyn_match_range(s->data->rulenum, s->data->set, rt))
2123		return (1);
2124
2125	return (0);
2126}
2127
2128#ifdef INET6
2129static int
2130dyn_match_ipv6_state(struct dyn_ipv6_state *s, const ipfw_range_tlv *rt)
2131{
2132
2133	if (s->type == O_LIMIT_PARENT)
2134		return (dyn_match_range(s->limit->rulenum,
2135		    s->limit->set, rt));
2136
2137	if (s->type == O_LIMIT)
2138		return (dyn_match_range(s->data->rulenum, s->data->set, rt));
2139
2140	if (dyn_match_range(s->data->rulenum, s->data->set, rt))
2141		return (1);
2142
2143	return (0);
2144}
2145#endif
2146
2147/*
2148 * Unlink expired entries from states lists.
2149 * @rt can be used to specify the range of states for deletion.
2150 */
2151static void
2152dyn_expire_states(struct ip_fw_chain *chain, ipfw_range_tlv *rt)
2153{
2154	struct dyn_ipv4_slist expired_ipv4;
2155#ifdef INET6
2156	struct dyn_ipv6_slist expired_ipv6;
2157	struct dyn_ipv6_state *s6, *s6n, *s6p;
2158#endif
2159	struct dyn_ipv4_state *s4, *s4n, *s4p;
2160	int bucket, removed, length, max_length;
2161
2162	/*
2163	 * Unlink expired states from each bucket.
2164	 * With acquired bucket lock iterate entries of each lists:
2165	 * ipv4, ipv4_parent, ipv6, and ipv6_parent. Check expired time
2166	 * and unlink entry from the list, link entry into temporary
2167	 * expired_xxx lists then bump "del" bucket version.
2168	 *
2169	 * When an entry is removed, corresponding states counter is
2170	 * decremented. If entry has O_LIMIT type, parent's reference
2171	 * counter is decremented.
2172	 *
2173	 * NOTE: this function can be called from userspace context
2174	 * when user deletes rules. In this case all matched states
2175	 * will be forcedly unlinked. O_LIMIT_PARENT states will be kept
2176	 * in the expired lists until reference counter become zero.
2177	 */
2178#define	DYN_UNLINK_STATES(s, prev, next, exp, af, name, extra)	do {	\
2179	length = 0;							\
2180	removed = 0;							\
2181	prev = NULL;							\
2182	s = CK_SLIST_FIRST(&V_dyn_ ## name [bucket]);			\
2183	while (s != NULL) {						\
2184		next = CK_SLIST_NEXT(s, entry);				\
2185		if ((TIME_LEQ((s)->exp, time_uptime) && extra) ||	\
2186		    (rt != NULL && dyn_match_ ## af ## _state(s, rt))) {\
2187			if (prev != NULL)				\
2188				CK_SLIST_REMOVE_AFTER(prev, entry);	\
2189			else						\
2190				CK_SLIST_REMOVE_HEAD(			\
2191				    &V_dyn_ ## name [bucket], entry);	\
2192			removed++;					\
2193			SLIST_INSERT_HEAD(&expired_ ## af, s, expired);	\
2194			if (s->type == O_LIMIT_PARENT)			\
2195				DYN_COUNT_DEC(dyn_parent_count);	\
2196			else {						\
2197				DYN_COUNT_DEC(dyn_count);		\
2198				if (s->type == O_LIMIT)	{		\
2199					s = s->data->parent;		\
2200					DPARENT_COUNT_DEC(s->limit);	\
2201				}					\
2202			}						\
2203		} else {						\
2204			prev = s;					\
2205			length++;					\
2206		}							\
2207		s = next;						\
2208	}								\
2209	if (removed != 0)						\
2210		DYN_BUCKET_VERSION_BUMP(bucket, name ## _del);		\
2211	if (length > max_length)				\
2212		max_length = length;				\
2213} while (0)
2214
2215	SLIST_INIT(&expired_ipv4);
2216#ifdef INET6
2217	SLIST_INIT(&expired_ipv6);
2218#endif
2219	max_length = 0;
2220	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
2221		DYN_BUCKET_LOCK(bucket);
2222		DYN_UNLINK_STATES(s4, s4p, s4n, data->expire, ipv4, ipv4, 1);
2223		DYN_UNLINK_STATES(s4, s4p, s4n, limit->expire, ipv4,
2224		    ipv4_parent, (s4->limit->count == 0));
2225#ifdef INET6
2226		DYN_UNLINK_STATES(s6, s6p, s6n, data->expire, ipv6, ipv6, 1);
2227		DYN_UNLINK_STATES(s6, s6p, s6n, limit->expire, ipv6,
2228		    ipv6_parent, (s6->limit->count == 0));
2229#endif
2230		DYN_BUCKET_UNLOCK(bucket);
2231	}
2232	/* Update curr_max_length for statistics. */
2233	V_curr_max_length = max_length;
2234	/*
2235	 * Concatenate temporary lists with global expired lists.
2236	 */
2237	DYN_EXPIRED_LOCK();
2238	SLIST_CONCAT(&V_dyn_expired_ipv4, &expired_ipv4,
2239	    dyn_ipv4_state, expired);
2240#ifdef INET6
2241	SLIST_CONCAT(&V_dyn_expired_ipv6, &expired_ipv6,
2242	    dyn_ipv6_state, expired);
2243#endif
2244	DYN_EXPIRED_UNLOCK();
2245#undef DYN_UNLINK_STATES
2246#undef DYN_UNREF_STATES
2247}
2248
/*
 * Allocate and prepare an mbuf of @len bytes for a keep-alive packet
 * bound to FIB @fibnum.  The mbuf is zeroed and flagged with
 * M_SKIP_FIREWALL so it is not matched against the ruleset again.
 * Returns NULL when no mbuf is available (M_NOWAIT allocation).
 */
static struct mbuf *
dyn_mgethdr(int len, uint16_t fibnum)
{
	struct mbuf *m;

	m = m_gethdr(M_NOWAIT, MT_DATA);
	if (m == NULL)
		return (NULL);
#ifdef MAC
	mac_netinet_firewall_send(m);
#endif
	M_SETFIB(m, fibnum);
	/* Leave room for a link-layer header to be prepended later. */
	m->m_data += max_linkhdr;
	m->m_flags |= M_SKIP_FIREWALL;
	m->m_len = m->m_pkthdr.len = len;
	bzero(m->m_data, len);
	return (m);
}
2267
/*
 * Fill a pre-zeroed mbuf with an IPv4 TCP keep-alive segment:
 * a bare ACK from @src:@sport to @dst:@dport carrying @seq/@ack.
 * Addresses and ports are given in host byte order.  The TCP checksum
 * is only seeded with the pseudo-header sum; csum_flags requests
 * hardware/software checksum completion on output.
 */
static void
dyn_make_keepalive_ipv4(struct mbuf *m, in_addr_t src, in_addr_t dst,
    uint32_t seq, uint32_t ack, uint16_t sport, uint16_t dport)
{
	struct tcphdr *tcp;
	struct ip *ip;

	ip = mtod(m, struct ip *);
	ip->ip_v = 4;
	ip->ip_hl = sizeof(*ip) >> 2;
	ip->ip_tos = IPTOS_LOWDELAY;
	ip->ip_len = htons(m->m_len);
	ip->ip_off |= htons(IP_DF);
	ip->ip_ttl = V_ip_defttl;
	ip->ip_p = IPPROTO_TCP;
	ip->ip_src.s_addr = htonl(src);
	ip->ip_dst.s_addr = htonl(dst);

	tcp = mtodo(m, sizeof(struct ip));
	tcp->th_sport = htons(sport);
	tcp->th_dport = htons(dport);
	tcp->th_off = sizeof(struct tcphdr) >> 2;
	tcp->th_seq = htonl(seq);
	tcp->th_ack = htonl(ack);
	tcp->th_flags = TH_ACK;
	tcp->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
	    htons(sizeof(struct tcphdr) + IPPROTO_TCP));

	/* Delegate final TCP checksum computation to the output path. */
	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
	m->m_pkthdr.csum_flags = CSUM_TCP;
}
2299
2300static void
2301dyn_enqueue_keepalive_ipv4(struct mbufq *q, const struct dyn_ipv4_state *s)
2302{
2303	struct mbuf *m;
2304
2305	if ((s->data->state & ACK_FWD) == 0 && s->data->ack_fwd > 0) {
2306		m = dyn_mgethdr(sizeof(struct ip) + sizeof(struct tcphdr),
2307		    s->data->fibnum);
2308		if (m != NULL) {
2309			dyn_make_keepalive_ipv4(m, s->dst, s->src,
2310			    s->data->ack_fwd - 1, s->data->ack_rev,
2311			    s->dport, s->sport);
2312			if (mbufq_enqueue(q, m)) {
2313				m_freem(m);
2314				log(LOG_DEBUG, "ipfw: limit for IPv4 "
2315				    "keepalive queue is reached.\n");
2316				return;
2317			}
2318		}
2319	}
2320
2321	if ((s->data->state & ACK_REV) == 0 && s->data->ack_rev > 0) {
2322		m = dyn_mgethdr(sizeof(struct ip) + sizeof(struct tcphdr),
2323		    s->data->fibnum);
2324		if (m != NULL) {
2325			dyn_make_keepalive_ipv4(m, s->src, s->dst,
2326			    s->data->ack_rev - 1, s->data->ack_fwd,
2327			    s->sport, s->dport);
2328			if (mbufq_enqueue(q, m)) {
2329				m_freem(m);
2330				log(LOG_DEBUG, "ipfw: limit for IPv4 "
2331				    "keepalive queue is reached.\n");
2332				return;
2333			}
2334		}
2335	}
2336}
2337
2338/*
2339 * Prepare and send keep-alive packets.
2340 */
/*
 * Scan all IPv4 buckets, queue keep-alives for established TCP states
 * nearing expiration, then transmit the queued packets after dropping
 * the lock.  Queue size is capped at DYN_KEEPALIVE_MAXQ.
 */
static void
dyn_send_keepalive_ipv4(struct ip_fw_chain *chain)
{
	struct mbufq q;
	struct mbuf *m;
	struct dyn_ipv4_state *s;
	uint32_t bucket;

	mbufq_init(&q, DYN_KEEPALIVE_MAXQ);
	IPFW_UH_RLOCK(chain);
	/*
	 * It is safe to not use hazard pointer and just do lockless
	 * access to the lists, because states entries can not be deleted
	 * while we hold IPFW_UH_RLOCK.
	 */
	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
		CK_SLIST_FOREACH(s, &V_dyn_ipv4[bucket], entry) {
			/*
			 * Only established TCP connections that will
			 * become expired withing dyn_keepalive_interval.
			 */
			if (s->proto != IPPROTO_TCP ||
			    (s->data->state & BOTH_SYN) != BOTH_SYN ||
			    TIME_LEQ(time_uptime + V_dyn_keepalive_interval,
				s->data->expire))
				continue;
			dyn_enqueue_keepalive_ipv4(&q, s);
		}
	}
	IPFW_UH_RUNLOCK(chain);
	/* Transmit outside the lock; ip_output() consumes each mbuf. */
	while ((m = mbufq_dequeue(&q)) != NULL)
		ip_output(m, NULL, NULL, 0, NULL, NULL);
}
2374
2375#ifdef INET6
/*
 * Fill a pre-zeroed mbuf with an IPv6 TCP keep-alive segment (bare ACK)
 * from @src:@sport to @dst:@dport.  For link-local addresses the scope
 * @zoneid is embedded into the address, matching KAME-style internal
 * representation.  The TCP checksum is seeded with the pseudo-header
 * sum; csum_flags requests completion on output.
 */
static void
dyn_make_keepalive_ipv6(struct mbuf *m, const struct in6_addr *src,
    const struct in6_addr *dst, uint32_t zoneid, uint32_t seq, uint32_t ack,
    uint16_t sport, uint16_t dport)
{
	struct tcphdr *tcp;
	struct ip6_hdr *ip6;

	ip6 = mtod(m, struct ip6_hdr *);
	ip6->ip6_vfc |= IPV6_VERSION;
	ip6->ip6_plen = htons(sizeof(struct tcphdr));
	ip6->ip6_nxt = IPPROTO_TCP;
	ip6->ip6_hlim = IPV6_DEFHLIM;
	ip6->ip6_src = *src;
	if (IN6_IS_ADDR_LINKLOCAL(src))
		ip6->ip6_src.s6_addr16[1] = htons(zoneid & 0xffff);
	ip6->ip6_dst = *dst;
	if (IN6_IS_ADDR_LINKLOCAL(dst))
		ip6->ip6_dst.s6_addr16[1] = htons(zoneid & 0xffff);

	tcp = mtodo(m, sizeof(struct ip6_hdr));
	tcp->th_sport = htons(sport);
	tcp->th_dport = htons(dport);
	tcp->th_off = sizeof(struct tcphdr) >> 2;
	tcp->th_seq = htonl(seq);
	tcp->th_ack = htonl(ack);
	tcp->th_flags = TH_ACK;
	tcp->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr),
	    IPPROTO_TCP, 0);

	/* Delegate final TCP checksum computation to the output path. */
	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
	m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
}
2409
/*
 * Enqueue IPv6 keep-alive probes for both directions of TCP state @s
 * that have not yet seen an ACK (ACK_FWD/ACK_REV unset) but have a
 * usable sequence number.  A failed mbuf allocation skips that
 * direction; a full queue stops further probing for this state.
 */
static void
dyn_enqueue_keepalive_ipv6(struct mbufq *q, const struct dyn_ipv6_state *s)
{
	struct mbuf *m;

	/* Probe the forward direction: sent from dst back to src. */
	if ((s->data->state & ACK_FWD) == 0 && s->data->ack_fwd > 0) {
		m = dyn_mgethdr(sizeof(struct ip6_hdr) +
		    sizeof(struct tcphdr), s->data->fibnum);
		if (m != NULL) {
			dyn_make_keepalive_ipv6(m, &s->dst, &s->src,
			    s->zoneid, s->data->ack_fwd - 1, s->data->ack_rev,
			    s->dport, s->sport);
			if (mbufq_enqueue(q, m)) {
				m_freem(m);
				log(LOG_DEBUG, "ipfw: limit for IPv6 "
				    "keepalive queue is reached.\n");
				return;
			}
		}
	}

	/* Probe the reverse direction: sent from src to dst. */
	if ((s->data->state & ACK_REV) == 0 && s->data->ack_rev > 0) {
		m = dyn_mgethdr(sizeof(struct ip6_hdr) +
		    sizeof(struct tcphdr), s->data->fibnum);
		if (m != NULL) {
			dyn_make_keepalive_ipv6(m, &s->src, &s->dst,
			    s->zoneid, s->data->ack_rev - 1, s->data->ack_fwd,
			    s->sport, s->dport);
			if (mbufq_enqueue(q, m)) {
				m_freem(m);
				log(LOG_DEBUG, "ipfw: limit for IPv6 "
				    "keepalive queue is reached.\n");
				return;
			}
		}
	}
}
2447
/*
 * Scan all IPv6 buckets, queue keep-alives for established TCP states
 * nearing expiration, then transmit the queued packets after dropping
 * the lock.  IPv6 counterpart of dyn_send_keepalive_ipv4().
 */
static void
dyn_send_keepalive_ipv6(struct ip_fw_chain *chain)
{
	struct mbufq q;
	struct mbuf *m;
	struct dyn_ipv6_state *s;
	uint32_t bucket;

	mbufq_init(&q, DYN_KEEPALIVE_MAXQ);
	IPFW_UH_RLOCK(chain);
	/*
	 * It is safe to not use hazard pointer and just do lockless
	 * access to the lists, because states entries can not be deleted
	 * while we hold IPFW_UH_RLOCK.
	 */
	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
		CK_SLIST_FOREACH(s, &V_dyn_ipv6[bucket], entry) {
			/*
			 * Only established TCP connections that will
			 * become expired withing dyn_keepalive_interval.
			 */
			if (s->proto != IPPROTO_TCP ||
			    (s->data->state & BOTH_SYN) != BOTH_SYN ||
			    TIME_LEQ(time_uptime + V_dyn_keepalive_interval,
				s->data->expire))
				continue;
			dyn_enqueue_keepalive_ipv6(&q, s);
		}
	}
	IPFW_UH_RUNLOCK(chain);
	/* Transmit outside the lock; ip6_output() consumes each mbuf. */
	while ((m = mbufq_dequeue(&q)) != NULL)
		ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
}
2481#endif /* INET6 */
2482
/*
 * Resize the dynamic-state hash tables to @new buckets (@new must be a
 * power of 2).  All new arrays are allocated and initialized up front
 * (M_WAITOK, no locks held), then under IPFW_UH_WLOCK + IPFW_WLOCK all
 * existing states are re-hashed into the new buckets and the per-vnet
 * pointers are swapped.  The old arrays are freed after the locks are
 * dropped.
 */
static void
dyn_grow_hashtable(struct ip_fw_chain *chain, uint32_t new)
{
#ifdef INET6
	struct dyn_ipv6ck_slist *ipv6, *ipv6_parent;
	uint32_t *ipv6_add, *ipv6_del, *ipv6_parent_add, *ipv6_parent_del;
	struct dyn_ipv6_state *s6;
#endif
	struct dyn_ipv4ck_slist *ipv4, *ipv4_parent;
	uint32_t *ipv4_add, *ipv4_del, *ipv4_parent_add, *ipv4_parent_del;
	struct dyn_ipv4_state *s4;
	struct mtx *bucket_lock;
	void *tmp;
	uint32_t bucket;

	MPASS(powerof2(new));
	DYN_DEBUG("grow hash size %u -> %u", V_curr_dyn_buckets, new);
	/*
	 * Allocate and initialize new lists.
	 * XXXAE: on memory pressure this can disable callout timer.
	 */
	bucket_lock = malloc(new * sizeof(struct mtx), M_IPFW,
	    M_WAITOK | M_ZERO);
	ipv4 = malloc(new * sizeof(struct dyn_ipv4ck_slist), M_IPFW,
	    M_WAITOK | M_ZERO);
	ipv4_parent = malloc(new * sizeof(struct dyn_ipv4ck_slist), M_IPFW,
	    M_WAITOK | M_ZERO);
	ipv4_add = malloc(new * sizeof(uint32_t), M_IPFW, M_WAITOK | M_ZERO);
	ipv4_del = malloc(new * sizeof(uint32_t), M_IPFW, M_WAITOK | M_ZERO);
	ipv4_parent_add = malloc(new * sizeof(uint32_t), M_IPFW,
	    M_WAITOK | M_ZERO);
	ipv4_parent_del = malloc(new * sizeof(uint32_t), M_IPFW,
	    M_WAITOK | M_ZERO);
#ifdef INET6
	ipv6 = malloc(new * sizeof(struct dyn_ipv6ck_slist), M_IPFW,
	    M_WAITOK | M_ZERO);
	ipv6_parent = malloc(new * sizeof(struct dyn_ipv6ck_slist), M_IPFW,
	    M_WAITOK | M_ZERO);
	ipv6_add = malloc(new * sizeof(uint32_t), M_IPFW, M_WAITOK | M_ZERO);
	ipv6_del = malloc(new * sizeof(uint32_t), M_IPFW, M_WAITOK | M_ZERO);
	ipv6_parent_add = malloc(new * sizeof(uint32_t), M_IPFW,
	    M_WAITOK | M_ZERO);
	ipv6_parent_del = malloc(new * sizeof(uint32_t), M_IPFW,
	    M_WAITOK | M_ZERO);
#endif
	for (bucket = 0; bucket < new; bucket++) {
		DYN_BUCKET_LOCK_INIT(bucket_lock, bucket);
		CK_SLIST_INIT(&ipv4[bucket]);
		CK_SLIST_INIT(&ipv4_parent[bucket]);
#ifdef INET6
		CK_SLIST_INIT(&ipv6[bucket]);
		CK_SLIST_INIT(&ipv6_parent[bucket]);
#endif
	}

#define DYN_RELINK_STATES(s, hval, i, head, ohead)	do {		\
	while ((s = CK_SLIST_FIRST(&V_dyn_ ## ohead[i])) != NULL) {	\
		CK_SLIST_REMOVE_HEAD(&V_dyn_ ## ohead[i], entry);	\
		CK_SLIST_INSERT_HEAD(&head[DYN_BUCKET(s->hval, new)],	\
		    s, entry);						\
	}								\
} while (0)
	/*
	 * Prevent rules changing from userland.
	 */
	IPFW_UH_WLOCK(chain);
	/*
	 * Hold traffic processing until we finish resize to
	 * prevent access to states lists.
	 */
	IPFW_WLOCK(chain);
	/* Re-link all dynamic states */
	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
		DYN_RELINK_STATES(s4, data->hashval, bucket, ipv4, ipv4);
		DYN_RELINK_STATES(s4, limit->hashval, bucket, ipv4_parent,
		    ipv4_parent);
#ifdef INET6
		DYN_RELINK_STATES(s6, data->hashval, bucket, ipv6, ipv6);
		DYN_RELINK_STATES(s6, limit->hashval, bucket, ipv6_parent,
		    ipv6_parent);
#endif
	}

#define	DYN_SWAP_PTR(old, new, tmp)	do {		\
	tmp = old;					\
	old = new;					\
	new = tmp;					\
} while (0)
	/* Swap pointers */
	DYN_SWAP_PTR(V_dyn_bucket_lock, bucket_lock, tmp);
	DYN_SWAP_PTR(V_dyn_ipv4, ipv4, tmp);
	DYN_SWAP_PTR(V_dyn_ipv4_parent, ipv4_parent, tmp);
	DYN_SWAP_PTR(V_dyn_ipv4_add, ipv4_add, tmp);
	DYN_SWAP_PTR(V_dyn_ipv4_parent_add, ipv4_parent_add, tmp);
	DYN_SWAP_PTR(V_dyn_ipv4_del, ipv4_del, tmp);
	DYN_SWAP_PTR(V_dyn_ipv4_parent_del, ipv4_parent_del, tmp);

#ifdef INET6
	DYN_SWAP_PTR(V_dyn_ipv6, ipv6, tmp);
	DYN_SWAP_PTR(V_dyn_ipv6_parent, ipv6_parent, tmp);
	DYN_SWAP_PTR(V_dyn_ipv6_add, ipv6_add, tmp);
	DYN_SWAP_PTR(V_dyn_ipv6_parent_add, ipv6_parent_add, tmp);
	DYN_SWAP_PTR(V_dyn_ipv6_del, ipv6_del, tmp);
	DYN_SWAP_PTR(V_dyn_ipv6_parent_del, ipv6_parent_del, tmp);
#endif
	/* Remember the old size; the locals now point at the old arrays. */
	bucket = V_curr_dyn_buckets;
	V_curr_dyn_buckets = new;

	IPFW_WUNLOCK(chain);
	IPFW_UH_WUNLOCK(chain);

	/* Release old resources */
	while (bucket-- != 0)
		DYN_BUCKET_LOCK_DESTROY(bucket_lock, bucket);
	free(bucket_lock, M_IPFW);
	free(ipv4, M_IPFW);
	free(ipv4_parent, M_IPFW);
	free(ipv4_add, M_IPFW);
	free(ipv4_parent_add, M_IPFW);
	free(ipv4_del, M_IPFW);
	free(ipv4_parent_del, M_IPFW);
#ifdef INET6
	free(ipv6, M_IPFW);
	free(ipv6_parent, M_IPFW);
	free(ipv6_add, M_IPFW);
	free(ipv6_parent_add, M_IPFW);
	free(ipv6_del, M_IPFW);
	free(ipv6_parent_del, M_IPFW);
#endif
}
2613
2614/*
2615 * This function is used to perform various maintenance
2616 * on dynamic hash lists. Currently it is called every second.
2617 */
/*
 * Periodic (1 Hz) callout handler; @vnetx is the owning vnet.
 * Frees previously unlinked states, unlinks newly expired ones, sends
 * keep-alives, and grows the hash table when chains get long, then
 * re-arms itself.
 */
static void
dyn_tick(void *vnetx)
{
	uint32_t buckets;

	CURVNET_SET((struct vnet *)vnetx);
	/*
	 * First free states unlinked in previous passes.
	 */
	dyn_free_states(&V_layer3_chain);
	/*
	 * Now unlink others expired states.
	 * We use IPFW_UH_WLOCK to avoid concurrent call of
	 * dyn_expire_states(). It is the only function that does
	 * deletion of state entries from states lists.
	 */
	IPFW_UH_WLOCK(&V_layer3_chain);
	dyn_expire_states(&V_layer3_chain, NULL);
	IPFW_UH_WUNLOCK(&V_layer3_chain);
	/*
	 * Send keepalives if they are enabled and the time has come.
	 */
	if (V_dyn_keepalive != 0 &&
	    V_dyn_keepalive_last + V_dyn_keepalive_period <= time_uptime) {
		V_dyn_keepalive_last = time_uptime;
		dyn_send_keepalive_ipv4(&V_layer3_chain);
#ifdef INET6
		dyn_send_keepalive_ipv6(&V_layer3_chain);
#endif
	}
	/*
	 * Check if we need to resize the hash:
	 * if current number of states exceeds number of buckets in hash,
	 * and dyn_buckets_max permits to grow the number of buckets, then
	 * do it. Grow hash size to the minimum power of 2 which is bigger
	 * than current states count.
	 */
	if (V_curr_dyn_buckets < V_dyn_buckets_max &&
	    (V_curr_dyn_buckets < V_dyn_count / 2 || (
	    V_curr_dyn_buckets < V_dyn_count && V_curr_max_length > 8))) {
		/* 1 << fls(n) is the smallest power of 2 greater than n. */
		buckets = 1 << fls(V_dyn_count);
		if (buckets > V_dyn_buckets_max)
			buckets = V_dyn_buckets_max;
		dyn_grow_hashtable(&V_layer3_chain, buckets);
	}

	/* Re-arm for the next second on this vnet. */
	callout_reset_on(&V_dyn_timeout, hz, dyn_tick, vnetx, 0);
	CURVNET_RESTORE();
}
2667
2668void
2669ipfw_expire_dyn_states(struct ip_fw_chain *chain, ipfw_range_tlv *rt)
2670{
2671	/*
2672	 * Do not perform any checks if we currently have no dynamic states
2673	 */
2674	if (V_dyn_count == 0)
2675		return;
2676
2677	IPFW_UH_WLOCK_ASSERT(chain);
2678	dyn_expire_states(chain, rt);
2679}
2680
2681/*
2682 * Returns size of dynamic states in legacy format
2683 */
2684int
2685ipfw_dyn_len(void)
2686{
2687
2688	return ((V_dyn_count + V_dyn_parent_count) * sizeof(ipfw_dyn_rule));
2689}
2690
2691/*
2692 * Returns number of dynamic states.
2693 * Used by dump format v1 (current).
2694 */
2695uint32_t
2696ipfw_dyn_get_count(void)
2697{
2698
2699	return (V_dyn_count + V_dyn_parent_count);
2700}
2701
2702/*
2703 * Check if rule contains at least one dynamic opcode.
2704 *
2705 * Returns 1 if such opcode is found, 0 otherwise.
2706 */
2707int
2708ipfw_is_dyn_rule(struct ip_fw *rule)
2709{
2710	int cmdlen, l;
2711	ipfw_insn *cmd;
2712
2713	l = rule->cmd_len;
2714	cmd = rule->cmd;
2715	cmdlen = 0;
2716	for ( ;	l > 0 ; l -= cmdlen, cmd += cmdlen) {
2717		cmdlen = F_LEN(cmd);
2718
2719		switch (cmd->opcode) {
2720		case O_LIMIT:
2721		case O_KEEP_STATE:
2722		case O_PROBE_STATE:
2723		case O_CHECK_STATE:
2724			return (1);
2725		}
2726	}
2727
2728	return (0);
2729}
2730
/*
 * Convert an O_LIMIT_PARENT state @p into the legacy ipfw_dyn_rule
 * layout expected by userland.  @kidx is the state-name kernel index.
 * The expire value is exported as seconds remaining (0 if already
 * expired).
 */
static void
dyn_export_parent(const struct dyn_parent *p, uint16_t kidx,
    ipfw_dyn_rule *dst)
{

	dst->dyn_type = O_LIMIT_PARENT;
	dst->kidx = kidx;
	dst->count = (uint16_t)DPARENT_COUNT(p);
	dst->expire = TIME_LEQ(p->expire, time_uptime) ?  0:
	    p->expire - time_uptime;

	/* 'rule' is used to pass up the rule number and set */
	memcpy(&dst->rule, &p->rulenum, sizeof(p->rulenum));
	/* store set number into high word of dst->rule pointer. */
	memcpy((char *)&dst->rule + sizeof(p->rulenum), &p->set,
	    sizeof(p->set));

	/* unused fields */
	dst->pcnt = 0;
	dst->bcnt = 0;
	dst->parent = NULL;
	dst->state = 0;
	dst->ack_fwd = 0;
	dst->ack_rev = 0;
	dst->bucket = p->hashval;
	/*
	 * The legacy userland code will interpret a NULL here as a marker
	 * for the last dynamic rule.
	 */
	dst->next = (ipfw_dyn_rule *)1;
}
2762
/*
 * Convert the dyn_data part of an O_KEEP_STATE/O_LIMIT state into the
 * legacy ipfw_dyn_rule layout.  @type is the state opcode, @kidx the
 * state-name kernel index.  Packet/byte counters are the sums of both
 * flow directions; expire is exported as seconds remaining.
 */
static void
dyn_export_data(const struct dyn_data *data, uint16_t kidx, uint8_t type,
    ipfw_dyn_rule *dst)
{

	dst->dyn_type = type;
	dst->kidx = kidx;
	dst->pcnt = data->pcnt_fwd + data->pcnt_rev;
	dst->bcnt = data->bcnt_fwd + data->bcnt_rev;
	dst->expire = TIME_LEQ(data->expire, time_uptime) ?  0:
	    data->expire - time_uptime;

	/* 'rule' is used to pass up the rule number and set */
	memcpy(&dst->rule, &data->rulenum, sizeof(data->rulenum));
	/* store set number into high word of dst->rule pointer. */
	memcpy((char *)&dst->rule + sizeof(data->rulenum), &data->set,
	    sizeof(data->set));

	/* unused fields */
	dst->parent = NULL;
	dst->state = data->state;
	dst->ack_fwd = data->ack_fwd;
	dst->ack_rev = data->ack_rev;
	dst->count = 0;
	dst->bucket = data->hashval;
	/*
	 * The legacy userland code will interpret a NULL here as a marker
	 * for the last dynamic rule.
	 */
	dst->next = (ipfw_dyn_rule *)1;
}
2794
/*
 * Export an IPv4 dynamic state @s into legacy ipfw_dyn_rule @dst,
 * filling in the flow id fields and zeroing the IPv6 portion.
 */
static void
dyn_export_ipv4_state(const struct dyn_ipv4_state *s, ipfw_dyn_rule *dst)
{

	switch (s->type) {
	case O_LIMIT_PARENT:
		dyn_export_parent(s->limit, s->kidx, dst);
		break;
	default:
		dyn_export_data(s->data, s->kidx, s->type, dst);
	}

	dst->id.dst_ip = s->dst;
	dst->id.src_ip = s->src;
	dst->id.dst_port = s->dport;
	dst->id.src_port = s->sport;
	/*
	 * NOTE(review): fibnum is read via s->data even for
	 * O_LIMIT_PARENT states, whose active union member is s->limit —
	 * confirm that the overlapping layout makes this read safe.
	 */
	dst->id.fib = s->data->fibnum;
	dst->id.proto = s->proto;
	dst->id._flags = 0;
	dst->id.addr_type = 4;

	memset(&dst->id.dst_ip6, 0, sizeof(dst->id.dst_ip6));
	memset(&dst->id.src_ip6, 0, sizeof(dst->id.src_ip6));
	dst->id.flow_id6 = dst->id.extra = 0;
}
2820
2821#ifdef INET6
/*
 * Export an IPv6 dynamic state @s into legacy ipfw_dyn_rule @dst,
 * filling in the flow id fields and zeroing the IPv4 portion.
 */
static void
dyn_export_ipv6_state(const struct dyn_ipv6_state *s, ipfw_dyn_rule *dst)
{

	switch (s->type) {
	case O_LIMIT_PARENT:
		dyn_export_parent(s->limit, s->kidx, dst);
		break;
	default:
		dyn_export_data(s->data, s->kidx, s->type, dst);
	}

	dst->id.src_ip6 = s->src;
	dst->id.dst_ip6 = s->dst;
	dst->id.dst_port = s->dport;
	dst->id.src_port = s->sport;
	/*
	 * NOTE(review): fibnum is read via s->data even for
	 * O_LIMIT_PARENT states, whose active union member is s->limit —
	 * confirm that the overlapping layout makes this read safe.
	 */
	dst->id.fib = s->data->fibnum;
	dst->id.proto = s->proto;
	dst->id._flags = 0;
	dst->id.addr_type = 6;

	dst->id.dst_ip = dst->id.src_ip = 0;
	dst->id.flow_id6 = dst->id.extra = 0;
}
2846#endif /* INET6 */
2847
2848/*
2849 * Fills the buffer given by @sd with dynamic states.
2850 * Used by dump format v1 (current).
2851 *
2852 * Returns 0 on success.
2853 */
/*
 * Fills the buffer given by @sd with dynamic states as an
 * IPFW_TLV_DYNSTATE_LIST container of ipfw_obj_dyntlv entries.
 * Used by dump format v1 (current).  Parent states of each bucket are
 * exported before child/keep-state entries.
 *
 * Returns 0 on success, ENOMEM when @sd runs out of space.
 */
int
ipfw_dump_states(struct ip_fw_chain *chain, struct sockopt_data *sd)
{
#ifdef INET6
	struct dyn_ipv6_state *s6;
#endif
	struct dyn_ipv4_state *s4;
	ipfw_obj_dyntlv *dst, *last;
	ipfw_obj_ctlv *ctlv;
	uint32_t bucket;

	if (V_dyn_count == 0)
		return (0);

	/*
	 * IPFW_UH_RLOCK garantees that another userland request
	 * and callout thread will not delete entries from states
	 * lists.
	 */
	IPFW_UH_RLOCK_ASSERT(chain);

	ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv));
	if (ctlv == NULL)
		return (ENOMEM);
	ctlv->head.type = IPFW_TLV_DYNSTATE_LIST;
	ctlv->objsize = sizeof(ipfw_obj_dyntlv);
	last = NULL;

#define	DYN_EXPORT_STATES(s, af, h, b)				\
	CK_SLIST_FOREACH(s, &V_dyn_ ## h[b], entry) {			\
		dst = (ipfw_obj_dyntlv *)ipfw_get_sopt_space(sd,	\
		    sizeof(ipfw_obj_dyntlv));				\
		if (dst == NULL)					\
			return (ENOMEM);				\
		dyn_export_ ## af ## _state(s, &dst->state);		\
		dst->head.length = sizeof(ipfw_obj_dyntlv);		\
		dst->head.type = IPFW_TLV_DYN_ENT;			\
		last = dst;						\
	}

	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
		DYN_EXPORT_STATES(s4, ipv4, ipv4_parent, bucket);
		DYN_EXPORT_STATES(s4, ipv4, ipv4, bucket);
#ifdef INET6
		DYN_EXPORT_STATES(s6, ipv6, ipv6_parent, bucket);
		DYN_EXPORT_STATES(s6, ipv6, ipv6, bucket);
#endif /* INET6 */
	}

	/* mark last dynamic rule */
	if (last != NULL)
		last->head.flags = IPFW_DF_LAST; /* XXX: unused */
	return (0);
#undef DYN_EXPORT_STATES
}
2909
2910/*
2911 * Fill given buffer with dynamic states (legacy format).
2912 * IPFW_UH_RLOCK has to be held while calling.
2913 */
/*
 * Fill given buffer with dynamic states (legacy format).
 * IPFW_UH_RLOCK has to be held while calling.
 * @pbp is advanced past the exported entries; export stops silently
 * when the buffer end @ep is reached.  The last exported entry gets
 * next == NULL as the end-of-list marker for legacy userland.
 */
void
ipfw_get_dynamic(struct ip_fw_chain *chain, char **pbp, const char *ep)
{
#ifdef INET6
	struct dyn_ipv6_state *s6;
#endif
	struct dyn_ipv4_state *s4;
	ipfw_dyn_rule *p, *last = NULL;
	char *bp;
	uint32_t bucket;

	if (V_dyn_count == 0)
		return;
	bp = *pbp;

	IPFW_UH_RLOCK_ASSERT(chain);

#define	DYN_EXPORT_STATES(s, af, head, b)				\
	CK_SLIST_FOREACH(s, &V_dyn_ ## head[b], entry) {		\
		if (bp + sizeof(*p) > ep)				\
			break;						\
		p = (ipfw_dyn_rule *)bp;				\
		dyn_export_ ## af ## _state(s, p);			\
		last = p;						\
		bp += sizeof(*p);					\
	}

	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
		DYN_EXPORT_STATES(s4, ipv4, ipv4_parent, bucket);
		DYN_EXPORT_STATES(s4, ipv4, ipv4, bucket);
#ifdef INET6
		DYN_EXPORT_STATES(s6, ipv6, ipv6_parent, bucket);
		DYN_EXPORT_STATES(s6, ipv6, ipv6, bucket);
#endif /* INET6 */
	}

	if (last != NULL) /* mark last dynamic rule */
		last->next = NULL;
	*pbp = bp;
#undef DYN_EXPORT_STATES
}
2955
/*
 * Per-VNET initialization of the dynamic states subsystem: default
 * limits and lifetimes, UMA zones, hash tables, and the expiration
 * callout.  Called once per VNET at ipfw attach time.
 */
void
ipfw_dyn_init(struct ip_fw_chain *chain)
{

#ifdef IPFIREWALL_JENKINSHASH
	V_dyn_hashseed = arc4random();
#endif
	V_dyn_max = 16384;		/* max # of states */
	V_dyn_parent_max = 4096;	/* max # of parent states */
	V_dyn_buckets_max = 8192;	/* must be power of 2 */

	/* Default state lifetimes, in seconds. */
	V_dyn_ack_lifetime = 300;
	V_dyn_syn_lifetime = 20;
	V_dyn_fin_lifetime = 1;
	V_dyn_rst_lifetime = 1;
	V_dyn_udp_lifetime = 10;
	V_dyn_short_lifetime = 5;

	/* Keepalive settings for idle TCP states. */
	V_dyn_keepalive_interval = 20;
	V_dyn_keepalive_period = 5;
	V_dyn_keepalive = 1;		/* send keepalives */
	V_dyn_keepalive_last = time_uptime;

	/* Zone for per-state data; capped at the max number of states. */
	V_dyn_data_zone = uma_zcreate("IPFW dynamic states data",
	    sizeof(struct dyn_data), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	uma_zone_set_max(V_dyn_data_zone, V_dyn_max);

	/* Zone for parent (limit) states, with its own cap. */
	V_dyn_parent_zone = uma_zcreate("IPFW parent dynamic states",
	    sizeof(struct dyn_parent), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	uma_zone_set_max(V_dyn_parent_zone, V_dyn_parent_max);

	/* IPv4 state list heads; hash arrays are allocated below. */
	SLIST_INIT(&V_dyn_expired_ipv4);
	V_dyn_ipv4 = NULL;
	V_dyn_ipv4_parent = NULL;
	V_dyn_ipv4_zone = uma_zcreate("IPFW IPv4 dynamic states",
	    sizeof(struct dyn_ipv4_state), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);

#ifdef INET6
	SLIST_INIT(&V_dyn_expired_ipv6);
	V_dyn_ipv6 = NULL;
	V_dyn_ipv6_parent = NULL;
	V_dyn_ipv6_zone = uma_zcreate("IPFW IPv6 dynamic states",
	    sizeof(struct dyn_ipv6_state), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
#endif

	/* Initialize buckets. */
	V_curr_dyn_buckets = 0;
	V_dyn_bucket_lock = NULL;
	dyn_grow_hashtable(chain, 256);

	/*
	 * dyn_hp_cache is not virtualized (no V_ prefix), hence it is
	 * allocated only once, from the default VNET.
	 */
	if (IS_DEFAULT_VNET(curvnet))
		dyn_hp_cache = malloc(mp_ncpus * sizeof(void *), M_IPFW,
		    M_WAITOK | M_ZERO);

	/* Arm the expiration callout last, once everything is set up. */
	DYN_EXPIRED_LOCK_INIT();
	callout_init(&V_dyn_timeout, 1);
	callout_reset(&V_dyn_timeout, hz, dyn_tick, curvnet);
	IPFW_ADD_OBJ_REWRITER(IS_DEFAULT_VNET(curvnet), dyn_opcodes);
}
3019
/*
 * Per-VNET teardown, called in two passes: pass 0 only stops and
 * drains the expiration callout; pass 1 frees all remaining states,
 * zones, hash arrays and locks.
 */
void
ipfw_dyn_uninit(int pass)
{
#ifdef INET6
	struct dyn_ipv6_state *s6;
#endif
	struct dyn_ipv4_state *s4;
	int bucket;

	if (pass == 0) {
		callout_drain(&V_dyn_timeout);
		return;
	}
	IPFW_DEL_OBJ_REWRITER(IS_DEFAULT_VNET(curvnet), dyn_opcodes);
	DYN_EXPIRED_LOCK_DESTROY();

	/*
	 * Unconditionally free every state left on list V_dyn_<name>.
	 * CK selects between CK_SLIST_* (hash chains) and plain SLIST_*
	 * (expired lists); the auxiliary object (parent or data) is
	 * released to its zone before the state itself.  No locks are
	 * needed: the callout is drained and userland can no longer
	 * reach this VNET's state.
	 */
#define	DYN_FREE_STATES_FORCED(CK, s, af, name, en)	do {		\
	while ((s = CK ## SLIST_FIRST(&V_dyn_ ## name)) != NULL) {	\
		CK ## SLIST_REMOVE_HEAD(&V_dyn_ ## name, en);	\
		if (s->type == O_LIMIT_PARENT)				\
			uma_zfree(V_dyn_parent_zone, s->limit);		\
		else							\
			uma_zfree(V_dyn_data_zone, s->data);		\
		uma_zfree(V_dyn_ ## af ## _zone, s);			\
	}								\
} while (0)
	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
		DYN_BUCKET_LOCK_DESTROY(V_dyn_bucket_lock, bucket);

		DYN_FREE_STATES_FORCED(CK_, s4, ipv4, ipv4[bucket], entry);
		DYN_FREE_STATES_FORCED(CK_, s4, ipv4, ipv4_parent[bucket],
		    entry);
#ifdef INET6
		DYN_FREE_STATES_FORCED(CK_, s6, ipv6, ipv6[bucket], entry);
		DYN_FREE_STATES_FORCED(CK_, s6, ipv6, ipv6_parent[bucket],
		    entry);
#endif /* INET6 */
	}
	/* States queued for delayed destruction are freed directly too. */
	DYN_FREE_STATES_FORCED(, s4, ipv4, expired_ipv4, expired);
#ifdef INET6
	DYN_FREE_STATES_FORCED(, s6, ipv6, expired_ipv6, expired);
#endif
#undef DYN_FREE_STATES_FORCED

	uma_zdestroy(V_dyn_ipv4_zone);
	uma_zdestroy(V_dyn_data_zone);
	uma_zdestroy(V_dyn_parent_zone);
#ifdef INET6
	uma_zdestroy(V_dyn_ipv6_zone);
	free(V_dyn_ipv6, M_IPFW);
	free(V_dyn_ipv6_parent, M_IPFW);
	free(V_dyn_ipv6_add, M_IPFW);
	free(V_dyn_ipv6_parent_add, M_IPFW);
	free(V_dyn_ipv6_del, M_IPFW);
	free(V_dyn_ipv6_parent_del, M_IPFW);
#endif
	free(V_dyn_bucket_lock, M_IPFW);
	free(V_dyn_ipv4, M_IPFW);
	free(V_dyn_ipv4_parent, M_IPFW);
	free(V_dyn_ipv4_add, M_IPFW);
	free(V_dyn_ipv4_parent_add, M_IPFW);
	free(V_dyn_ipv4_del, M_IPFW);
	free(V_dyn_ipv4_parent_del, M_IPFW);
	/* The global (non-VNET) hazard-pointer cache: default VNET only. */
	if (IS_DEFAULT_VNET(curvnet))
		free(dyn_hp_cache, M_IPFW);
}
3086
3087
3088