ip_fw_dynamic.c revision 340538
1/*-
2 * Copyright (c) 2017-2018 Yandex LLC
3 * Copyright (c) 2017-2018 Andrey V. Elsukov <ae@FreeBSD.org>
4 * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD: stable/11/sys/netpfil/ipfw/ip_fw_dynamic.c 340538 2018-11-18 00:27:47Z ae $");
30
31#include "opt_inet.h"
32#include "opt_inet6.h"
33#include "opt_ipfw.h"
34#ifndef INET
35#error IPFIREWALL requires INET.
36#endif /* INET */
37
38#include <sys/param.h>
39#include <sys/systm.h>
40#include <sys/hash.h>
41#include <sys/mbuf.h>
42#include <sys/kernel.h>
43#include <sys/lock.h>
44#include <sys/pcpu.h>
45#include <sys/queue.h>
46#include <sys/rmlock.h>
47#include <sys/smp.h>
48#include <sys/socket.h>
49#include <sys/sysctl.h>
50#include <sys/syslog.h>
51#include <net/ethernet.h>
52#include <net/if.h>
53#include <net/if_var.h>
54#include <net/pfil.h>
55#include <net/vnet.h>
56
57#include <netinet/in.h>
58#include <netinet/ip.h>
59#include <netinet/ip_var.h>
60#include <netinet/ip_fw.h>
61#include <netinet/tcp_var.h>
62#include <netinet/udp.h>
63
64#include <netinet/ip6.h>	/* IN6_ARE_ADDR_EQUAL */
65#ifdef INET6
66#include <netinet6/in6_var.h>
67#include <netinet6/ip6_var.h>
68#include <netinet6/scope6_var.h>
69#endif
70
71#include <netpfil/ipfw/ip_fw_private.h>
72
73#include <machine/in_cksum.h>	/* XXX for in_cksum */
74
75#ifdef MAC
76#include <security/mac/mac_framework.h>
77#endif
78#include <ck_queue.h>
79
80/*
81 * Description of dynamic states.
82 *
83 * Dynamic states are stored in lists accessed through hash tables
84 * whose size is curr_dyn_buckets. This value can be modified through
85 * the sysctl variable dyn_buckets.
86 *
87 * Currently there are four tables: dyn_ipv4, dyn_ipv6, dyn_ipv4_parent,
88 * and dyn_ipv6_parent.
89 *
90 * When a packet is received, its address fields are hashed, then matched
91 * against the entries in the list selected by addr_type.
92 * Dynamic states can be used for different purposes:
93 *  + stateful rules;
94 *  + enforcing limits on the number of sessions;
95 *  + in-kernel NAT (not implemented yet)
96 *
97 * The lifetime of dynamic states is regulated by dyn_*_lifetime,
98 * measured in seconds and depending on the flags.
99 *
100 * The total number of dynamic states is equal to the UMA zone items count.
101 * The max number of dynamic states is dyn_max. When we reach
102 * the maximum number of states we do not create any more. This is
103 * done to avoid consuming too much memory, but also too much
104 * time when searching on each packet (ideally, we should try instead
105 * to put a limit on the length of the list in each bucket...).
106 *
107 * Each state holds a pointer to the parent ipfw rule so we know what
108 * action to perform. Dynamic rules are removed when the parent rule is
109 * deleted.
110 *
111 * There are some limitations with dynamic rules -- we do not
112 * obey the 'randomized match', and we do not do multiple
113 * passes through the firewall. XXX check the latter!!!
114 */
115
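/*
 * Illustrative sketch (not part of the build): a lookup selects a bucket
 * by masking the commutative flow hash, then walks that bucket's CK list
 * under a hazard pointer:
 *
 *	bucket = DYN_BUCKET(hash_packet(&args->f_id), V_curr_dyn_buckets);
 *	CK_SLIST_FOREACH(s, &V_dyn_ipv4[bucket], entry) {
 *		DYNSTATE_PROTECT(s);
 *		(match the 5-tuple in forward or reverse direction)
 *	}
 */
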
116/* By default use the Jenkins hash function. */
117#define	IPFIREWALL_JENKINSHASH
118
119#define	DYN_COUNTER_INC(d, dir, pktlen)	do {	\
120	(d)->pcnt_ ## dir++;			\
121	(d)->bcnt_ ## dir += pktlen;		\
122	} while (0)
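
/*
 * E.g. DYN_COUNTER_INC(data, fwd, pktlen) expands to
 * data->pcnt_fwd++; data->bcnt_fwd += pktlen;
 * i.e. pcnt_* counts packets and bcnt_* counts bytes.
 */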
123
124struct dyn_data {
125	void		*parent;	/* pointer to parent rule */
126	uint32_t	chain_id;	/* cached ruleset id */
127	uint32_t	f_pos;		/* cached rule index */
128
129	uint32_t	hashval;	/* hash value used for hash resize */
130	uint16_t	fibnum;		/* fib used to send keepalives */
131	uint8_t		_pad[3];
132	uint8_t		set;		/* parent rule set number */
133	uint16_t	rulenum;	/* parent rule number */
134	uint32_t	ruleid;		/* parent rule id */
135
136	uint32_t	state;		/* TCP session state and flags */
137	uint32_t	ack_fwd;	/* most recent ACKs in forward */
138	uint32_t	ack_rev;	/* and reverse direction (used */
139					/* to generate keepalives) */
140	uint32_t	sync;		/* synchronization time */
141	uint32_t	expire;		/* expire time */
142
143	uint64_t	pcnt_fwd;	/* packets counter in forward */
144	uint64_t	bcnt_fwd;	/* bytes counter in forward */
145	uint64_t	pcnt_rev;	/* packets counter in reverse */
146	uint64_t	bcnt_rev;	/* bytes counter in reverse */
147};
148
149#define	DPARENT_COUNT_DEC(p)	do {			\
150	MPASS(p->count > 0);				\
151	ck_pr_dec_32(&(p)->count);			\
152} while (0)
153#define	DPARENT_COUNT_INC(p)	ck_pr_inc_32(&(p)->count)
154#define	DPARENT_COUNT(p)	ck_pr_load_32(&(p)->count)
155struct dyn_parent {
156	void		*parent;	/* pointer to parent rule */
157	uint32_t	count;		/* number of linked states */
158	uint8_t		_pad;
159	uint8_t		set;		/* parent rule set number */
160	uint16_t	rulenum;	/* parent rule number */
161	uint32_t	ruleid;		/* parent rule id */
162	uint32_t	hashval;	/* hash value used for hash resize */
163	uint32_t	expire;		/* expire time */
164};
165
166struct dyn_ipv4_state {
167	uint8_t		type;		/* State type */
168	uint8_t		proto;		/* UL Protocol */
169	uint16_t	kidx;		/* named object index */
170	uint16_t	sport, dport;	/* ULP source and destination ports */
171	in_addr_t	src, dst;	/* IPv4 source and destination */
172
173	union {
174		struct dyn_data	*data;
175		struct dyn_parent *limit;
176	};
177	CK_SLIST_ENTRY(dyn_ipv4_state)	entry;
178	SLIST_ENTRY(dyn_ipv4_state)	expired;
179};
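
/*
 * Note: the type field selects the union member: O_LIMIT_PARENT states
 * use 'limit', all other states use 'data'.
 */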
180CK_SLIST_HEAD(dyn_ipv4ck_slist, dyn_ipv4_state);
181static VNET_DEFINE(struct dyn_ipv4ck_slist *, dyn_ipv4);
182static VNET_DEFINE(struct dyn_ipv4ck_slist *, dyn_ipv4_parent);
183
184SLIST_HEAD(dyn_ipv4_slist, dyn_ipv4_state);
185static VNET_DEFINE(struct dyn_ipv4_slist, dyn_expired_ipv4);
186#define	V_dyn_ipv4			VNET(dyn_ipv4)
187#define	V_dyn_ipv4_parent		VNET(dyn_ipv4_parent)
188#define	V_dyn_expired_ipv4		VNET(dyn_expired_ipv4)
189
190#ifdef INET6
191struct dyn_ipv6_state {
192	uint8_t		type;		/* State type */
193	uint8_t		proto;		/* UL Protocol */
194	uint16_t	kidx;		/* named object index */
195	uint16_t	sport, dport;	/* ULP source and destination ports */
196	struct in6_addr	src, dst;	/* IPv6 source and destination */
197	uint32_t	zoneid;		/* IPv6 scope zone id */
198	union {
199		struct dyn_data	*data;
200		struct dyn_parent *limit;
201	};
202	CK_SLIST_ENTRY(dyn_ipv6_state)	entry;
203	SLIST_ENTRY(dyn_ipv6_state)	expired;
204};
205CK_SLIST_HEAD(dyn_ipv6ck_slist, dyn_ipv6_state);
206static VNET_DEFINE(struct dyn_ipv6ck_slist *, dyn_ipv6);
207static VNET_DEFINE(struct dyn_ipv6ck_slist *, dyn_ipv6_parent);
208
209SLIST_HEAD(dyn_ipv6_slist, dyn_ipv6_state);
210static VNET_DEFINE(struct dyn_ipv6_slist, dyn_expired_ipv6);
211#define	V_dyn_ipv6			VNET(dyn_ipv6)
212#define	V_dyn_ipv6_parent		VNET(dyn_ipv6_parent)
213#define	V_dyn_expired_ipv6		VNET(dyn_expired_ipv6)
214#endif /* INET6 */
215
216/*
217 * A per-CPU pointer indicates that the specified state is currently in
218 * use and must not be reclaimed by the expiration callout.
219 */
220static void **dyn_hp_cache;
221static DPCPU_DEFINE(void *, dyn_hp);
222#define	DYNSTATE_GET(cpu)	ck_pr_load_ptr(DPCPU_ID_PTR((cpu), dyn_hp))
223#define	DYNSTATE_PROTECT(v)	ck_pr_store_ptr(DPCPU_PTR(dyn_hp), (v))
224#define	DYNSTATE_RELEASE()	DYNSTATE_PROTECT(NULL)
225#define	DYNSTATE_CRITICAL_ENTER()	critical_enter()
226#define	DYNSTATE_CRITICAL_EXIT()	do {	\
227	DYNSTATE_RELEASE();			\
228	critical_exit();			\
229} while (0)
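
/*
 * Typical usage of the hazard pointer macros (illustrative):
 *
 *	DYNSTATE_CRITICAL_ENTER();
 *	s = dyn_lookup_ipv4_state(&args->f_id, ulp, info, pktlen);
 *	if (s != NULL)
 *		(the state cannot be reclaimed while it is protected)
 *	DYNSTATE_CRITICAL_EXIT();
 */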
230
231/*
232 * We keep two version numbers per bucket: one is updated when a new
233 * entry is added to the list, the second when an entry is deleted.
234 * Versions are updated under the bucket lock.
235 *
236 * The bucket "add" version number is used to detect whether, between a
237 * state lookup (i.e. ipfw_dyn_lookup_state()) and the subsequent state
238 * creation (i.e. ipfw_dyn_install_state()), another concurrent thread
239 * installed a state into this bucket. Using this info we can avoid an
240 * additional state lookup, because we are sure we will not install
241 * the state twice.
242 *
243 * Also, by tracking the bucket "del" version during lookup we can be
244 * sure that a state entry was not unlinked and freed between reading
245 * the state pointer and protecting it with a hazard pointer.
246 *
247 * An entry unlinked from a CK list remains unchanged until it is freed.
248 * Unlinked entries are linked into the expired lists via the "expired" field.
249 */
250
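/*
 * Sketch of the "add" version check as performed by dyn_add_ipv4_state()
 * (illustrative):
 *
 *	info->version = DYN_BUCKET_VERSION(bucket, ipv4_add);	(at lookup)
 *	...
 *	DYN_BUCKET_LOCK(bucket);
 *	if (info->version != DYN_BUCKET_VERSION(bucket, ipv4_add))
 *		(repeat the lookup under the bucket lock)
 */
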
251/*
252 * dyn_expire_lock is used to protect access to dyn_expired_xxx lists.
253 * dyn_bucket_lock is used to get write access to the lists in a specific
254 * bucket. Currently one dyn_bucket_lock per bucket covers the ipv4,
255 * ipv4_parent, ipv6, and ipv6_parent lists of that bucket.
256 */
257static VNET_DEFINE(struct mtx, dyn_expire_lock);
258static VNET_DEFINE(struct mtx *, dyn_bucket_lock);
259#define	V_dyn_expire_lock		VNET(dyn_expire_lock)
260#define	V_dyn_bucket_lock		VNET(dyn_bucket_lock)
261
262/*
263 * Bucket's add/delete generation versions.
264 */
265static VNET_DEFINE(uint32_t *, dyn_ipv4_add);
266static VNET_DEFINE(uint32_t *, dyn_ipv4_del);
267static VNET_DEFINE(uint32_t *, dyn_ipv4_parent_add);
268static VNET_DEFINE(uint32_t *, dyn_ipv4_parent_del);
269#define	V_dyn_ipv4_add			VNET(dyn_ipv4_add)
270#define	V_dyn_ipv4_del			VNET(dyn_ipv4_del)
271#define	V_dyn_ipv4_parent_add		VNET(dyn_ipv4_parent_add)
272#define	V_dyn_ipv4_parent_del		VNET(dyn_ipv4_parent_del)
273
274#ifdef INET6
275static VNET_DEFINE(uint32_t *, dyn_ipv6_add);
276static VNET_DEFINE(uint32_t *, dyn_ipv6_del);
277static VNET_DEFINE(uint32_t *, dyn_ipv6_parent_add);
278static VNET_DEFINE(uint32_t *, dyn_ipv6_parent_del);
279#define	V_dyn_ipv6_add			VNET(dyn_ipv6_add)
280#define	V_dyn_ipv6_del			VNET(dyn_ipv6_del)
281#define	V_dyn_ipv6_parent_add		VNET(dyn_ipv6_parent_add)
282#define	V_dyn_ipv6_parent_del		VNET(dyn_ipv6_parent_del)
283#endif /* INET6 */
284
285#define	DYN_BUCKET(h, b)		((h) & ((b) - 1))
286#define	DYN_BUCKET_VERSION(b, v)	ck_pr_load_32(&V_dyn_ ## v[(b)])
287#define	DYN_BUCKET_VERSION_BUMP(b, v)	ck_pr_inc_32(&V_dyn_ ## v[(b)])
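
/*
 * Note: DYN_BUCKET() assumes that the number of buckets is a power of
 * two; sysctl_dyn_buckets() below enforces this, so e.g.
 * DYN_BUCKET(h, 1024) is simply (h & 1023).
 */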
288
289#define	DYN_BUCKET_LOCK_INIT(lock, b)		\
290    mtx_init(&lock[(b)], "IPFW dynamic bucket", NULL, MTX_DEF)
291#define	DYN_BUCKET_LOCK_DESTROY(lock, b)	mtx_destroy(&lock[(b)])
292#define	DYN_BUCKET_LOCK(b)	mtx_lock(&V_dyn_bucket_lock[(b)])
293#define	DYN_BUCKET_UNLOCK(b)	mtx_unlock(&V_dyn_bucket_lock[(b)])
294#define	DYN_BUCKET_ASSERT(b)	mtx_assert(&V_dyn_bucket_lock[(b)], MA_OWNED)
295
296#define	DYN_EXPIRED_LOCK_INIT()		\
297    mtx_init(&V_dyn_expire_lock, "IPFW expired states list", NULL, MTX_DEF)
298#define	DYN_EXPIRED_LOCK_DESTROY()	mtx_destroy(&V_dyn_expire_lock)
299#define	DYN_EXPIRED_LOCK()		mtx_lock(&V_dyn_expire_lock)
300#define	DYN_EXPIRED_UNLOCK()		mtx_unlock(&V_dyn_expire_lock)
301
302static VNET_DEFINE(uint32_t, dyn_buckets_max);
303static VNET_DEFINE(uint32_t, curr_dyn_buckets);
304static VNET_DEFINE(struct callout, dyn_timeout);
305#define	V_dyn_buckets_max		VNET(dyn_buckets_max)
306#define	V_curr_dyn_buckets		VNET(curr_dyn_buckets)
307#define	V_dyn_timeout			VNET(dyn_timeout)
308
309/* Maximum length of states chain in a bucket */
310static VNET_DEFINE(uint32_t, curr_max_length);
311#define	V_curr_max_length		VNET(curr_max_length)
312
313static VNET_DEFINE(uint32_t, dyn_keep_states);
314#define	V_dyn_keep_states		VNET(dyn_keep_states)
315
316static VNET_DEFINE(uma_zone_t, dyn_data_zone);
317static VNET_DEFINE(uma_zone_t, dyn_parent_zone);
318static VNET_DEFINE(uma_zone_t, dyn_ipv4_zone);
319#ifdef INET6
320static VNET_DEFINE(uma_zone_t, dyn_ipv6_zone);
321#define	V_dyn_ipv6_zone			VNET(dyn_ipv6_zone)
322#endif /* INET6 */
323#define	V_dyn_data_zone			VNET(dyn_data_zone)
324#define	V_dyn_parent_zone		VNET(dyn_parent_zone)
325#define	V_dyn_ipv4_zone			VNET(dyn_ipv4_zone)
326
327/*
328 * Timeouts for various events in handling dynamic rules.
329 */
330static VNET_DEFINE(uint32_t, dyn_ack_lifetime);
331static VNET_DEFINE(uint32_t, dyn_syn_lifetime);
332static VNET_DEFINE(uint32_t, dyn_fin_lifetime);
333static VNET_DEFINE(uint32_t, dyn_rst_lifetime);
334static VNET_DEFINE(uint32_t, dyn_udp_lifetime);
335static VNET_DEFINE(uint32_t, dyn_short_lifetime);
336
337#define	V_dyn_ack_lifetime		VNET(dyn_ack_lifetime)
338#define	V_dyn_syn_lifetime		VNET(dyn_syn_lifetime)
339#define	V_dyn_fin_lifetime		VNET(dyn_fin_lifetime)
340#define	V_dyn_rst_lifetime		VNET(dyn_rst_lifetime)
341#define	V_dyn_udp_lifetime		VNET(dyn_udp_lifetime)
342#define	V_dyn_short_lifetime		VNET(dyn_short_lifetime)
343
344/*
345 * Keepalives are sent if dyn_keepalive is set. They are sent every
346 * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
347 * seconds of the lifetime of a rule.
348 * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower
349 * than dyn_keepalive_period.
350 */
351#define	DYN_KEEPALIVE_MAXQ		512
352static VNET_DEFINE(uint32_t, dyn_keepalive_interval);
353static VNET_DEFINE(uint32_t, dyn_keepalive_period);
354static VNET_DEFINE(uint32_t, dyn_keepalive);
355static VNET_DEFINE(time_t, dyn_keepalive_last);
356
357#define	V_dyn_keepalive_interval	VNET(dyn_keepalive_interval)
358#define	V_dyn_keepalive_period		VNET(dyn_keepalive_period)
359#define	V_dyn_keepalive			VNET(dyn_keepalive)
360#define	V_dyn_keepalive_last		VNET(dyn_keepalive_last)
361
362static VNET_DEFINE(uint32_t, dyn_max);		/* max # of dynamic states */
363static VNET_DEFINE(uint32_t, dyn_count);	/* number of states */
364static VNET_DEFINE(uint32_t, dyn_parent_max);	/* max # of parent states */
365static VNET_DEFINE(uint32_t, dyn_parent_count);	/* number of parent states */
366
367#define	V_dyn_max			VNET(dyn_max)
368#define	V_dyn_count			VNET(dyn_count)
369#define	V_dyn_parent_max		VNET(dyn_parent_max)
370#define	V_dyn_parent_count		VNET(dyn_parent_count)
371
372#define	DYN_COUNT_DEC(name)	do {			\
373	MPASS((V_ ## name) > 0);			\
374	ck_pr_dec_32(&(V_ ## name));			\
375} while (0)
376#define	DYN_COUNT_INC(name)	ck_pr_inc_32(&(V_ ## name))
377#define	DYN_COUNT(name)		ck_pr_load_32(&(V_ ## name))
378
379static time_t last_log;	/* Log ratelimiting */
380
381/*
382 * Get/set maximum number of dynamic states in given VNET instance.
383 */
384static int
385sysctl_dyn_max(SYSCTL_HANDLER_ARGS)
386{
387	uint32_t nstates;
388	int error;
389
390	nstates = V_dyn_max;
391	error = sysctl_handle_32(oidp, &nstates, 0, req);
392	/* Read operation or some error */
393	if ((error != 0) || (req->newptr == NULL))
394		return (error);
395
396	V_dyn_max = nstates;
397	uma_zone_set_max(V_dyn_data_zone, V_dyn_max);
398	return (0);
399}
400
401static int
402sysctl_dyn_parent_max(SYSCTL_HANDLER_ARGS)
403{
404	uint32_t nstates;
405	int error;
406
407	nstates = V_dyn_parent_max;
408	error = sysctl_handle_32(oidp, &nstates, 0, req);
409	/* Read operation or some error */
410	if ((error != 0) || (req->newptr == NULL))
411		return (error);
412
413	V_dyn_parent_max = nstates;
414	uma_zone_set_max(V_dyn_parent_zone, V_dyn_parent_max);
415	return (0);
416}
417
418static int
419sysctl_dyn_buckets(SYSCTL_HANDLER_ARGS)
420{
421	uint32_t nbuckets;
422	int error;
423
424	nbuckets = V_dyn_buckets_max;
425	error = sysctl_handle_32(oidp, &nbuckets, 0, req);
426	/* Read operation or some error */
427	if ((error != 0) || (req->newptr == NULL))
428		return (error);
429
430	if (nbuckets > 256)
431		V_dyn_buckets_max = 1 << fls(nbuckets - 1);
432	else
433		return (EINVAL);
434	return (0);
435}
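
/*
 * Example (illustrative): a value written to net.inet.ip.fw.dyn_buckets
 * is rounded up to the next power of two, e.g. 1000 becomes 1024.
 * Values of 256 or less are rejected with EINVAL.
 */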
436
437SYSCTL_DECL(_net_inet_ip_fw);
438
439SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_count,
440    CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(dyn_count), 0,
441    "Current number of dynamic states.");
442SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_parent_count,
443    CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(dyn_parent_count), 0,
444    "Current number of parent states. ");
445SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets,
446    CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0,
447    "Current number of buckets for states hash table.");
448SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, curr_max_length,
449    CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(curr_max_length), 0,
450    "Current maximum length of states chains in hash buckets.");
451SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_buckets,
452    CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RW, 0, 0, sysctl_dyn_buckets,
453    "IU", "Max number of buckets for dynamic states hash table.");
454SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max,
455    CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RW, 0, 0, sysctl_dyn_max,
456    "IU", "Max number of dynamic states.");
457SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_parent_max,
458    CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RW, 0, 0, sysctl_dyn_parent_max,
459    "IU", "Max number of parent dynamic states.");
460SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime,
461    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0,
462    "Lifetime of dynamic states for TCP ACK.");
463SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime,
464    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0,
465    "Lifetime of dynamic states for TCP SYN.");
466SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime,
467    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0,
468    "Lifetime of dynamic states for TCP FIN.");
469SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime,
470    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0,
471    "Lifetime of dynamic states for TCP RST.");
472SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime,
473    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0,
474    "Lifetime of dynamic states for UDP.");
475SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime,
476    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0,
477    "Lifetime of dynamic states for other situations.");
478SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_keepalive,
479    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0,
480    "Enable keepalives for dynamic states.");
481SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_keep_states,
482    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_keep_states), 0,
483    "Do not flush dynamic states on rule deletion");
484
485
486#ifdef IPFIREWALL_DYNDEBUG
487#define	DYN_DEBUG(fmt, ...)	do {			\
488	printf("%s: " fmt "\n", __func__, __VA_ARGS__);	\
489} while (0)
490#else
491#define	DYN_DEBUG(fmt, ...)
492#endif /* !IPFIREWALL_DYNDEBUG */
493
494#ifdef INET6
495/* Functions to work with IPv6 states */
496static struct dyn_ipv6_state *dyn_lookup_ipv6_state(
497    const struct ipfw_flow_id *, uint32_t, const void *,
498    struct ipfw_dyn_info *, int);
499static int dyn_lookup_ipv6_state_locked(const struct ipfw_flow_id *,
500    uint32_t, const void *, int, uint32_t, uint16_t);
501static struct dyn_ipv6_state *dyn_alloc_ipv6_state(
502    const struct ipfw_flow_id *, uint32_t, uint16_t, uint8_t);
503static int dyn_add_ipv6_state(void *, uint32_t, uint16_t, uint8_t,
504    const struct ipfw_flow_id *, uint32_t, const void *, int, uint32_t,
505    struct ipfw_dyn_info *, uint16_t, uint16_t, uint8_t);
506static void dyn_export_ipv6_state(const struct dyn_ipv6_state *,
507    ipfw_dyn_rule *);
508
509static uint32_t dyn_getscopeid(const struct ip_fw_args *);
510static void dyn_make_keepalive_ipv6(struct mbuf *, const struct in6_addr *,
511    const struct in6_addr *, uint32_t, uint32_t, uint32_t, uint16_t,
512    uint16_t);
513static void dyn_enqueue_keepalive_ipv6(struct mbufq *,
514    const struct dyn_ipv6_state *);
515static void dyn_send_keepalive_ipv6(struct ip_fw_chain *);
516
517static struct dyn_ipv6_state *dyn_lookup_ipv6_parent(
518    const struct ipfw_flow_id *, uint32_t, const void *, uint32_t, uint16_t,
519    uint32_t);
520static struct dyn_ipv6_state *dyn_lookup_ipv6_parent_locked(
521    const struct ipfw_flow_id *, uint32_t, const void *, uint32_t, uint16_t,
522    uint32_t);
523static struct dyn_ipv6_state *dyn_add_ipv6_parent(void *, uint32_t, uint16_t,
524    uint8_t, const struct ipfw_flow_id *, uint32_t, uint32_t, uint32_t,
525    uint16_t);
526#endif /* INET6 */
527
528/* Functions to work with limit states */
529static void *dyn_get_parent_state(const struct ipfw_flow_id *, uint32_t,
530    struct ip_fw *, uint32_t, uint32_t, uint16_t);
531static struct dyn_ipv4_state *dyn_lookup_ipv4_parent(
532    const struct ipfw_flow_id *, const void *, uint32_t, uint16_t, uint32_t);
533static struct dyn_ipv4_state *dyn_lookup_ipv4_parent_locked(
534    const struct ipfw_flow_id *, const void *, uint32_t, uint16_t, uint32_t);
535static struct dyn_parent *dyn_alloc_parent(void *, uint32_t, uint16_t,
536    uint8_t, uint32_t);
537static struct dyn_ipv4_state *dyn_add_ipv4_parent(void *, uint32_t, uint16_t,
538    uint8_t, const struct ipfw_flow_id *, uint32_t, uint32_t, uint16_t);
539
540static void dyn_tick(void *);
541static void dyn_expire_states(struct ip_fw_chain *, ipfw_range_tlv *);
542static void dyn_free_states(struct ip_fw_chain *);
543static void dyn_export_parent(const struct dyn_parent *, uint16_t,
544    ipfw_dyn_rule *);
545static void dyn_export_data(const struct dyn_data *, uint16_t, uint8_t,
546    ipfw_dyn_rule *);
547static uint32_t dyn_update_tcp_state(struct dyn_data *,
548    const struct ipfw_flow_id *, const struct tcphdr *, int);
549static void dyn_update_proto_state(struct dyn_data *,
550    const struct ipfw_flow_id *, const void *, int, int);
551
552/* Functions to work with IPv4 states */
553struct dyn_ipv4_state *dyn_lookup_ipv4_state(const struct ipfw_flow_id *,
554    const void *, struct ipfw_dyn_info *, int);
555static int dyn_lookup_ipv4_state_locked(const struct ipfw_flow_id *,
556    const void *, int, uint32_t, uint16_t);
557static struct dyn_ipv4_state *dyn_alloc_ipv4_state(
558    const struct ipfw_flow_id *, uint16_t, uint8_t);
559static int dyn_add_ipv4_state(void *, uint32_t, uint16_t, uint8_t,
560    const struct ipfw_flow_id *, const void *, int, uint32_t,
561    struct ipfw_dyn_info *, uint16_t, uint16_t, uint8_t);
562static void dyn_export_ipv4_state(const struct dyn_ipv4_state *,
563    ipfw_dyn_rule *);
564
565/*
566 * Named states support.
567 */
568static char *default_state_name = "default";
569struct dyn_state_obj {
570	struct named_object	no;
571	char			name[64];
572};
573
574#define	DYN_STATE_OBJ(ch, cmd)	\
575    ((struct dyn_state_obj *)SRV_OBJECT(ch, (cmd)->arg1))
576/*
577 * Classifier callback.
578 * Return 0 if opcode contains object that should be referenced
579 * or rewritten.
580 */
581static int
582dyn_classify(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
583{
584
585	DYN_DEBUG("opcode %d, arg1 %d", cmd->opcode, cmd->arg1);
586	/* Don't rewrite "check-state any" */
587	if (cmd->arg1 == 0 &&
588	    cmd->opcode == O_CHECK_STATE)
589		return (1);
590
591	*puidx = cmd->arg1;
592	*ptype = 0;
593	return (0);
594}
595
596static void
597dyn_update(ipfw_insn *cmd, uint16_t idx)
598{
599
600	cmd->arg1 = idx;
601	DYN_DEBUG("opcode %d, arg1 %d", cmd->opcode, cmd->arg1);
602}
603
604static int
605dyn_findbyname(struct ip_fw_chain *ch, struct tid_info *ti,
606    struct named_object **pno)
607{
608	ipfw_obj_ntlv *ntlv;
609	const char *name;
610
611	DYN_DEBUG("uidx %d", ti->uidx);
612	if (ti->uidx != 0) {
613		if (ti->tlvs == NULL)
614			return (EINVAL);
615		/* Search ntlv in the buffer provided by user */
616		ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx,
617		    IPFW_TLV_STATE_NAME);
618		if (ntlv == NULL)
619			return (EINVAL);
620		name = ntlv->name;
621	} else
622		name = default_state_name;
623	/*
624	 * Search for the named object with the corresponding name.
625	 * Since state objects are global, ignore the set value
626	 * and use zero instead.
627	 */
628	*pno = ipfw_objhash_lookup_name_type(CHAIN_TO_SRV(ch), 0,
629	    IPFW_TLV_STATE_NAME, name);
630	/*
631	 * We always return success here.
632	 * The caller will check *pno and mark the object as unresolved,
633	 * then it will automatically create the "default" object.
634	 */
635	return (0);
636}
637
638static struct named_object *
639dyn_findbykidx(struct ip_fw_chain *ch, uint16_t idx)
640{
641
642	DYN_DEBUG("kidx %d", idx);
643	return (ipfw_objhash_lookup_kidx(CHAIN_TO_SRV(ch), idx));
644}
645
646static int
647dyn_create(struct ip_fw_chain *ch, struct tid_info *ti,
648    uint16_t *pkidx)
649{
650	struct namedobj_instance *ni;
651	struct dyn_state_obj *obj;
652	struct named_object *no;
653	ipfw_obj_ntlv *ntlv;
654	char *name;
655
656	DYN_DEBUG("uidx %d", ti->uidx);
657	if (ti->uidx != 0) {
658		if (ti->tlvs == NULL)
659			return (EINVAL);
660		ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx,
661		    IPFW_TLV_STATE_NAME);
662		if (ntlv == NULL)
663			return (EINVAL);
664		name = ntlv->name;
665	} else
666		name = default_state_name;
667
668	ni = CHAIN_TO_SRV(ch);
669	obj = malloc(sizeof(*obj), M_IPFW, M_WAITOK | M_ZERO);
670	obj->no.name = obj->name;
671	obj->no.etlv = IPFW_TLV_STATE_NAME;
672	strlcpy(obj->name, name, sizeof(obj->name));
673
674	IPFW_UH_WLOCK(ch);
675	no = ipfw_objhash_lookup_name_type(ni, 0,
676	    IPFW_TLV_STATE_NAME, name);
677	if (no != NULL) {
678		/*
679		 * Object is already created.
680		 * Just return its kidx and bump refcount.
681		 */
682		*pkidx = no->kidx;
683		no->refcnt++;
684		IPFW_UH_WUNLOCK(ch);
685		free(obj, M_IPFW);
686		DYN_DEBUG("\tfound kidx %d", *pkidx);
687		return (0);
688	}
689	if (ipfw_objhash_alloc_idx(ni, &obj->no.kidx) != 0) {
690		DYN_DEBUG("\talloc_idx failed for %s", name);
691		IPFW_UH_WUNLOCK(ch);
692		free(obj, M_IPFW);
693		return (ENOSPC);
694	}
695	ipfw_objhash_add(ni, &obj->no);
696	SRV_OBJECT(ch, obj->no.kidx) = obj;
697	obj->no.refcnt++;
698	*pkidx = obj->no.kidx;
699	IPFW_UH_WUNLOCK(ch);
700	DYN_DEBUG("\tcreated kidx %d", *pkidx);
701	return (0);
702}
703
704static void
705dyn_destroy(struct ip_fw_chain *ch, struct named_object *no)
706{
707	struct dyn_state_obj *obj;
708
709	IPFW_UH_WLOCK_ASSERT(ch);
710
711	KASSERT(no->refcnt == 1,
712	    ("Destroying object '%s' (type %u, idx %u) with refcnt %u",
713	    no->name, no->etlv, no->kidx, no->refcnt));
714	DYN_DEBUG("kidx %d", no->kidx);
715	obj = SRV_OBJECT(ch, no->kidx);
716	SRV_OBJECT(ch, no->kidx) = NULL;
717	ipfw_objhash_del(CHAIN_TO_SRV(ch), no);
718	ipfw_objhash_free_idx(CHAIN_TO_SRV(ch), no->kidx);
719
720	free(obj, M_IPFW);
721}
722
723static struct opcode_obj_rewrite dyn_opcodes[] = {
724	{
725		O_KEEP_STATE, IPFW_TLV_STATE_NAME,
726		dyn_classify, dyn_update,
727		dyn_findbyname, dyn_findbykidx,
728		dyn_create, dyn_destroy
729	},
730	{
731		O_CHECK_STATE, IPFW_TLV_STATE_NAME,
732		dyn_classify, dyn_update,
733		dyn_findbyname, dyn_findbykidx,
734		dyn_create, dyn_destroy
735	},
736	{
737		O_PROBE_STATE, IPFW_TLV_STATE_NAME,
738		dyn_classify, dyn_update,
739		dyn_findbyname, dyn_findbykidx,
740		dyn_create, dyn_destroy
741	},
742	{
743		O_LIMIT, IPFW_TLV_STATE_NAME,
744		dyn_classify, dyn_update,
745		dyn_findbyname, dyn_findbykidx,
746		dyn_create, dyn_destroy
747	},
748};
749
750/*
751 * IMPORTANT: the hash function for dynamic rules must be commutative
752 * in source and destination (ip,port), because rules are bidirectional
753 * and we want to find both in the same bucket.
754 */
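/*
 * With IPFIREWALL_JENKINSHASH defined (the default) this property is
 * achieved by ordering the (address, port) pairs canonically before
 * hashing (see addrcmp4()/addrcmp6() below), so the same flow hashes
 * identically in both directions.
 */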
755#ifndef IPFIREWALL_JENKINSHASH
756static __inline uint32_t
757hash_packet(const struct ipfw_flow_id *id)
758{
759	uint32_t i;
760
761#ifdef INET6
762	if (IS_IP6_FLOW_ID(id))
763		i = ntohl((id->dst_ip6.__u6_addr.__u6_addr32[2]) ^
764		    (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^
765		    (id->src_ip6.__u6_addr.__u6_addr32[2]) ^
766		    (id->src_ip6.__u6_addr.__u6_addr32[3]));
767	else
768#endif /* INET6 */
769	i = (id->dst_ip) ^ (id->src_ip);
770	i ^= (id->dst_port) ^ (id->src_port);
771	return (i);
772}
773
774static __inline uint32_t
775hash_parent(const struct ipfw_flow_id *id, const void *rule)
776{
777
778	return (hash_packet(id) ^ ((uintptr_t)rule));
779}
780
781#else /* IPFIREWALL_JENKINSHASH */
782
783static VNET_DEFINE(uint32_t, dyn_hashseed);
784#define	V_dyn_hashseed		VNET(dyn_hashseed)
785
786static __inline int
787addrcmp4(const struct ipfw_flow_id *id)
788{
789
790	if (id->src_ip < id->dst_ip)
791		return (0);
792	if (id->src_ip > id->dst_ip)
793		return (1);
794	if (id->src_port <= id->dst_port)
795		return (0);
796	return (1);
797}
798
799#ifdef INET6
800static __inline int
801addrcmp6(const struct ipfw_flow_id *id)
802{
803	int ret;
804
805	ret = memcmp(&id->src_ip6, &id->dst_ip6, sizeof(struct in6_addr));
806	if (ret < 0)
807		return (0);
808	if (ret > 0)
809		return (1);
810	if (id->src_port <= id->dst_port)
811		return (0);
812	return (1);
813}
814
815static __inline uint32_t
816hash_packet6(const struct ipfw_flow_id *id)
817{
818	struct tuple6 {
819		struct in6_addr	addr[2];
820		uint16_t	port[2];
821	} t6;
822
823	if (addrcmp6(id) == 0) {
824		t6.addr[0] = id->src_ip6;
825		t6.addr[1] = id->dst_ip6;
826		t6.port[0] = id->src_port;
827		t6.port[1] = id->dst_port;
828	} else {
829		t6.addr[0] = id->dst_ip6;
830		t6.addr[1] = id->src_ip6;
831		t6.port[0] = id->dst_port;
832		t6.port[1] = id->src_port;
833	}
834	return (jenkins_hash32((const uint32_t *)&t6,
835	    sizeof(t6) / sizeof(uint32_t), V_dyn_hashseed));
836}
837#endif
838
839static __inline uint32_t
840hash_packet(const struct ipfw_flow_id *id)
841{
842	struct tuple4 {
843		in_addr_t	addr[2];
844		uint16_t	port[2];
845	} t4;
846
847	if (IS_IP4_FLOW_ID(id)) {
848		/* All fields are in host byte order */
849		if (addrcmp4(id) == 0) {
850			t4.addr[0] = id->src_ip;
851			t4.addr[1] = id->dst_ip;
852			t4.port[0] = id->src_port;
853			t4.port[1] = id->dst_port;
854		} else {
855			t4.addr[0] = id->dst_ip;
856			t4.addr[1] = id->src_ip;
857			t4.port[0] = id->dst_port;
858			t4.port[1] = id->src_port;
859		}
860		return (jenkins_hash32((const uint32_t *)&t4,
861		    sizeof(t4) / sizeof(uint32_t), V_dyn_hashseed));
862	} else
863#ifdef INET6
864	if (IS_IP6_FLOW_ID(id))
865		return (hash_packet6(id));
866#endif
867	return (0);
868}
869
870static __inline uint32_t
871hash_parent(const struct ipfw_flow_id *id, const void *rule)
872{
873
874	return (jenkins_hash32((const uint32_t *)&rule,
875	    sizeof(rule) / sizeof(uint32_t), hash_packet(id)));
876}
877#endif /* IPFIREWALL_JENKINSHASH */
878
879/*
880 * Print a customizable flow id description via the log(9) facility.
881 */
882static void
883print_dyn_rule_flags(const struct ipfw_flow_id *id, int dyn_type,
884    int log_flags, char *prefix, char *postfix)
885{
886	struct in_addr da;
887#ifdef INET6
888	char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN];
889#else
890	char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
891#endif
892
893#ifdef INET6
894	if (IS_IP6_FLOW_ID(id)) {
895		ip6_sprintf(src, &id->src_ip6);
896		ip6_sprintf(dst, &id->dst_ip6);
897	} else
898#endif
899	{
900		da.s_addr = htonl(id->src_ip);
901		inet_ntop(AF_INET, &da, src, sizeof(src));
902		da.s_addr = htonl(id->dst_ip);
903		inet_ntop(AF_INET, &da, dst, sizeof(dst));
904	}
905	log(log_flags, "ipfw: %s type %d %s %d -> %s %d, %d %s\n",
906	    prefix, dyn_type, src, id->src_port, dst,
907	    id->dst_port, V_dyn_count, postfix);
908}
909
910#define	print_dyn_rule(id, dtype, prefix, postfix)	\
911	print_dyn_rule_flags(id, dtype, LOG_DEBUG, prefix, postfix)
912
913#define	TIME_LEQ(a,b)	((int)((a)-(b)) <= 0)
914#define	TIME_LE(a,b)	((int)((a)-(b)) < 0)
915#define	_SEQ_GE(a,b)	((int)((a)-(b)) >= 0)
916#define	BOTH_SYN	(TH_SYN | (TH_SYN << 8))
917#define	BOTH_FIN	(TH_FIN | (TH_FIN << 8))
918#define	TCP_FLAGS	(TH_FLAGS | (TH_FLAGS << 8))
919#define	ACK_FWD		0x00010000	/* fwd ack seen */
920#define	ACK_REV		0x00020000	/* rev ack seen */
921#define	ACK_BOTH	(ACK_FWD | ACK_REV)
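
/*
 * The TCP state word packs two sets of TH_* flags: bits seen in the
 * forward direction occupy the low byte, reverse-direction bits are
 * shifted left by 8; e.g. BOTH_SYN means a SYN was seen in both
 * directions.  ACK_FWD/ACK_REV occupy separate bits above them.
 * TIME_LEQ()/TIME_LE() compare via signed subtraction, so they remain
 * correct across counter wraparound.
 */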
922
923static uint32_t
924dyn_update_tcp_state(struct dyn_data *data, const struct ipfw_flow_id *pkt,
925    const struct tcphdr *tcp, int dir)
926{
927	uint32_t ack, expire;
928	uint32_t state, old;
929	uint8_t th_flags;
930
931	expire = data->expire;
932	old = state = data->state;
933	th_flags = pkt->_flags & (TH_FIN | TH_SYN | TH_RST);
934	state |= (dir == MATCH_FORWARD) ? th_flags: (th_flags << 8);
935	switch (state & TCP_FLAGS) {
936	case TH_SYN:			/* opening */
937		expire = time_uptime + V_dyn_syn_lifetime;
938		break;
939
940	case BOTH_SYN:			/* move to established */
941	case BOTH_SYN | TH_FIN:		/* one side tries to close */
942	case BOTH_SYN | (TH_FIN << 8):
943		if (tcp == NULL)
944			break;
945		ack = ntohl(tcp->th_ack);
946		if (dir == MATCH_FORWARD) {
947			if (data->ack_fwd == 0 ||
948			    _SEQ_GE(ack, data->ack_fwd)) {
949				state |= ACK_FWD;
950				if (data->ack_fwd != ack)
951					ck_pr_store_32(&data->ack_fwd, ack);
952			}
953		} else {
954			if (data->ack_rev == 0 ||
955			    _SEQ_GE(ack, data->ack_rev)) {
956				state |= ACK_REV;
957				if (data->ack_rev != ack)
958					ck_pr_store_32(&data->ack_rev, ack);
959			}
960		}
961		if ((state & ACK_BOTH) == ACK_BOTH) {
962			/*
963			 * Set expire time to V_dyn_ack_lifetime only if
964			 * we got ACKs for both directions.
965			 * We use XOR here to avoid possible state
966			 * overwriting in concurrent thread.
967			 */
968			expire = time_uptime + V_dyn_ack_lifetime;
969			ck_pr_xor_32(&data->state, ACK_BOTH);
970		} else if ((data->state & ACK_BOTH) != (state & ACK_BOTH))
971			ck_pr_or_32(&data->state, state & ACK_BOTH);
972		break;
973
974	case BOTH_SYN | BOTH_FIN:	/* both sides closed */
975		if (V_dyn_fin_lifetime >= V_dyn_keepalive_period)
976			V_dyn_fin_lifetime = V_dyn_keepalive_period - 1;
977		expire = time_uptime + V_dyn_fin_lifetime;
978		break;
979
980	default:
981		if (V_dyn_keepalive != 0 &&
982		    V_dyn_rst_lifetime >= V_dyn_keepalive_period)
983			V_dyn_rst_lifetime = V_dyn_keepalive_period - 1;
984		expire = time_uptime + V_dyn_rst_lifetime;
985	}
986	/* Save TCP state if it was changed */
987	if ((state & TCP_FLAGS) != (old & TCP_FLAGS))
988		ck_pr_or_32(&data->state, state & TCP_FLAGS);
989	return (expire);
990}
991
992/*
993 * Update ULP-specific state.
994 * For TCP we keep sequence numbers and flags. For other protocols
995 * we currently update only the expire time. Packet and byte counters
996 * are also updated here.
997 */
998static void
999dyn_update_proto_state(struct dyn_data *data, const struct ipfw_flow_id *pkt,
1000    const void *ulp, int pktlen, int dir)
1001{
1002	uint32_t expire;
1003
1004	/* NOTE: we are in critical section here. */
1005	switch (pkt->proto) {
1006	case IPPROTO_UDP:
1007	case IPPROTO_UDPLITE:
1008		expire = time_uptime + V_dyn_udp_lifetime;
1009		break;
1010	case IPPROTO_TCP:
1011		expire = dyn_update_tcp_state(data, pkt, ulp, dir);
1012		break;
1013	default:
1014		expire = time_uptime + V_dyn_short_lifetime;
1015	}
1016	/*
1017	 * The expiration timer has per-second granularity, so there is no
1018	 * need to update it every time the state is matched.
1019	 */
1020	if (data->expire != expire)
1021		ck_pr_store_32(&data->expire, expire);
1022
1023	if (dir == MATCH_FORWARD)
1024		DYN_COUNTER_INC(data, fwd, pktlen);
1025	else
1026		DYN_COUNTER_INC(data, rev, pktlen);
1027}
1028
1029/*
1030 * Lookup IPv4 state.
1031 * Must be called in a critical section.
1032 */
1033struct dyn_ipv4_state *
1034dyn_lookup_ipv4_state(const struct ipfw_flow_id *pkt, const void *ulp,
1035    struct ipfw_dyn_info *info, int pktlen)
1036{
1037	struct dyn_ipv4_state *s;
1038	uint32_t version, bucket;
1039
1040	bucket = DYN_BUCKET(info->hashval, V_curr_dyn_buckets);
1041	info->version = DYN_BUCKET_VERSION(bucket, ipv4_add);
1042restart:
1043	version = DYN_BUCKET_VERSION(bucket, ipv4_del);
1044	CK_SLIST_FOREACH(s, &V_dyn_ipv4[bucket], entry) {
1045		DYNSTATE_PROTECT(s);
1046		if (version != DYN_BUCKET_VERSION(bucket, ipv4_del))
1047			goto restart;
1048		if (s->proto != pkt->proto)
1049			continue;
1050		if (info->kidx != 0 && s->kidx != info->kidx)
1051			continue;
1052		if (s->sport == pkt->src_port && s->dport == pkt->dst_port &&
1053		    s->src == pkt->src_ip && s->dst == pkt->dst_ip) {
1054			info->direction = MATCH_FORWARD;
1055			break;
1056		}
1057		if (s->sport == pkt->dst_port && s->dport == pkt->src_port &&
1058		    s->src == pkt->dst_ip && s->dst == pkt->src_ip) {
1059			info->direction = MATCH_REVERSE;
1060			break;
1061		}
1062	}
1063
1064	if (s != NULL)
1065		dyn_update_proto_state(s->data, pkt, ulp, pktlen,
1066		    info->direction);
1067	return (s);
1068}
1069
1070/*
1071 * Lookup IPv4 state.
1072 * A simplified version used to check that a matching state doesn't exist.
1073 */
1074static int
1075dyn_lookup_ipv4_state_locked(const struct ipfw_flow_id *pkt,
1076    const void *ulp, int pktlen, uint32_t bucket, uint16_t kidx)
1077{
1078	struct dyn_ipv4_state *s;
1079	int dir;
1080
1081	dir = MATCH_NONE;
1082	DYN_BUCKET_ASSERT(bucket);
1083	CK_SLIST_FOREACH(s, &V_dyn_ipv4[bucket], entry) {
1084		if (s->proto != pkt->proto ||
1085		    s->kidx != kidx)
1086			continue;
1087		if (s->sport == pkt->src_port &&
1088		    s->dport == pkt->dst_port &&
1089		    s->src == pkt->src_ip && s->dst == pkt->dst_ip) {
1090			dir = MATCH_FORWARD;
1091			break;
1092		}
1093		if (s->sport == pkt->dst_port && s->dport == pkt->src_port &&
1094		    s->src == pkt->dst_ip && s->dst == pkt->src_ip) {
1095			dir = MATCH_REVERSE;
1096			break;
1097		}
1098	}
1099	if (s != NULL)
1100		dyn_update_proto_state(s->data, pkt, ulp, pktlen, dir);
1101	return (s != NULL);
1102}
1103
1104static struct dyn_ipv4_state *
1105dyn_lookup_ipv4_parent(const struct ipfw_flow_id *pkt, const void *rule,
1106    uint32_t ruleid, uint16_t rulenum, uint32_t hashval)
1107{
1108	struct dyn_ipv4_state *s;
1109	uint32_t version, bucket;
1110
1111	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
1112restart:
1113	version = DYN_BUCKET_VERSION(bucket, ipv4_parent_del);
1114	CK_SLIST_FOREACH(s, &V_dyn_ipv4_parent[bucket], entry) {
1115		DYNSTATE_PROTECT(s);
1116		if (version != DYN_BUCKET_VERSION(bucket, ipv4_parent_del))
1117			goto restart;
1118		/*
1119		 * NOTE: we do not need to check kidx, because a parent rule
1120		 * cannot create states with different kidx.
1121		 * Also, a parent state is always created for the forward direction.
1122		 */
1123		if (s->limit->parent == rule &&
1124		    s->limit->ruleid == ruleid &&
1125		    s->limit->rulenum == rulenum &&
1126		    s->proto == pkt->proto &&
1127		    s->sport == pkt->src_port &&
1128		    s->dport == pkt->dst_port &&
1129		    s->src == pkt->src_ip && s->dst == pkt->dst_ip) {
1130			if (s->limit->expire != time_uptime +
1131			    V_dyn_short_lifetime)
1132				ck_pr_store_32(&s->limit->expire,
1133				    time_uptime + V_dyn_short_lifetime);
1134			break;
1135		}
1136	}
1137	return (s);
1138}
1139
1140static struct dyn_ipv4_state *
1141dyn_lookup_ipv4_parent_locked(const struct ipfw_flow_id *pkt,
1142    const void *rule, uint32_t ruleid, uint16_t rulenum, uint32_t bucket)
1143{
1144	struct dyn_ipv4_state *s;
1145
1146	DYN_BUCKET_ASSERT(bucket);
1147	CK_SLIST_FOREACH(s, &V_dyn_ipv4_parent[bucket], entry) {
1148		if (s->limit->parent == rule &&
1149		    s->limit->ruleid == ruleid &&
1150		    s->limit->rulenum == rulenum &&
1151		    s->proto == pkt->proto &&
1152		    s->sport == pkt->src_port &&
1153		    s->dport == pkt->dst_port &&
1154		    s->src == pkt->src_ip && s->dst == pkt->dst_ip)
1155			break;
1156	}
1157	return (s);
1158}
1159
1160
1161#ifdef INET6
1162static uint32_t
1163dyn_getscopeid(const struct ip_fw_args *args)
1164{
1165
1166	/*
1167	 * If the source or destination address is a scoped address, we need
1168	 * to determine the scope zone id to resolve address scope ambiguity.
1169	 */
1170	if (IN6_IS_ADDR_LINKLOCAL(&args->f_id.src_ip6) ||
1171	    IN6_IS_ADDR_LINKLOCAL(&args->f_id.dst_ip6)) {
1172		MPASS(args->oif != NULL ||
1173		    args->m->m_pkthdr.rcvif != NULL);
1174		return (in6_getscopezone(args->oif != NULL ? args->oif:
1175		    args->m->m_pkthdr.rcvif, IPV6_ADDR_SCOPE_LINKLOCAL));
1176	}
1177	return (0);
1178}
1179
1180/*
1181 * Lookup IPv6 state.
1182 * Must be called in critical section.
1183 */
1184static struct dyn_ipv6_state *
1185dyn_lookup_ipv6_state(const struct ipfw_flow_id *pkt, uint32_t zoneid,
1186    const void *ulp, struct ipfw_dyn_info *info, int pktlen)
1187{
1188	struct dyn_ipv6_state *s;
1189	uint32_t version, bucket;
1190
1191	bucket = DYN_BUCKET(info->hashval, V_curr_dyn_buckets);
1192	info->version = DYN_BUCKET_VERSION(bucket, ipv6_add);
1193restart:
1194	version = DYN_BUCKET_VERSION(bucket, ipv6_del);
1195	CK_SLIST_FOREACH(s, &V_dyn_ipv6[bucket], entry) {
1196		DYNSTATE_PROTECT(s);
1197		if (version != DYN_BUCKET_VERSION(bucket, ipv6_del))
1198			goto restart;
1199		if (s->proto != pkt->proto || s->zoneid != zoneid)
1200			continue;
1201		if (info->kidx != 0 && s->kidx != info->kidx)
1202			continue;
1203		if (s->sport == pkt->src_port && s->dport == pkt->dst_port &&
1204		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) &&
1205		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6)) {
1206			info->direction = MATCH_FORWARD;
1207			break;
1208		}
1209		if (s->sport == pkt->dst_port && s->dport == pkt->src_port &&
1210		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->dst_ip6) &&
1211		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->src_ip6)) {
1212			info->direction = MATCH_REVERSE;
1213			break;
1214		}
1215	}
1216	if (s != NULL)
1217		dyn_update_proto_state(s->data, pkt, ulp, pktlen,
1218		    info->direction);
1219	return (s);
1220}
1221
1222/*
1223 * Lookup IPv6 state.
1224 * A simplified version used to check that a matching state doesn't exist.
1225 */
1226static int
1227dyn_lookup_ipv6_state_locked(const struct ipfw_flow_id *pkt, uint32_t zoneid,
1228    const void *ulp, int pktlen, uint32_t bucket, uint16_t kidx)
1229{
1230	struct dyn_ipv6_state *s;
1231	int dir;
1232
1233	dir = MATCH_NONE;
1234	DYN_BUCKET_ASSERT(bucket);
1235	CK_SLIST_FOREACH(s, &V_dyn_ipv6[bucket], entry) {
1236		if (s->proto != pkt->proto || s->kidx != kidx ||
1237		    s->zoneid != zoneid)
1238			continue;
1239		if (s->sport == pkt->src_port && s->dport == pkt->dst_port &&
1240		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) &&
1241		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6)) {
1242			dir = MATCH_FORWARD;
1243			break;
1244		}
1245		if (s->sport == pkt->dst_port && s->dport == pkt->src_port &&
1246		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->dst_ip6) &&
1247		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->src_ip6)) {
1248			dir = MATCH_REVERSE;
1249			break;
1250		}
1251	}
1252	if (s != NULL)
1253		dyn_update_proto_state(s->data, pkt, ulp, pktlen, dir);
1254	return (s != NULL);
1255}
1256
1257static struct dyn_ipv6_state *
1258dyn_lookup_ipv6_parent(const struct ipfw_flow_id *pkt, uint32_t zoneid,
1259    const void *rule, uint32_t ruleid, uint16_t rulenum, uint32_t hashval)
1260{
1261	struct dyn_ipv6_state *s;
1262	uint32_t version, bucket;
1263
1264	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
1265restart:
1266	version = DYN_BUCKET_VERSION(bucket, ipv6_parent_del);
1267	CK_SLIST_FOREACH(s, &V_dyn_ipv6_parent[bucket], entry) {
1268		DYNSTATE_PROTECT(s);
1269		if (version != DYN_BUCKET_VERSION(bucket, ipv6_parent_del))
1270			goto restart;
1271		/*
1272		 * NOTE: we do not need to check kidx, because a parent rule
1273		 * cannot create states with different kidx.
1274		 * Also, a parent state is always created for the forward direction.
1275		 */
1276		if (s->limit->parent == rule &&
1277		    s->limit->ruleid == ruleid &&
1278		    s->limit->rulenum == rulenum &&
1279		    s->proto == pkt->proto &&
1280		    s->sport == pkt->src_port &&
1281		    s->dport == pkt->dst_port && s->zoneid == zoneid &&
1282		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) &&
1283		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6)) {
1284			if (s->limit->expire != time_uptime +
1285			    V_dyn_short_lifetime)
1286				ck_pr_store_32(&s->limit->expire,
1287				    time_uptime + V_dyn_short_lifetime);
1288			break;
1289		}
1290	}
1291	return (s);
1292}
1293
1294static struct dyn_ipv6_state *
1295dyn_lookup_ipv6_parent_locked(const struct ipfw_flow_id *pkt, uint32_t zoneid,
1296    const void *rule, uint32_t ruleid, uint16_t rulenum, uint32_t bucket)
1297{
1298	struct dyn_ipv6_state *s;
1299
1300	DYN_BUCKET_ASSERT(bucket);
1301	CK_SLIST_FOREACH(s, &V_dyn_ipv6_parent[bucket], entry) {
1302		if (s->limit->parent == rule &&
1303		    s->limit->ruleid == ruleid &&
1304		    s->limit->rulenum == rulenum &&
1305		    s->proto == pkt->proto &&
1306		    s->sport == pkt->src_port &&
1307		    s->dport == pkt->dst_port && s->zoneid == zoneid &&
1308		    IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) &&
1309		    IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6))
1310			break;
1311	}
1312	return (s);
1313}
1314
1315#endif /* INET6 */
1316
1317/*
1318 * Lookup dynamic state.
1319 *  pkt - the ipfw_flow_id filled in by ipfw_chk();
1320 *  ulp - the upper level protocol header located by ipfw_chk();
1321 *  dyn_info - info about the matched state, returned to the caller.
1322 * Returns a pointer to the state's parent rule and fills dyn_info.
1323 * If there is no matching state, NULL is returned.
1324 * On match, the state's counters are updated.
1325 */
1326struct ip_fw *
1327ipfw_dyn_lookup_state(const struct ip_fw_args *args, const void *ulp,
1328    int pktlen, const ipfw_insn *cmd, struct ipfw_dyn_info *info)
1329{
1330	struct dyn_data *data;
1331	struct ip_fw *rule;
1332
1333	IPFW_RLOCK_ASSERT(&V_layer3_chain);
1334
1335	data = NULL;
1336	rule = NULL;
1337	info->kidx = cmd->arg1;
1338	info->direction = MATCH_NONE;
1339	info->hashval = hash_packet(&args->f_id);
1340
1341	DYNSTATE_CRITICAL_ENTER();
1342	if (IS_IP4_FLOW_ID(&args->f_id)) {
1343		struct dyn_ipv4_state *s;
1344
1345		s = dyn_lookup_ipv4_state(&args->f_id, ulp, info, pktlen);
1346		if (s != NULL) {
1347			/*
1348			 * Dynamic states are created using the same 5-tuple,
1349			 * so it is assumed that the parent rule for an O_LIMIT
1350			 * state has the same address family.
1351			 */
1352			data = s->data;
1353			if (s->type == O_LIMIT) {
1354				s = data->parent;
1355				rule = s->limit->parent;
1356			} else
1357				rule = data->parent;
1358		}
1359	}
1360#ifdef INET6
1361	else if (IS_IP6_FLOW_ID(&args->f_id)) {
1362		struct dyn_ipv6_state *s;
1363
1364		s = dyn_lookup_ipv6_state(&args->f_id, dyn_getscopeid(args),
1365		    ulp, info, pktlen);
1366		if (s != NULL) {
1367			data = s->data;
1368			if (s->type == O_LIMIT) {
1369				s = data->parent;
1370				rule = s->limit->parent;
1371			} else
1372				rule = data->parent;
1373		}
1374	}
1375#endif
1376	if (data != NULL) {
1377		/*
1378		 * If cached chain id is the same, we can avoid rule index
1379		 * lookup. Otherwise do lookup and update chain_id and f_pos.
1380		 * It is safe even if there is a concurrent thread that wants
1381		 * to update the same state, because chain->id can be changed
1382		 * only under IPFW_WLOCK().
1383		 */
1384		if (data->chain_id != V_layer3_chain.id) {
1385			data->f_pos = ipfw_find_rule(&V_layer3_chain,
1386			    data->rulenum, data->ruleid);
1387			/*
1388			 * Check that the found state has not been orphaned.
1389			 * While chain->id is being changed, the parent
1390			 * rule can be deleted. If the found rule doesn't
1391			 * match the parent pointer, consider this
1392			 * result as MATCH_NONE and return NULL.
1393			 *
1394			 * This will lead to the creation of a new similar state
1395			 * that will be added at the head of this bucket.
1396			 * The state that we have currently matched should
1397			 * then be deleted by dyn_expire_states().
1398			 *
1399			 * When dyn_keep_states is enabled, return a
1400			 * pointer to the default rule and the corresponding f_pos
1401			 * value.
1402			 * XXX: In this case we lose the cache efficiency,
1403			 *      since f_pos is not cached, because it seems
1404			 *      there is no easy way to atomically switch
1405			 *      all fields related to parent rule of given
1406			 *      state.
1407			 */
1408			if (V_layer3_chain.map[data->f_pos] == rule) {
1409				data->chain_id = V_layer3_chain.id;
1410				info->f_pos = data->f_pos;
1411			} else if (V_dyn_keep_states != 0) {
1412				rule = V_layer3_chain.default_rule;
1413				info->f_pos = V_layer3_chain.n_rules - 1;
1414			} else {
1415				DYN_DEBUG("rule %p [%u, %u] is considered "
1416				    "invalid in data %p", rule, data->ruleid,
1417				    data->rulenum, data);
1418				rule = NULL;
1419				info->direction = MATCH_NONE;
1420				/* info->f_pos doesn't matter here. */
1421			}
1422		} else
1423			info->f_pos = data->f_pos;
1424	}
1425	DYNSTATE_CRITICAL_EXIT();
1426#if 0
1427	/*
1428	 * Return MATCH_NONE if the parent rule is in a disabled set.
1429	 * This will lead to the creation of a new similar state that
1430	 * will be added at the head of this bucket.
1431	 *
1432	 * XXXAE: we need to be able to update the state's set when the
1433	 *	  parent rule set is changed.
1434	 */
1435	if (rule != NULL && (V_set_disable & (1 << rule->set))) {
1436		rule = NULL;
1437		info->direction = MATCH_NONE;
1438	}
1439#endif
1440	return (rule);
1441}
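
/*
 * Illustrative caller pattern (simplified; see the O_CHECK_STATE
 * handling in ipfw_chk()):
 *
 *	rule = ipfw_dyn_lookup_state(args, ulp, pktlen, cmd, &info);
 *	if (rule != NULL)
 *		(continue evaluation at rule index info.f_pos)
 */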
1442
1443static struct dyn_parent *
1444dyn_alloc_parent(void *parent, uint32_t ruleid, uint16_t rulenum,
1445    uint8_t set, uint32_t hashval)
1446{
1447	struct dyn_parent *limit;
1448
1449	limit = uma_zalloc(V_dyn_parent_zone, M_NOWAIT | M_ZERO);
1450	if (limit == NULL) {
1451		if (last_log != time_uptime) {
1452			last_log = time_uptime;
1453			log(LOG_DEBUG,
1454			    "ipfw: Cannot allocate parent dynamic state, "
1455			    "consider increasing "
1456			    "net.inet.ip.fw.dyn_parent_max\n");
1457		}
1458		return (NULL);
1459	}
1460
1461	limit->parent = parent;
1462	limit->ruleid = ruleid;
1463	limit->rulenum = rulenum;
1464	limit->set = set;
1465	limit->hashval = hashval;
1466	limit->expire = time_uptime + V_dyn_short_lifetime;
1467	return (limit);
1468}
1469
1470static struct dyn_data *
1471dyn_alloc_dyndata(void *parent, uint32_t ruleid, uint16_t rulenum,
1472    uint8_t set, const struct ipfw_flow_id *pkt, const void *ulp, int pktlen,
1473    uint32_t hashval, uint16_t fibnum)
1474{
1475	struct dyn_data *data;
1476
1477	data = uma_zalloc(V_dyn_data_zone, M_NOWAIT | M_ZERO);
1478	if (data == NULL) {
1479		if (last_log != time_uptime) {
1480			last_log = time_uptime;
1481			log(LOG_DEBUG,
1482			    "ipfw: Cannot allocate dynamic state, "
1483			    "consider increasing net.inet.ip.fw.dyn_max\n");
1484		}
1485		return (NULL);
1486	}
1487
1488	data->parent = parent;
1489	data->ruleid = ruleid;
1490	data->rulenum = rulenum;
1491	data->set = set;
1492	data->fibnum = fibnum;
1493	data->hashval = hashval;
1494	data->expire = time_uptime + V_dyn_syn_lifetime;
1495	dyn_update_proto_state(data, pkt, ulp, pktlen, MATCH_FORWARD);
1496	return (data);
1497}
1498
1499static struct dyn_ipv4_state *
1500dyn_alloc_ipv4_state(const struct ipfw_flow_id *pkt, uint16_t kidx,
1501    uint8_t type)
1502{
1503	struct dyn_ipv4_state *s;
1504
1505	s = uma_zalloc(V_dyn_ipv4_zone, M_NOWAIT | M_ZERO);
1506	if (s == NULL)
1507		return (NULL);
1508
1509	s->type = type;
1510	s->kidx = kidx;
1511	s->proto = pkt->proto;
1512	s->sport = pkt->src_port;
1513	s->dport = pkt->dst_port;
1514	s->src = pkt->src_ip;
1515	s->dst = pkt->dst_ip;
1516	return (s);
1517}
1518
1519/*
1520 * Add IPv4 parent state.
1521 * Returns a pointer to the parent state. When it is not NULL we are in
1522 * a critical section and the pointer is protected by a hazard pointer.
1523 * When some error occurs, it returns NULL and exiting the critical
1524 * section is not needed.
1525 */
1526static struct dyn_ipv4_state *
1527dyn_add_ipv4_parent(void *rule, uint32_t ruleid, uint16_t rulenum,
1528    uint8_t set, const struct ipfw_flow_id *pkt, uint32_t hashval,
1529    uint32_t version, uint16_t kidx)
1530{
1531	struct dyn_ipv4_state *s;
1532	struct dyn_parent *limit;
1533	uint32_t bucket;
1534
1535	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
1536	DYN_BUCKET_LOCK(bucket);
1537	if (version != DYN_BUCKET_VERSION(bucket, ipv4_parent_add)) {
1538		/*
1539		 * Bucket version has been changed since last lookup,
1540		 * do lookup again to be sure that state does not exist.
1541		 */
1542		s = dyn_lookup_ipv4_parent_locked(pkt, rule, ruleid,
1543		    rulenum, bucket);
1544		if (s != NULL) {
1545			/*
1546			 * A concurrent thread has already created this
1547			 * state. Just return it.
1548			 */
1549			DYNSTATE_CRITICAL_ENTER();
1550			DYNSTATE_PROTECT(s);
1551			DYN_BUCKET_UNLOCK(bucket);
1552			return (s);
1553		}
1554	}
1555
1556	limit = dyn_alloc_parent(rule, ruleid, rulenum, set, hashval);
1557	if (limit == NULL) {
1558		DYN_BUCKET_UNLOCK(bucket);
1559		return (NULL);
1560	}
1561
1562	s = dyn_alloc_ipv4_state(pkt, kidx, O_LIMIT_PARENT);
1563	if (s == NULL) {
1564		DYN_BUCKET_UNLOCK(bucket);
1565		uma_zfree(V_dyn_parent_zone, limit);
1566		return (NULL);
1567	}
1568
1569	s->limit = limit;
1570	CK_SLIST_INSERT_HEAD(&V_dyn_ipv4_parent[bucket], s, entry);
1571	DYN_COUNT_INC(dyn_parent_count);
1572	DYN_BUCKET_VERSION_BUMP(bucket, ipv4_parent_add);
1573	DYNSTATE_CRITICAL_ENTER();
1574	DYNSTATE_PROTECT(s);
1575	DYN_BUCKET_UNLOCK(bucket);
1576	return (s);
1577}
1578
1579static int
1580dyn_add_ipv4_state(void *parent, uint32_t ruleid, uint16_t rulenum,
1581    uint8_t set, const struct ipfw_flow_id *pkt, const void *ulp, int pktlen,
1582    uint32_t hashval, struct ipfw_dyn_info *info, uint16_t fibnum,
1583    uint16_t kidx, uint8_t type)
1584{
1585	struct dyn_ipv4_state *s;
1586	void *data;
1587	uint32_t bucket;
1588
1589	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
1590	DYN_BUCKET_LOCK(bucket);
1591	if (info->direction == MATCH_UNKNOWN ||
1592	    info->kidx != kidx ||
1593	    info->hashval != hashval ||
1594	    info->version != DYN_BUCKET_VERSION(bucket, ipv4_add)) {
1595		/*
1596		 * The bucket version has changed since the last lookup,
1597		 * so do the lookup again to be sure that the state does not exist.
1598		 */
1599		if (dyn_lookup_ipv4_state_locked(pkt, ulp, pktlen,
1600		    bucket, kidx) != 0) {
1601			DYN_BUCKET_UNLOCK(bucket);
1602			return (EEXIST);
1603		}
1604	}
1605
1606	data = dyn_alloc_dyndata(parent, ruleid, rulenum, set, pkt, ulp,
1607	    pktlen, hashval, fibnum);
1608	if (data == NULL) {
1609		DYN_BUCKET_UNLOCK(bucket);
1610		return (ENOMEM);
1611	}
1612
1613	s = dyn_alloc_ipv4_state(pkt, kidx, type);
1614	if (s == NULL) {
1615		DYN_BUCKET_UNLOCK(bucket);
1616		uma_zfree(V_dyn_data_zone, data);
1617		return (ENOMEM);
1618	}
1619
1620	s->data = data;
1621	CK_SLIST_INSERT_HEAD(&V_dyn_ipv4[bucket], s, entry);
1622	DYN_COUNT_INC(dyn_count);
1623	DYN_BUCKET_VERSION_BUMP(bucket, ipv4_add);
1624	DYN_BUCKET_UNLOCK(bucket);
1625	return (0);
1626}
1627
1628#ifdef INET6
1629static struct dyn_ipv6_state *
1630dyn_alloc_ipv6_state(const struct ipfw_flow_id *pkt, uint32_t zoneid,
1631    uint16_t kidx, uint8_t type)
1632{
1633	struct dyn_ipv6_state *s;
1634
1635	s = uma_zalloc(V_dyn_ipv6_zone, M_NOWAIT | M_ZERO);
1636	if (s == NULL)
1637		return (NULL);
1638
1639	s->type = type;
1640	s->kidx = kidx;
1641	s->zoneid = zoneid;
1642	s->proto = pkt->proto;
1643	s->sport = pkt->src_port;
1644	s->dport = pkt->dst_port;
1645	s->src = pkt->src_ip6;
1646	s->dst = pkt->dst_ip6;
1647	return (s);
1648}
1649
1650/*
1651 * Add IPv6 parent state.
1652 * Returns a pointer to the parent state. When it is not NULL we are in
1653 * a critical section and the pointer is protected by a hazard pointer.
1654 * When some error occurs, it returns NULL and exiting the critical
1655 * section is not needed.
1656 */
1657static struct dyn_ipv6_state *
1658dyn_add_ipv6_parent(void *rule, uint32_t ruleid, uint16_t rulenum,
1659    uint8_t set, const struct ipfw_flow_id *pkt, uint32_t zoneid,
1660    uint32_t hashval, uint32_t version, uint16_t kidx)
1661{
1662	struct dyn_ipv6_state *s;
1663	struct dyn_parent *limit;
1664	uint32_t bucket;
1665
1666	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
1667	DYN_BUCKET_LOCK(bucket);
1668	if (version != DYN_BUCKET_VERSION(bucket, ipv6_parent_add)) {
1669		/*
1670		 * The bucket version has changed since the last lookup,
1671		 * so do the lookup again to be sure that the state does not exist.
1672		 */
1673		s = dyn_lookup_ipv6_parent_locked(pkt, zoneid, rule, ruleid,
1674		    rulenum, bucket);
1675		if (s != NULL) {
			/*
			 * A concurrent thread has already created this
			 * state. Just return it.
			 */
1680			DYNSTATE_CRITICAL_ENTER();
1681			DYNSTATE_PROTECT(s);
1682			DYN_BUCKET_UNLOCK(bucket);
1683			return (s);
1684		}
1685	}
1686
1687	limit = dyn_alloc_parent(rule, ruleid, rulenum, set, hashval);
1688	if (limit == NULL) {
1689		DYN_BUCKET_UNLOCK(bucket);
1690		return (NULL);
1691	}
1692
1693	s = dyn_alloc_ipv6_state(pkt, zoneid, kidx, O_LIMIT_PARENT);
1694	if (s == NULL) {
1695		DYN_BUCKET_UNLOCK(bucket);
1696		uma_zfree(V_dyn_parent_zone, limit);
1697		return (NULL);
1698	}
1699
1700	s->limit = limit;
1701	CK_SLIST_INSERT_HEAD(&V_dyn_ipv6_parent[bucket], s, entry);
1702	DYN_COUNT_INC(dyn_parent_count);
1703	DYN_BUCKET_VERSION_BUMP(bucket, ipv6_parent_add);
1704	DYNSTATE_CRITICAL_ENTER();
1705	DYNSTATE_PROTECT(s);
1706	DYN_BUCKET_UNLOCK(bucket);
1707	return (s);
1708}
1709
1710static int
1711dyn_add_ipv6_state(void *parent, uint32_t ruleid, uint16_t rulenum,
1712    uint8_t set, const struct ipfw_flow_id *pkt, uint32_t zoneid,
1713    const void *ulp, int pktlen, uint32_t hashval, struct ipfw_dyn_info *info,
1714    uint16_t fibnum, uint16_t kidx, uint8_t type)
1715{
1716	struct dyn_ipv6_state *s;
1717	struct dyn_data *data;
1718	uint32_t bucket;
1719
1720	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
1721	DYN_BUCKET_LOCK(bucket);
1722	if (info->direction == MATCH_UNKNOWN ||
1723	    info->kidx != kidx ||
1724	    info->hashval != hashval ||
1725	    info->version != DYN_BUCKET_VERSION(bucket, ipv6_add)) {
		/*
		 * Bucket version has changed since the last lookup,
		 * do the lookup again to make sure the state does not
		 * exist.
		 */
1730		if (dyn_lookup_ipv6_state_locked(pkt, zoneid, ulp, pktlen,
1731		    bucket, kidx) != 0) {
1732			DYN_BUCKET_UNLOCK(bucket);
1733			return (EEXIST);
1734		}
1735	}
1736
1737	data = dyn_alloc_dyndata(parent, ruleid, rulenum, set, pkt, ulp,
1738	    pktlen, hashval, fibnum);
1739	if (data == NULL) {
1740		DYN_BUCKET_UNLOCK(bucket);
1741		return (ENOMEM);
1742	}
1743
1744	s = dyn_alloc_ipv6_state(pkt, zoneid, kidx, type);
1745	if (s == NULL) {
1746		DYN_BUCKET_UNLOCK(bucket);
1747		uma_zfree(V_dyn_data_zone, data);
1748		return (ENOMEM);
1749	}
1750
1751	s->data = data;
1752	CK_SLIST_INSERT_HEAD(&V_dyn_ipv6[bucket], s, entry);
1753	DYN_COUNT_INC(dyn_count);
1754	DYN_BUCKET_VERSION_BUMP(bucket, ipv6_add);
1755	DYN_BUCKET_UNLOCK(bucket);
1756	return (0);
1757}
1758#endif /* INET6 */
1759
1760static void *
1761dyn_get_parent_state(const struct ipfw_flow_id *pkt, uint32_t zoneid,
1762    struct ip_fw *rule, uint32_t hashval, uint32_t limit, uint16_t kidx)
1763{
1764	char sbuf[24];
1765	struct dyn_parent *p;
1766	void *ret;
1767	uint32_t bucket, version;
1768
1769	p = NULL;
1770	ret = NULL;
1771	bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets);
1772	DYNSTATE_CRITICAL_ENTER();
1773	if (IS_IP4_FLOW_ID(pkt)) {
1774		struct dyn_ipv4_state *s;
1775
1776		version = DYN_BUCKET_VERSION(bucket, ipv4_parent_add);
1777		s = dyn_lookup_ipv4_parent(pkt, rule, rule->id,
1778		    rule->rulenum, bucket);
1779		if (s == NULL) {
1780			/*
1781			 * Exit from critical section because dyn_add_parent()
1782			 * will acquire bucket lock.
1783			 */
1784			DYNSTATE_CRITICAL_EXIT();
1785
1786			s = dyn_add_ipv4_parent(rule, rule->id,
1787			    rule->rulenum, rule->set, pkt, hashval,
1788			    version, kidx);
1789			if (s == NULL)
1790				return (NULL);
1791			/* Now we are in critical section again. */
1792		}
1793		ret = s;
1794		p = s->limit;
1795	}
1796#ifdef INET6
1797	else if (IS_IP6_FLOW_ID(pkt)) {
1798		struct dyn_ipv6_state *s;
1799
1800		version = DYN_BUCKET_VERSION(bucket, ipv6_parent_add);
1801		s = dyn_lookup_ipv6_parent(pkt, zoneid, rule, rule->id,
1802		    rule->rulenum, bucket);
1803		if (s == NULL) {
1804			/*
1805			 * Exit from critical section because dyn_add_parent()
1806			 * can acquire bucket mutex.
1807			 */
1808			DYNSTATE_CRITICAL_EXIT();
1809
1810			s = dyn_add_ipv6_parent(rule, rule->id,
1811			    rule->rulenum, rule->set, pkt, zoneid, hashval,
1812			    version, kidx);
1813			if (s == NULL)
1814				return (NULL);
1815			/* Now we are in critical section again. */
1816		}
1817		ret = s;
1818		p = s->limit;
1819	}
1820#endif
1821	else {
1822		DYNSTATE_CRITICAL_EXIT();
1823		return (NULL);
1824	}
1825
1826	/* Check the limit */
1827	if (DPARENT_COUNT(p) >= limit) {
1828		DYNSTATE_CRITICAL_EXIT();
1829		if (V_fw_verbose && last_log != time_uptime) {
1830			last_log = time_uptime;
1831			snprintf(sbuf, sizeof(sbuf), "%u drop session",
1832			    rule->rulenum);
1833			print_dyn_rule_flags(pkt, O_LIMIT,
1834			    LOG_SECURITY | LOG_DEBUG, sbuf,
1835			    "too many entries");
1836		}
1837		return (NULL);
1838	}
1839
1840	/* Take new session into account. */
1841	DPARENT_COUNT_INC(p);
	/*
	 * We must exit from the critical section because the following
	 * code can acquire the bucket mutex.
	 * We rely on the 'count' field. The state will not expire
	 * until it has some child states, i.e. its 'count' field is not
	 * zero. Return the state pointer, it will be used by child
	 * states as their parent.
	 */
1849	DYNSTATE_CRITICAL_EXIT();
1850	return (ret);
1851}
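
/*
 * Worked example (illustrative): with "limit src-addr 10" the masked
 * flow id keeps only the source address, so all sessions from one host
 * share a single O_LIMIT_PARENT state.  dyn_get_parent_state() rejects
 * the 11th concurrent session from that host because DPARENT_COUNT()
 * already equals the limit, and logs "drop session" when
 * net.inet.ip.fw.verbose is enabled.
 */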
1852
1853static int
1854dyn_install_state(const struct ipfw_flow_id *pkt, uint32_t zoneid,
1855    uint16_t fibnum, const void *ulp, int pktlen, void *rule,
1856    uint32_t ruleid, uint16_t rulenum, uint8_t set,
1857    struct ipfw_dyn_info *info, uint32_t limit, uint16_t limit_mask,
1858    uint16_t kidx, uint8_t type)
1859{
1860	struct ipfw_flow_id id;
1861	uint32_t hashval, parent_hashval;
1862	int ret;
1863
1864	MPASS(type == O_LIMIT || type == O_KEEP_STATE);
1865
1866	if (type == O_LIMIT) {
1867		/* Create masked flow id and calculate bucket */
1868		id.addr_type = pkt->addr_type;
1869		id.proto = pkt->proto;
1870		id.fib = fibnum; /* unused */
1871		id.src_port = (limit_mask & DYN_SRC_PORT) ?
1872		    pkt->src_port: 0;
1873		id.dst_port = (limit_mask & DYN_DST_PORT) ?
1874		    pkt->dst_port: 0;
1875		if (IS_IP4_FLOW_ID(pkt)) {
1876			id.src_ip = (limit_mask & DYN_SRC_ADDR) ?
1877			    pkt->src_ip: 0;
1878			id.dst_ip = (limit_mask & DYN_DST_ADDR) ?
1879			    pkt->dst_ip: 0;
1880		}
1881#ifdef INET6
1882		else if (IS_IP6_FLOW_ID(pkt)) {
1883			if (limit_mask & DYN_SRC_ADDR)
1884				id.src_ip6 = pkt->src_ip6;
1885			else
1886				memset(&id.src_ip6, 0, sizeof(id.src_ip6));
1887			if (limit_mask & DYN_DST_ADDR)
1888				id.dst_ip6 = pkt->dst_ip6;
1889			else
1890				memset(&id.dst_ip6, 0, sizeof(id.dst_ip6));
1891		}
1892#endif
1893		else
1894			return (EAFNOSUPPORT);
1895
1896		parent_hashval = hash_parent(&id, rule);
1897		rule = dyn_get_parent_state(&id, zoneid, rule, parent_hashval,
1898		    limit, kidx);
1899		if (rule == NULL) {
1900#if 0
1901			if (V_fw_verbose && last_log != time_uptime) {
1902				last_log = time_uptime;
1903				snprintf(sbuf, sizeof(sbuf),
1904				    "%u drop session", rule->rulenum);
1905			print_dyn_rule_flags(pkt, O_LIMIT,
1906			    LOG_SECURITY | LOG_DEBUG, sbuf,
1907			    "too many entries");
1908			}
1909#endif
1910			return (EACCES);
1911		}
		/*
		 * The limit is not reached, create a new state.
		 * Now 'rule' points to the parent state.
		 */
1916	}
1917
1918	hashval = hash_packet(pkt);
1919	if (IS_IP4_FLOW_ID(pkt))
1920		ret = dyn_add_ipv4_state(rule, ruleid, rulenum, set, pkt,
1921		    ulp, pktlen, hashval, info, fibnum, kidx, type);
1922#ifdef INET6
1923	else if (IS_IP6_FLOW_ID(pkt))
1924		ret = dyn_add_ipv6_state(rule, ruleid, rulenum, set, pkt,
1925		    zoneid, ulp, pktlen, hashval, info, fibnum, kidx, type);
1926#endif /* INET6 */
1927	else
1928		ret = EAFNOSUPPORT;
1929
1930	if (type == O_LIMIT) {
1931		if (ret != 0) {
			/*
			 * We failed to create a child state for the
			 * O_LIMIT opcode. Since we already counted it in
			 * the parent, we must revert the counter. 'rule'
			 * points to the parent state, use it to get the
			 * dyn_parent.
			 *
			 * XXXAE: it should be safe to use the 'rule'
			 * pointer without an extra lookup, the parent
			 * state is referenced and should not be freed.
			 */
1942			if (IS_IP4_FLOW_ID(&id))
1943				DPARENT_COUNT_DEC(
1944				    ((struct dyn_ipv4_state *)rule)->limit);
1945#ifdef INET6
1946			else if (IS_IP6_FLOW_ID(&id))
1947				DPARENT_COUNT_DEC(
1948				    ((struct dyn_ipv6_state *)rule)->limit);
1949#endif
1950		}
1951	}
	/*
	 * EEXIST means that a concurrent thread has created this
	 * state. Consider this a success.
	 *
	 * XXXAE: should we invalidate 'info' content here?
	 */
1958	if (ret == EEXIST)
1959		return (0);
1960	return (ret);
1961}
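
/*
 * Sketch (under "#if 0", not compiled) of the masked flow id that
 * dyn_install_state() builds for the "limit src-addr 10" example
 * above: only DYN_SRC_ADDR is set in limit_mask, so everything except
 * the source address is zeroed before hashing for the parent state.
 */
#if 0
	id.addr_type = pkt->addr_type;
	id.proto = pkt->proto;
	id.src_ip = pkt->src_ip;	/* DYN_SRC_ADDR is set */
	id.dst_ip = 0;			/* DYN_DST_ADDR is not */
	id.src_port = 0;		/* DYN_SRC_PORT is not */
	id.dst_port = 0;		/* DYN_DST_PORT is not */
#endif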
1962
1963/*
1964 * Install dynamic state.
1965 *  chain - ipfw's instance;
1966 *  rule - the parent rule that installs the state;
1967 *  cmd - opcode that installs the state;
1968 *  args - ipfw arguments;
1969 *  ulp - upper level protocol header;
1970 *  pktlen - packet length;
1971 *  info - dynamic state lookup info;
1972 *  tablearg - tablearg id.
1973 *
 * Returns a non-zero value (failure) if the state is not installed
 * because of an error or because session limits are enforced.
1976 */
1977int
1978ipfw_dyn_install_state(struct ip_fw_chain *chain, struct ip_fw *rule,
1979    const ipfw_insn_limit *cmd, const struct ip_fw_args *args,
1980    const void *ulp, int pktlen, struct ipfw_dyn_info *info,
1981    uint32_t tablearg)
1982{
1983	uint32_t limit;
1984	uint16_t limit_mask;
1985
1986	if (cmd->o.opcode == O_LIMIT) {
1987		limit = IP_FW_ARG_TABLEARG(chain, cmd->conn_limit, limit);
1988		limit_mask = cmd->limit_mask;
1989	} else {
1990		limit = 0;
1991		limit_mask = 0;
1992	}
1993	return (dyn_install_state(&args->f_id,
1994#ifdef INET6
1995	    IS_IP6_FLOW_ID(&args->f_id) ? dyn_getscopeid(args):
1996#endif
1997	    0, M_GETFIB(args->m), ulp, pktlen, rule, rule->id, rule->rulenum,
1998	    rule->set, info, limit, limit_mask, cmd->o.arg1, cmd->o.opcode));
1999}
2000
/*
 * Free state entries from the expired lists that are safe to remove.
 */
2004static void
2005dyn_free_states(struct ip_fw_chain *chain)
2006{
2007	struct dyn_ipv4_state *s4, *s4n;
2008#ifdef INET6
2009	struct dyn_ipv6_state *s6, *s6n;
2010#endif
2011	int cached_count, i;
2012
	/*
	 * We keep pointers to objects that are in use on each CPU
	 * in the per-cpu dyn_hp pointer. When an object is going to be
	 * removed, it is first unlinked from the corresponding list.
	 * This changes the dyn_bucket_xxx_delver version. Unlinked
	 * objects are placed into the corresponding dyn_expired_xxx
	 * list. A reader that is going to dereference an object pointer
	 * checks the dyn_bucket_xxx_delver version before and after
	 * storing the pointer into dyn_hp. If the version is the same,
	 * the object is protected from freeing and it is safe to
	 * dereference. Otherwise the reader restarts the list iteration
	 * from the beginning, but this object is now unlinked and thus
	 * will not be accessible. (A reader-side sketch of this protocol
	 * follows this function.)
	 *
	 * Copy the dyn_hp pointer of each CPU into the dyn_hp_cache
	 * array. It does not matter that some pointers can change while
	 * we are copying. We need to check that objects removed in the
	 * previous pass are not in use. If a dyn_hp pointer does not
	 * contain such an object at the time we are copying, it will
	 * not appear there later, because the object is already
	 * unlinked. And for new pointers we will not free objects that
	 * will be unlinked in this pass.
	 */
2035	cached_count = 0;
2036	CPU_FOREACH(i) {
2037		dyn_hp_cache[cached_count] = DYNSTATE_GET(i);
2038		if (dyn_hp_cache[cached_count] != NULL)
2039			cached_count++;
2040	}
2041
	/*
	 * Free expired states that are safe to free.
	 * Check each entry from the previous pass in the
	 * dyn_expired_xxx lists; if a pointer to the object is in the
	 * dyn_hp_cache array, keep it until the next pass. Otherwise it
	 * is safe to free the object.
	 *
	 * XXXAE: optimize this to use SLIST_REMOVE_AFTER.
	 */
2051#define	DYN_FREE_STATES(s, next, name)		do {			\
2052	s = SLIST_FIRST(&V_dyn_expired_ ## name);			\
2053	while (s != NULL) {						\
2054		next = SLIST_NEXT(s, expired);				\
2055		for (i = 0; i < cached_count; i++)			\
2056			if (dyn_hp_cache[i] == s)			\
2057				break;					\
2058		if (i == cached_count) {				\
2059			if (s->type == O_LIMIT_PARENT &&		\
2060			    s->limit->count != 0) {			\
2061				s = next;				\
2062				continue;				\
2063			}						\
2064			SLIST_REMOVE(&V_dyn_expired_ ## name,		\
2065			    s, dyn_ ## name ## _state, expired);	\
2066			if (s->type == O_LIMIT_PARENT)			\
2067				uma_zfree(V_dyn_parent_zone, s->limit);	\
2068			else						\
2069				uma_zfree(V_dyn_data_zone, s->data);	\
2070			uma_zfree(V_dyn_ ## name ## _zone, s);		\
2071		}							\
2072		s = next;						\
2073	}								\
2074} while (0)
2075
	/*
	 * Protect access to the expired lists with DYN_EXPIRED_LOCK.
	 * Userland can invoke ipfw_expire_dyn_states() to delete
	 * specific states, which modifies the expired lists.
	 *
	 * XXXAE: do we need DYN_EXPIRED_LOCK? We can just use
	 *	  IPFW_UH_WLOCK to protect access to these lists.
	 */
2085	DYN_EXPIRED_LOCK();
2086	DYN_FREE_STATES(s4, s4n, ipv4);
2087#ifdef INET6
2088	DYN_FREE_STATES(s6, s6n, ipv6);
2089#endif
2090	DYN_EXPIRED_UNLOCK();
2091#undef DYN_FREE_STATES
2092}
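
/*
 * Reader-side sketch (under "#if 0", not compiled) of the hazard
 * pointer protocol described in dyn_free_states() above, simplified
 * from the lockless lookup functions.  The reader publishes the
 * pointer it is about to dereference in its per-CPU dyn_hp slot and
 * then re-checks the bucket's "del" version; if the version changed,
 * the state may already be unlinked and the scan restarts.
 */
#if 0
	DYNSTATE_CRITICAL_ENTER();
restart:
	version = DYN_BUCKET_VERSION(bucket, ipv4_del);
	CK_SLIST_FOREACH(s, &V_dyn_ipv4[bucket], entry) {
		DYNSTATE_PROTECT(s);	/* publish s in per-CPU dyn_hp */
		if (version != DYN_BUCKET_VERSION(bucket, ipv4_del))
			goto restart;	/* s may have been unlinked */
		if (s->proto == pkt->proto /* && addresses/ports match */)
			break;
	}
	/* ... s (or NULL) is safe to use until the critical exit ... */
	DYNSTATE_CRITICAL_EXIT();
#endif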
2093
/*
 * Returns 1 when the state is matched by the specified range,
 * otherwise returns 0.
 */
2097static int
2098dyn_match_range(uint16_t rulenum, uint8_t set, const ipfw_range_tlv *rt)
2099{
2100
2101	MPASS(rt != NULL);
2102	/* flush all states */
2103	if (rt->flags & IPFW_RCFLAG_ALL)
2104		return (1);
2105	if ((rt->flags & IPFW_RCFLAG_SET) != 0 && set != rt->set)
2106		return (0);
2107	if ((rt->flags & IPFW_RCFLAG_RANGE) != 0 &&
2108	    (rulenum < rt->start_rule || rulenum > rt->end_rule))
2109		return (0);
2110	return (1);
2111}
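
/*
 * Note on the matchers below: O_LIMIT and O_LIMIT_PARENT states are
 * always matched (and thus removed) together with their rule, while
 * ordinary O_KEEP_STATE states are skipped when V_dyn_keep_states
 * (the net.inet.ip.fw.dyn_keep_states sysctl) is non-zero, so they
 * survive rule deletion.
 */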
2112
2113static int
2114dyn_match_ipv4_state(struct dyn_ipv4_state *s, const ipfw_range_tlv *rt)
2115{
2116
2117	if (s->type == O_LIMIT_PARENT)
2118		return (dyn_match_range(s->limit->rulenum,
2119		    s->limit->set, rt));
2120
2121	if (s->type == O_LIMIT)
2122		return (dyn_match_range(s->data->rulenum, s->data->set, rt));
2123
2124	if (V_dyn_keep_states == 0 &&
2125	    dyn_match_range(s->data->rulenum, s->data->set, rt))
2126		return (1);
2127
2128	return (0);
2129}
2130
2131#ifdef INET6
2132static int
2133dyn_match_ipv6_state(struct dyn_ipv6_state *s, const ipfw_range_tlv *rt)
2134{
2135
2136	if (s->type == O_LIMIT_PARENT)
2137		return (dyn_match_range(s->limit->rulenum,
2138		    s->limit->set, rt));
2139
2140	if (s->type == O_LIMIT)
2141		return (dyn_match_range(s->data->rulenum, s->data->set, rt));
2142
2143	if (V_dyn_keep_states == 0 &&
2144	    dyn_match_range(s->data->rulenum, s->data->set, rt))
2145		return (1);
2146
2147	return (0);
2148}
2149#endif
2150
2151/*
2152 * Unlink expired entries from states lists.
2153 * @rt can be used to specify the range of states for deletion.
2154 */
2155static void
2156dyn_expire_states(struct ip_fw_chain *chain, ipfw_range_tlv *rt)
2157{
2158	struct dyn_ipv4_slist expired_ipv4;
2159#ifdef INET6
2160	struct dyn_ipv6_slist expired_ipv6;
2161	struct dyn_ipv6_state *s6, *s6n, *s6p;
2162#endif
2163	struct dyn_ipv4_state *s4, *s4n, *s4p;
2164	int bucket, removed, length, max_length;
2165
	/*
	 * Unlink expired states from each bucket.
	 * With the bucket lock held, iterate the entries of each of the
	 * lists: ipv4, ipv4_parent, ipv6, and ipv6_parent. Check the
	 * expire time and unlink the entry from the list, link the
	 * entry into the temporary expired_xxx lists, then bump the
	 * "del" bucket version.
	 *
	 * When an entry is removed, the corresponding states counter is
	 * decremented. If the entry has O_LIMIT type, the parent's
	 * reference counter is decremented.
	 *
	 * NOTE: this function can be called from userspace context
	 * when the user deletes rules. In this case all matched states
	 * will be forcibly unlinked. O_LIMIT_PARENT states will be kept
	 * in the expired lists until their reference counter becomes
	 * zero.
	 */
2182#define	DYN_UNLINK_STATES(s, prev, next, exp, af, name, extra)	do {	\
2183	length = 0;							\
2184	removed = 0;							\
2185	prev = NULL;							\
2186	s = CK_SLIST_FIRST(&V_dyn_ ## name [bucket]);			\
2187	while (s != NULL) {						\
2188		next = CK_SLIST_NEXT(s, entry);				\
2189		if ((TIME_LEQ((s)->exp, time_uptime) && extra) ||	\
2190		    (rt != NULL && dyn_match_ ## af ## _state(s, rt))) {\
2191			if (prev != NULL)				\
2192				CK_SLIST_REMOVE_AFTER(prev, entry);	\
2193			else						\
2194				CK_SLIST_REMOVE_HEAD(			\
2195				    &V_dyn_ ## name [bucket], entry);	\
2196			removed++;					\
2197			SLIST_INSERT_HEAD(&expired_ ## af, s, expired);	\
2198			if (s->type == O_LIMIT_PARENT)			\
2199				DYN_COUNT_DEC(dyn_parent_count);	\
2200			else {						\
2201				DYN_COUNT_DEC(dyn_count);		\
2202				if (s->type == O_LIMIT)	{		\
2203					s = s->data->parent;		\
2204					DPARENT_COUNT_DEC(s->limit);	\
2205				}					\
2206			}						\
2207		} else {						\
2208			prev = s;					\
2209			length++;					\
2210		}							\
2211		s = next;						\
2212	}								\
2213	if (removed != 0)						\
2214		DYN_BUCKET_VERSION_BUMP(bucket, name ## _del);		\
2215	if (length > max_length)				\
2216		max_length = length;				\
2217} while (0)
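
	/*
	 * Note: inside DYN_UNLINK_STATES above, for an O_LIMIT child
	 * 's' is temporarily repointed at its parent state only to
	 * decrement the parent's reference counter; the walk then
	 * continues from the saved 'next' pointer.
	 */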
2218
2219	SLIST_INIT(&expired_ipv4);
2220#ifdef INET6
2221	SLIST_INIT(&expired_ipv6);
2222#endif
2223	max_length = 0;
2224	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
2225		DYN_BUCKET_LOCK(bucket);
2226		DYN_UNLINK_STATES(s4, s4p, s4n, data->expire, ipv4, ipv4, 1);
2227		DYN_UNLINK_STATES(s4, s4p, s4n, limit->expire, ipv4,
2228		    ipv4_parent, (s4->limit->count == 0));
2229#ifdef INET6
2230		DYN_UNLINK_STATES(s6, s6p, s6n, data->expire, ipv6, ipv6, 1);
2231		DYN_UNLINK_STATES(s6, s6p, s6n, limit->expire, ipv6,
2232		    ipv6_parent, (s6->limit->count == 0));
2233#endif
2234		DYN_BUCKET_UNLOCK(bucket);
2235	}
2236	/* Update curr_max_length for statistics. */
2237	V_curr_max_length = max_length;
2238	/*
2239	 * Concatenate temporary lists with global expired lists.
2240	 */
2241	DYN_EXPIRED_LOCK();
2242	SLIST_CONCAT(&V_dyn_expired_ipv4, &expired_ipv4,
2243	    dyn_ipv4_state, expired);
2244#ifdef INET6
2245	SLIST_CONCAT(&V_dyn_expired_ipv6, &expired_ipv6,
2246	    dyn_ipv6_state, expired);
2247#endif
2248	DYN_EXPIRED_UNLOCK();
2249#undef DYN_UNLINK_STATES
2251}
2252
2253static struct mbuf *
2254dyn_mgethdr(int len, uint16_t fibnum)
2255{
2256	struct mbuf *m;
2257
2258	m = m_gethdr(M_NOWAIT, MT_DATA);
2259	if (m == NULL)
2260		return (NULL);
2261#ifdef MAC
2262	mac_netinet_firewall_send(m);
2263#endif
2264	M_SETFIB(m, fibnum);
2265	m->m_data += max_linkhdr;
2266	m->m_flags |= M_SKIP_FIREWALL;
2267	m->m_len = m->m_pkthdr.len = len;
2268	bzero(m->m_data, len);
2269	return (m);
2270}
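
/*
 * Note: the M_SKIP_FIREWALL flag set in dyn_mgethdr() above keeps ipfw
 * from re-inspecting its own keepalive packets when they are later
 * handed to ip_output()/ip6_output().
 */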
2271
2272static void
2273dyn_make_keepalive_ipv4(struct mbuf *m, in_addr_t src, in_addr_t dst,
2274    uint32_t seq, uint32_t ack, uint16_t sport, uint16_t dport)
2275{
2276	struct tcphdr *tcp;
2277	struct ip *ip;
2278
2279	ip = mtod(m, struct ip *);
2280	ip->ip_v = 4;
2281	ip->ip_hl = sizeof(*ip) >> 2;
2282	ip->ip_tos = IPTOS_LOWDELAY;
2283	ip->ip_len = htons(m->m_len);
2284	ip->ip_off |= htons(IP_DF);
2285	ip->ip_ttl = V_ip_defttl;
2286	ip->ip_p = IPPROTO_TCP;
2287	ip->ip_src.s_addr = htonl(src);
2288	ip->ip_dst.s_addr = htonl(dst);
2289
2290	tcp = mtodo(m, sizeof(struct ip));
2291	tcp->th_sport = htons(sport);
2292	tcp->th_dport = htons(dport);
2293	tcp->th_off = sizeof(struct tcphdr) >> 2;
2294	tcp->th_seq = htonl(seq);
2295	tcp->th_ack = htonl(ack);
2296	tcp->th_flags = TH_ACK;
2297	tcp->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2298	    htons(sizeof(struct tcphdr) + IPPROTO_TCP));
2299
2300	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
2301	m->m_pkthdr.csum_flags = CSUM_TCP;
2302}
2303
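/*
 * The keepalives built by dyn_make_keepalive_ipv4() use the classic
 * TCP probe trick: send a pure ACK whose sequence number is one below
 * what the peer expects (the "ack_fwd - 1" / "ack_rev - 1" in the
 * caller below), so the peer treats it as a duplicate and replies with
 * an ACK for the current sequence.  The reply traverses ipfw and
 * refreshes the dynamic state.
 */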
2304static void
2305dyn_enqueue_keepalive_ipv4(struct mbufq *q, const struct dyn_ipv4_state *s)
2306{
2307	struct mbuf *m;
2308
2309	if ((s->data->state & ACK_FWD) == 0 && s->data->ack_fwd > 0) {
2310		m = dyn_mgethdr(sizeof(struct ip) + sizeof(struct tcphdr),
2311		    s->data->fibnum);
2312		if (m != NULL) {
2313			dyn_make_keepalive_ipv4(m, s->dst, s->src,
2314			    s->data->ack_fwd - 1, s->data->ack_rev,
2315			    s->dport, s->sport);
2316			if (mbufq_enqueue(q, m)) {
2317				m_freem(m);
2318				log(LOG_DEBUG, "ipfw: limit for IPv4 "
2319				    "keepalive queue is reached.\n");
2320				return;
2321			}
2322		}
2323	}
2324
2325	if ((s->data->state & ACK_REV) == 0 && s->data->ack_rev > 0) {
2326		m = dyn_mgethdr(sizeof(struct ip) + sizeof(struct tcphdr),
2327		    s->data->fibnum);
2328		if (m != NULL) {
2329			dyn_make_keepalive_ipv4(m, s->src, s->dst,
2330			    s->data->ack_rev - 1, s->data->ack_fwd,
2331			    s->sport, s->dport);
2332			if (mbufq_enqueue(q, m)) {
2333				m_freem(m);
2334				log(LOG_DEBUG, "ipfw: limit for IPv4 "
2335				    "keepalive queue is reached.\n");
2336				return;
2337			}
2338		}
2339	}
2340}
2341
2342/*
2343 * Prepare and send keep-alive packets.
2344 */
2345static void
2346dyn_send_keepalive_ipv4(struct ip_fw_chain *chain)
2347{
2348	struct mbufq q;
2349	struct mbuf *m;
2350	struct dyn_ipv4_state *s;
2351	uint32_t bucket;
2352
2353	mbufq_init(&q, DYN_KEEPALIVE_MAXQ);
2354	IPFW_UH_RLOCK(chain);
	/*
	 * It is safe not to use hazard pointers and just do lockless
	 * access to the lists, because state entries cannot be deleted
	 * while we hold IPFW_UH_RLOCK.
	 */
2360	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
2361		CK_SLIST_FOREACH(s, &V_dyn_ipv4[bucket], entry) {
			/*
			 * Only established TCP connections that will
			 * expire within dyn_keepalive_interval.
			 */
2366			if (s->proto != IPPROTO_TCP ||
2367			    (s->data->state & BOTH_SYN) != BOTH_SYN ||
2368			    TIME_LEQ(time_uptime + V_dyn_keepalive_interval,
2369				s->data->expire))
2370				continue;
2371			dyn_enqueue_keepalive_ipv4(&q, s);
2372		}
2373	}
2374	IPFW_UH_RUNLOCK(chain);
2375	while ((m = mbufq_dequeue(&q)) != NULL)
2376		ip_output(m, NULL, NULL, 0, NULL, NULL);
2377}
2378
2379#ifdef INET6
2380static void
2381dyn_make_keepalive_ipv6(struct mbuf *m, const struct in6_addr *src,
2382    const struct in6_addr *dst, uint32_t zoneid, uint32_t seq, uint32_t ack,
2383    uint16_t sport, uint16_t dport)
2384{
2385	struct tcphdr *tcp;
2386	struct ip6_hdr *ip6;
2387
2388	ip6 = mtod(m, struct ip6_hdr *);
2389	ip6->ip6_vfc |= IPV6_VERSION;
2390	ip6->ip6_plen = htons(sizeof(struct tcphdr));
2391	ip6->ip6_nxt = IPPROTO_TCP;
2392	ip6->ip6_hlim = IPV6_DEFHLIM;
2393	ip6->ip6_src = *src;
2394	if (IN6_IS_ADDR_LINKLOCAL(src))
2395		ip6->ip6_src.s6_addr16[1] = htons(zoneid & 0xffff);
2396	ip6->ip6_dst = *dst;
2397	if (IN6_IS_ADDR_LINKLOCAL(dst))
2398		ip6->ip6_dst.s6_addr16[1] = htons(zoneid & 0xffff);
2399
2400	tcp = mtodo(m, sizeof(struct ip6_hdr));
2401	tcp->th_sport = htons(sport);
2402	tcp->th_dport = htons(dport);
2403	tcp->th_off = sizeof(struct tcphdr) >> 2;
2404	tcp->th_seq = htonl(seq);
2405	tcp->th_ack = htonl(ack);
2406	tcp->th_flags = TH_ACK;
2407	tcp->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr),
2408	    IPPROTO_TCP, 0);
2409
2410	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
2411	m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
2412}
2413
2414static void
2415dyn_enqueue_keepalive_ipv6(struct mbufq *q, const struct dyn_ipv6_state *s)
2416{
2417	struct mbuf *m;
2418
2419	if ((s->data->state & ACK_FWD) == 0 && s->data->ack_fwd > 0) {
2420		m = dyn_mgethdr(sizeof(struct ip6_hdr) +
2421		    sizeof(struct tcphdr), s->data->fibnum);
2422		if (m != NULL) {
2423			dyn_make_keepalive_ipv6(m, &s->dst, &s->src,
2424			    s->zoneid, s->data->ack_fwd - 1, s->data->ack_rev,
2425			    s->dport, s->sport);
2426			if (mbufq_enqueue(q, m)) {
2427				m_freem(m);
2428				log(LOG_DEBUG, "ipfw: limit for IPv6 "
2429				    "keepalive queue is reached.\n");
2430				return;
2431			}
2432		}
2433	}
2434
2435	if ((s->data->state & ACK_REV) == 0 && s->data->ack_rev > 0) {
2436		m = dyn_mgethdr(sizeof(struct ip6_hdr) +
2437		    sizeof(struct tcphdr), s->data->fibnum);
2438		if (m != NULL) {
2439			dyn_make_keepalive_ipv6(m, &s->src, &s->dst,
2440			    s->zoneid, s->data->ack_rev - 1, s->data->ack_fwd,
2441			    s->sport, s->dport);
2442			if (mbufq_enqueue(q, m)) {
2443				m_freem(m);
2444				log(LOG_DEBUG, "ipfw: limit for IPv6 "
2445				    "keepalive queue is reached.\n");
2446				return;
2447			}
2448		}
2449	}
2450}
2451
2452static void
2453dyn_send_keepalive_ipv6(struct ip_fw_chain *chain)
2454{
2455	struct mbufq q;
2456	struct mbuf *m;
2457	struct dyn_ipv6_state *s;
2458	uint32_t bucket;
2459
2460	mbufq_init(&q, DYN_KEEPALIVE_MAXQ);
2461	IPFW_UH_RLOCK(chain);
	/*
	 * It is safe not to use hazard pointers and just do lockless
	 * access to the lists, because state entries cannot be deleted
	 * while we hold IPFW_UH_RLOCK.
	 */
2467	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
2468		CK_SLIST_FOREACH(s, &V_dyn_ipv6[bucket], entry) {
			/*
			 * Only established TCP connections that will
			 * expire within dyn_keepalive_interval.
			 */
2473			if (s->proto != IPPROTO_TCP ||
2474			    (s->data->state & BOTH_SYN) != BOTH_SYN ||
2475			    TIME_LEQ(time_uptime + V_dyn_keepalive_interval,
2476				s->data->expire))
2477				continue;
2478			dyn_enqueue_keepalive_ipv6(&q, s);
2479		}
2480	}
2481	IPFW_UH_RUNLOCK(chain);
2482	while ((m = mbufq_dequeue(&q)) != NULL)
2483		ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
2484}
2485#endif /* INET6 */
2486
2487static void
2488dyn_grow_hashtable(struct ip_fw_chain *chain, uint32_t new)
2489{
2490#ifdef INET6
2491	struct dyn_ipv6ck_slist *ipv6, *ipv6_parent;
2492	uint32_t *ipv6_add, *ipv6_del, *ipv6_parent_add, *ipv6_parent_del;
2493	struct dyn_ipv6_state *s6;
2494#endif
2495	struct dyn_ipv4ck_slist *ipv4, *ipv4_parent;
2496	uint32_t *ipv4_add, *ipv4_del, *ipv4_parent_add, *ipv4_parent_del;
2497	struct dyn_ipv4_state *s4;
2498	struct mtx *bucket_lock;
2499	void *tmp;
2500	uint32_t bucket;
2501
2502	MPASS(powerof2(new));
2503	DYN_DEBUG("grow hash size %u -> %u", V_curr_dyn_buckets, new);
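	/*
	 * The new size must be a power of 2 because DYN_BUCKET() reduces
	 * a hash value to a bucket index with a mask rather than a
	 * division; with new == 256, for example, only the low 8 bits of
	 * the hash select the bucket.
	 */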
2504	/*
2505	 * Allocate and initialize new lists.
2506	 * XXXAE: on memory pressure this can disable callout timer.
2507	 */
2508	bucket_lock = malloc(new * sizeof(struct mtx), M_IPFW,
2509	    M_WAITOK | M_ZERO);
2510	ipv4 = malloc(new * sizeof(struct dyn_ipv4ck_slist), M_IPFW,
2511	    M_WAITOK | M_ZERO);
2512	ipv4_parent = malloc(new * sizeof(struct dyn_ipv4ck_slist), M_IPFW,
2513	    M_WAITOK | M_ZERO);
2514	ipv4_add = malloc(new * sizeof(uint32_t), M_IPFW, M_WAITOK | M_ZERO);
2515	ipv4_del = malloc(new * sizeof(uint32_t), M_IPFW, M_WAITOK | M_ZERO);
2516	ipv4_parent_add = malloc(new * sizeof(uint32_t), M_IPFW,
2517	    M_WAITOK | M_ZERO);
2518	ipv4_parent_del = malloc(new * sizeof(uint32_t), M_IPFW,
2519	    M_WAITOK | M_ZERO);
2520#ifdef INET6
2521	ipv6 = malloc(new * sizeof(struct dyn_ipv6ck_slist), M_IPFW,
2522	    M_WAITOK | M_ZERO);
2523	ipv6_parent = malloc(new * sizeof(struct dyn_ipv6ck_slist), M_IPFW,
2524	    M_WAITOK | M_ZERO);
2525	ipv6_add = malloc(new * sizeof(uint32_t), M_IPFW, M_WAITOK | M_ZERO);
2526	ipv6_del = malloc(new * sizeof(uint32_t), M_IPFW, M_WAITOK | M_ZERO);
2527	ipv6_parent_add = malloc(new * sizeof(uint32_t), M_IPFW,
2528	    M_WAITOK | M_ZERO);
2529	ipv6_parent_del = malloc(new * sizeof(uint32_t), M_IPFW,
2530	    M_WAITOK | M_ZERO);
2531#endif
2532	for (bucket = 0; bucket < new; bucket++) {
2533		DYN_BUCKET_LOCK_INIT(bucket_lock, bucket);
2534		CK_SLIST_INIT(&ipv4[bucket]);
2535		CK_SLIST_INIT(&ipv4_parent[bucket]);
2536#ifdef INET6
2537		CK_SLIST_INIT(&ipv6[bucket]);
2538		CK_SLIST_INIT(&ipv6_parent[bucket]);
2539#endif
2540	}
2541
2542#define DYN_RELINK_STATES(s, hval, i, head, ohead)	do {		\
2543	while ((s = CK_SLIST_FIRST(&V_dyn_ ## ohead[i])) != NULL) {	\
2544		CK_SLIST_REMOVE_HEAD(&V_dyn_ ## ohead[i], entry);	\
2545		CK_SLIST_INSERT_HEAD(&head[DYN_BUCKET(s->hval, new)],	\
2546		    s, entry);						\
2547	}								\
2548} while (0)
	/*
	 * Prevent rules from being changed from userland.
	 */
2552	IPFW_UH_WLOCK(chain);
	/*
	 * Hold traffic processing until we finish the resize to
	 * prevent access to the states lists.
	 */
2557	IPFW_WLOCK(chain);
2558	/* Re-link all dynamic states */
2559	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
2560		DYN_RELINK_STATES(s4, data->hashval, bucket, ipv4, ipv4);
2561		DYN_RELINK_STATES(s4, limit->hashval, bucket, ipv4_parent,
2562		    ipv4_parent);
2563#ifdef INET6
2564		DYN_RELINK_STATES(s6, data->hashval, bucket, ipv6, ipv6);
2565		DYN_RELINK_STATES(s6, limit->hashval, bucket, ipv6_parent,
2566		    ipv6_parent);
2567#endif
2568	}
2569
2570#define	DYN_SWAP_PTR(old, new, tmp)	do {		\
2571	tmp = old;					\
2572	old = new;					\
2573	new = tmp;					\
2574} while (0)
2575	/* Swap pointers */
2576	DYN_SWAP_PTR(V_dyn_bucket_lock, bucket_lock, tmp);
2577	DYN_SWAP_PTR(V_dyn_ipv4, ipv4, tmp);
2578	DYN_SWAP_PTR(V_dyn_ipv4_parent, ipv4_parent, tmp);
2579	DYN_SWAP_PTR(V_dyn_ipv4_add, ipv4_add, tmp);
2580	DYN_SWAP_PTR(V_dyn_ipv4_parent_add, ipv4_parent_add, tmp);
2581	DYN_SWAP_PTR(V_dyn_ipv4_del, ipv4_del, tmp);
2582	DYN_SWAP_PTR(V_dyn_ipv4_parent_del, ipv4_parent_del, tmp);
2583
2584#ifdef INET6
2585	DYN_SWAP_PTR(V_dyn_ipv6, ipv6, tmp);
2586	DYN_SWAP_PTR(V_dyn_ipv6_parent, ipv6_parent, tmp);
2587	DYN_SWAP_PTR(V_dyn_ipv6_add, ipv6_add, tmp);
2588	DYN_SWAP_PTR(V_dyn_ipv6_parent_add, ipv6_parent_add, tmp);
2589	DYN_SWAP_PTR(V_dyn_ipv6_del, ipv6_del, tmp);
2590	DYN_SWAP_PTR(V_dyn_ipv6_parent_del, ipv6_parent_del, tmp);
2591#endif
2592	bucket = V_curr_dyn_buckets;
2593	V_curr_dyn_buckets = new;
2594
2595	IPFW_WUNLOCK(chain);
2596	IPFW_UH_WUNLOCK(chain);
2597
2598	/* Release old resources */
2599	while (bucket-- != 0)
2600		DYN_BUCKET_LOCK_DESTROY(bucket_lock, bucket);
2601	free(bucket_lock, M_IPFW);
2602	free(ipv4, M_IPFW);
2603	free(ipv4_parent, M_IPFW);
2604	free(ipv4_add, M_IPFW);
2605	free(ipv4_parent_add, M_IPFW);
2606	free(ipv4_del, M_IPFW);
2607	free(ipv4_parent_del, M_IPFW);
2608#ifdef INET6
2609	free(ipv6, M_IPFW);
2610	free(ipv6_parent, M_IPFW);
2611	free(ipv6_add, M_IPFW);
2612	free(ipv6_parent_add, M_IPFW);
2613	free(ipv6_del, M_IPFW);
2614	free(ipv6_parent_del, M_IPFW);
2615#endif
2616}
2617
/*
 * This function is used to perform various maintenance tasks
 * on the dynamic hash lists. Currently it is called every second.
 */
2622static void
2623dyn_tick(void *vnetx)
2624{
2625	uint32_t buckets;
2626
2627	CURVNET_SET((struct vnet *)vnetx);
2628	/*
2629	 * First free states unlinked in previous passes.
2630	 */
2631	dyn_free_states(&V_layer3_chain);
	/*
	 * Now unlink other expired states.
	 * We use IPFW_UH_WLOCK to avoid concurrent calls of
	 * dyn_expire_states(). It is the only function that deletes
	 * state entries from the states lists.
	 */
2638	IPFW_UH_WLOCK(&V_layer3_chain);
2639	dyn_expire_states(&V_layer3_chain, NULL);
2640	IPFW_UH_WUNLOCK(&V_layer3_chain);
2641	/*
2642	 * Send keepalives if they are enabled and the time has come.
2643	 */
2644	if (V_dyn_keepalive != 0 &&
2645	    V_dyn_keepalive_last + V_dyn_keepalive_period <= time_uptime) {
2646		V_dyn_keepalive_last = time_uptime;
2647		dyn_send_keepalive_ipv4(&V_layer3_chain);
2648#ifdef INET6
2649		dyn_send_keepalive_ipv6(&V_layer3_chain);
2650#endif
2651	}
	/*
	 * Check if we need to resize the hash:
	 * if the current number of states exceeds the number of buckets,
	 * and dyn_buckets_max permits growing the number of buckets,
	 * then do it. Grow the hash size to the minimum power of 2 which
	 * is bigger than the current states count.
	 */
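	/*
	 * Worked example (illustrative): with V_dyn_count == 3000 and
	 * 1024 buckets, 1024 < 3000 / 2 holds, so the table grows to
	 * 1 << fls(3000) == 4096 buckets, subject to the dyn_buckets_max
	 * cap.
	 */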
2659	if (V_curr_dyn_buckets < V_dyn_buckets_max &&
2660	    (V_curr_dyn_buckets < V_dyn_count / 2 || (
2661	    V_curr_dyn_buckets < V_dyn_count && V_curr_max_length > 8))) {
2662		buckets = 1 << fls(V_dyn_count);
2663		if (buckets > V_dyn_buckets_max)
2664			buckets = V_dyn_buckets_max;
2665		dyn_grow_hashtable(&V_layer3_chain, buckets);
2666	}
2667
2668	callout_reset_on(&V_dyn_timeout, hz, dyn_tick, vnetx, 0);
2669	CURVNET_RESTORE();
2670}
2671
2672void
2673ipfw_expire_dyn_states(struct ip_fw_chain *chain, ipfw_range_tlv *rt)
2674{
	/*
	 * Do not perform any checks if we currently have no dynamic
	 * states.
	 */
2678	if (V_dyn_count == 0)
2679		return;
2680
2681	IPFW_UH_WLOCK_ASSERT(chain);
2682	dyn_expire_states(chain, rt);
2683}
2684
/*
 * Returns the size of dynamic states in the legacy format.
 */
2688int
2689ipfw_dyn_len(void)
2690{
2691
2692	return ((V_dyn_count + V_dyn_parent_count) * sizeof(ipfw_dyn_rule));
2693}
2694
2695/*
2696 * Returns number of dynamic states.
2697 * Used by dump format v1 (current).
2698 */
2699uint32_t
2700ipfw_dyn_get_count(void)
2701{
2702
2703	return (V_dyn_count + V_dyn_parent_count);
2704}
2705
2706/*
2707 * Check if rule contains at least one dynamic opcode.
2708 *
2709 * Returns 1 if such opcode is found, 0 otherwise.
2710 */
2711int
2712ipfw_is_dyn_rule(struct ip_fw *rule)
2713{
2714	int cmdlen, l;
2715	ipfw_insn *cmd;
2716
2717	l = rule->cmd_len;
2718	cmd = rule->cmd;
2719	cmdlen = 0;
2720	for ( ;	l > 0 ; l -= cmdlen, cmd += cmdlen) {
2721		cmdlen = F_LEN(cmd);
2722
2723		switch (cmd->opcode) {
2724		case O_LIMIT:
2725		case O_KEEP_STATE:
2726		case O_PROBE_STATE:
2727		case O_CHECK_STATE:
2728			return (1);
2729		}
2730	}
2731
2732	return (0);
2733}
2734
2735static void
2736dyn_export_parent(const struct dyn_parent *p, uint16_t kidx,
2737    ipfw_dyn_rule *dst)
2738{
2739
2740	dst->dyn_type = O_LIMIT_PARENT;
2741	dst->kidx = kidx;
2742	dst->count = (uint16_t)DPARENT_COUNT(p);
2743	dst->expire = TIME_LEQ(p->expire, time_uptime) ?  0:
2744	    p->expire - time_uptime;
2745
2746	/* 'rule' is used to pass up the rule number and set */
2747	memcpy(&dst->rule, &p->rulenum, sizeof(p->rulenum));
2748	/* store set number into high word of dst->rule pointer. */
2749	memcpy((char *)&dst->rule + sizeof(p->rulenum), &p->set,
2750	    sizeof(p->set));
2751
2752	/* unused fields */
2753	dst->pcnt = 0;
2754	dst->bcnt = 0;
2755	dst->parent = NULL;
2756	dst->state = 0;
2757	dst->ack_fwd = 0;
2758	dst->ack_rev = 0;
2759	dst->bucket = p->hashval;
	/*
	 * The legacy userland code will interpret a NULL here as a
	 * marker for the last dynamic rule, so use a non-NULL
	 * placeholder; ipfw_get_dynamic() NULL-terminates the final
	 * entry.
	 */
2764	dst->next = (ipfw_dyn_rule *)1;
2765}
2766
2767static void
2768dyn_export_data(const struct dyn_data *data, uint16_t kidx, uint8_t type,
2769    ipfw_dyn_rule *dst)
2770{
2771
2772	dst->dyn_type = type;
2773	dst->kidx = kidx;
2774	dst->pcnt = data->pcnt_fwd + data->pcnt_rev;
2775	dst->bcnt = data->bcnt_fwd + data->bcnt_rev;
2776	dst->expire = TIME_LEQ(data->expire, time_uptime) ?  0:
2777	    data->expire - time_uptime;
2778
2779	/* 'rule' is used to pass up the rule number and set */
2780	memcpy(&dst->rule, &data->rulenum, sizeof(data->rulenum));
2781	/* store set number into high word of dst->rule pointer. */
2782	memcpy((char *)&dst->rule + sizeof(data->rulenum), &data->set,
2783	    sizeof(data->set));
2784
2785	/* unused fields */
2786	dst->parent = NULL;
2787	dst->state = data->state;
2788	dst->ack_fwd = data->ack_fwd;
2789	dst->ack_rev = data->ack_rev;
2790	dst->count = 0;
2791	dst->bucket = data->hashval;
	/*
	 * The legacy userland code will interpret a NULL here as a
	 * marker for the last dynamic rule, so use a non-NULL
	 * placeholder; ipfw_get_dynamic() NULL-terminates the final
	 * entry.
	 */
2796	dst->next = (ipfw_dyn_rule *)1;
2797}
2798
2799static void
2800dyn_export_ipv4_state(const struct dyn_ipv4_state *s, ipfw_dyn_rule *dst)
2801{
2802
2803	switch (s->type) {
2804	case O_LIMIT_PARENT:
2805		dyn_export_parent(s->limit, s->kidx, dst);
2806		break;
2807	default:
2808		dyn_export_data(s->data, s->kidx, s->type, dst);
2809	}
2810
2811	dst->id.dst_ip = s->dst;
2812	dst->id.src_ip = s->src;
2813	dst->id.dst_port = s->dport;
2814	dst->id.src_port = s->sport;
2815	dst->id.fib = s->data->fibnum;
2816	dst->id.proto = s->proto;
2817	dst->id._flags = 0;
2818	dst->id.addr_type = 4;
2819
2820	memset(&dst->id.dst_ip6, 0, sizeof(dst->id.dst_ip6));
2821	memset(&dst->id.src_ip6, 0, sizeof(dst->id.src_ip6));
2822	dst->id.flow_id6 = dst->id.extra = 0;
2823}
2824
2825#ifdef INET6
2826static void
2827dyn_export_ipv6_state(const struct dyn_ipv6_state *s, ipfw_dyn_rule *dst)
2828{
2829
2830	switch (s->type) {
2831	case O_LIMIT_PARENT:
2832		dyn_export_parent(s->limit, s->kidx, dst);
2833		break;
2834	default:
2835		dyn_export_data(s->data, s->kidx, s->type, dst);
2836	}
2837
2838	dst->id.src_ip6 = s->src;
2839	dst->id.dst_ip6 = s->dst;
2840	dst->id.dst_port = s->dport;
2841	dst->id.src_port = s->sport;
2842	dst->id.fib = s->data->fibnum;
2843	dst->id.proto = s->proto;
2844	dst->id._flags = 0;
2845	dst->id.addr_type = 6;
2846
2847	dst->id.dst_ip = dst->id.src_ip = 0;
2848	dst->id.flow_id6 = dst->id.extra = 0;
2849}
2850#endif /* INET6 */
2851
2852/*
2853 * Fills the buffer given by @sd with dynamic states.
2854 * Used by dump format v1 (current).
2855 *
2856 * Returns 0 on success.
2857 */
2858int
2859ipfw_dump_states(struct ip_fw_chain *chain, struct sockopt_data *sd)
2860{
2861#ifdef INET6
2862	struct dyn_ipv6_state *s6;
2863#endif
2864	struct dyn_ipv4_state *s4;
2865	ipfw_obj_dyntlv *dst, *last;
2866	ipfw_obj_ctlv *ctlv;
2867	uint32_t bucket;
2868
2869	if (V_dyn_count == 0)
2870		return (0);
2871
	/*
	 * IPFW_UH_RLOCK guarantees that neither another userland
	 * request nor the callout thread will delete entries from the
	 * states lists.
	 */
2877	IPFW_UH_RLOCK_ASSERT(chain);
2878
2879	ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv));
2880	if (ctlv == NULL)
2881		return (ENOMEM);
2882	ctlv->head.type = IPFW_TLV_DYNSTATE_LIST;
2883	ctlv->objsize = sizeof(ipfw_obj_dyntlv);
2884	last = NULL;
2885
2886#define	DYN_EXPORT_STATES(s, af, h, b)				\
2887	CK_SLIST_FOREACH(s, &V_dyn_ ## h[b], entry) {			\
2888		dst = (ipfw_obj_dyntlv *)ipfw_get_sopt_space(sd,	\
2889		    sizeof(ipfw_obj_dyntlv));				\
2890		if (dst == NULL)					\
2891			return (ENOMEM);				\
2892		dyn_export_ ## af ## _state(s, &dst->state);		\
2893		dst->head.length = sizeof(ipfw_obj_dyntlv);		\
2894		dst->head.type = IPFW_TLV_DYN_ENT;			\
2895		last = dst;						\
2896	}
2897
2898	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
2899		DYN_EXPORT_STATES(s4, ipv4, ipv4_parent, bucket);
2900		DYN_EXPORT_STATES(s4, ipv4, ipv4, bucket);
2901#ifdef INET6
2902		DYN_EXPORT_STATES(s6, ipv6, ipv6_parent, bucket);
2903		DYN_EXPORT_STATES(s6, ipv6, ipv6, bucket);
2904#endif /* INET6 */
2905	}
2906
2907	/* mark last dynamic rule */
2908	if (last != NULL)
2909		last->head.flags = IPFW_DF_LAST; /* XXX: unused */
2910	return (0);
2911#undef DYN_EXPORT_STATES
2912}
2913
2914/*
2915 * Fill given buffer with dynamic states (legacy format).
2916 * IPFW_UH_RLOCK has to be held while calling.
2917 */
2918void
2919ipfw_get_dynamic(struct ip_fw_chain *chain, char **pbp, const char *ep)
2920{
2921#ifdef INET6
2922	struct dyn_ipv6_state *s6;
2923#endif
2924	struct dyn_ipv4_state *s4;
2925	ipfw_dyn_rule *p, *last = NULL;
2926	char *bp;
2927	uint32_t bucket;
2928
2929	if (V_dyn_count == 0)
2930		return;
2931	bp = *pbp;
2932
2933	IPFW_UH_RLOCK_ASSERT(chain);
2934
2935#define	DYN_EXPORT_STATES(s, af, head, b)				\
2936	CK_SLIST_FOREACH(s, &V_dyn_ ## head[b], entry) {		\
2937		if (bp + sizeof(*p) > ep)				\
2938			break;						\
2939		p = (ipfw_dyn_rule *)bp;				\
2940		dyn_export_ ## af ## _state(s, p);			\
2941		last = p;						\
2942		bp += sizeof(*p);					\
2943	}
2944
2945	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
2946		DYN_EXPORT_STATES(s4, ipv4, ipv4_parent, bucket);
2947		DYN_EXPORT_STATES(s4, ipv4, ipv4, bucket);
2948#ifdef INET6
2949		DYN_EXPORT_STATES(s6, ipv6, ipv6_parent, bucket);
2950		DYN_EXPORT_STATES(s6, ipv6, ipv6, bucket);
2951#endif /* INET6 */
2952	}
2953
2954	if (last != NULL) /* mark last dynamic rule */
2955		last->next = NULL;
2956	*pbp = bp;
2957#undef DYN_EXPORT_STATES
2958}
2959
2960void
2961ipfw_dyn_init(struct ip_fw_chain *chain)
2962{
2963
2964#ifdef IPFIREWALL_JENKINSHASH
2965	V_dyn_hashseed = arc4random();
2966#endif
2967	V_dyn_max = 16384;		/* max # of states */
2968	V_dyn_parent_max = 4096;	/* max # of parent states */
2969	V_dyn_buckets_max = 8192;	/* must be power of 2 */
2970
2971	V_dyn_ack_lifetime = 300;
2972	V_dyn_syn_lifetime = 20;
2973	V_dyn_fin_lifetime = 1;
2974	V_dyn_rst_lifetime = 1;
2975	V_dyn_udp_lifetime = 10;
2976	V_dyn_short_lifetime = 5;
2977
2978	V_dyn_keepalive_interval = 20;
2979	V_dyn_keepalive_period = 5;
2980	V_dyn_keepalive = 1;		/* send keepalives */
2981	V_dyn_keepalive_last = time_uptime;
2982
2983	V_dyn_data_zone = uma_zcreate("IPFW dynamic states data",
2984	    sizeof(struct dyn_data), NULL, NULL, NULL, NULL,
2985	    UMA_ALIGN_PTR, 0);
2986	uma_zone_set_max(V_dyn_data_zone, V_dyn_max);
2987
2988	V_dyn_parent_zone = uma_zcreate("IPFW parent dynamic states",
2989	    sizeof(struct dyn_parent), NULL, NULL, NULL, NULL,
2990	    UMA_ALIGN_PTR, 0);
2991	uma_zone_set_max(V_dyn_parent_zone, V_dyn_parent_max);
2992
2993	SLIST_INIT(&V_dyn_expired_ipv4);
2994	V_dyn_ipv4 = NULL;
2995	V_dyn_ipv4_parent = NULL;
2996	V_dyn_ipv4_zone = uma_zcreate("IPFW IPv4 dynamic states",
2997	    sizeof(struct dyn_ipv4_state), NULL, NULL, NULL, NULL,
2998	    UMA_ALIGN_PTR, 0);
2999
3000#ifdef INET6
3001	SLIST_INIT(&V_dyn_expired_ipv6);
3002	V_dyn_ipv6 = NULL;
3003	V_dyn_ipv6_parent = NULL;
3004	V_dyn_ipv6_zone = uma_zcreate("IPFW IPv6 dynamic states",
3005	    sizeof(struct dyn_ipv6_state), NULL, NULL, NULL, NULL,
3006	    UMA_ALIGN_PTR, 0);
3007#endif
3008
3009	/* Initialize buckets. */
3010	V_curr_dyn_buckets = 0;
3011	V_dyn_bucket_lock = NULL;
3012	dyn_grow_hashtable(chain, 256);
3013
3014	if (IS_DEFAULT_VNET(curvnet))
3015		dyn_hp_cache = malloc(mp_ncpus * sizeof(void *), M_IPFW,
3016		    M_WAITOK | M_ZERO);
3017
3018	DYN_EXPIRED_LOCK_INIT();
3019	callout_init(&V_dyn_timeout, 1);
3020	callout_reset(&V_dyn_timeout, hz, dyn_tick, curvnet);
3021	IPFW_ADD_OBJ_REWRITER(IS_DEFAULT_VNET(curvnet), dyn_opcodes);
3022}
3023
3024void
3025ipfw_dyn_uninit(int pass)
3026{
3027#ifdef INET6
3028	struct dyn_ipv6_state *s6;
3029#endif
3030	struct dyn_ipv4_state *s4;
3031	int bucket;
3032
3033	if (pass == 0) {
3034		callout_drain(&V_dyn_timeout);
3035		return;
3036	}
3037	IPFW_DEL_OBJ_REWRITER(IS_DEFAULT_VNET(curvnet), dyn_opcodes);
3038	DYN_EXPIRED_LOCK_DESTROY();
3039
3040#define	DYN_FREE_STATES_FORCED(CK, s, af, name, en)	do {		\
3041	while ((s = CK ## SLIST_FIRST(&V_dyn_ ## name)) != NULL) {	\
3042		CK ## SLIST_REMOVE_HEAD(&V_dyn_ ## name, en);	\
3043		if (s->type == O_LIMIT_PARENT)				\
3044			uma_zfree(V_dyn_parent_zone, s->limit);		\
3045		else							\
3046			uma_zfree(V_dyn_data_zone, s->data);		\
3047		uma_zfree(V_dyn_ ## af ## _zone, s);			\
3048	}								\
3049} while (0)
3050	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
3051		DYN_BUCKET_LOCK_DESTROY(V_dyn_bucket_lock, bucket);
3052
3053		DYN_FREE_STATES_FORCED(CK_, s4, ipv4, ipv4[bucket], entry);
3054		DYN_FREE_STATES_FORCED(CK_, s4, ipv4, ipv4_parent[bucket],
3055		    entry);
3056#ifdef INET6
3057		DYN_FREE_STATES_FORCED(CK_, s6, ipv6, ipv6[bucket], entry);
3058		DYN_FREE_STATES_FORCED(CK_, s6, ipv6, ipv6_parent[bucket],
3059		    entry);
3060#endif /* INET6 */
3061	}
3062	DYN_FREE_STATES_FORCED(, s4, ipv4, expired_ipv4, expired);
3063#ifdef INET6
3064	DYN_FREE_STATES_FORCED(, s6, ipv6, expired_ipv6, expired);
3065#endif
3066#undef DYN_FREE_STATES_FORCED
3067
3068	uma_zdestroy(V_dyn_ipv4_zone);
3069	uma_zdestroy(V_dyn_data_zone);
3070	uma_zdestroy(V_dyn_parent_zone);
3071#ifdef INET6
3072	uma_zdestroy(V_dyn_ipv6_zone);
3073	free(V_dyn_ipv6, M_IPFW);
3074	free(V_dyn_ipv6_parent, M_IPFW);
3075	free(V_dyn_ipv6_add, M_IPFW);
3076	free(V_dyn_ipv6_parent_add, M_IPFW);
3077	free(V_dyn_ipv6_del, M_IPFW);
3078	free(V_dyn_ipv6_parent_del, M_IPFW);
3079#endif
3080	free(V_dyn_bucket_lock, M_IPFW);
3081	free(V_dyn_ipv4, M_IPFW);
3082	free(V_dyn_ipv4_parent, M_IPFW);
3083	free(V_dyn_ipv4_add, M_IPFW);
3084	free(V_dyn_ipv4_parent_add, M_IPFW);
3085	free(V_dyn_ipv4_del, M_IPFW);
3086	free(V_dyn_ipv4_parent_del, M_IPFW);
3087	if (IS_DEFAULT_VNET(curvnet))
3088		free(dyn_hp_cache, M_IPFW);
3089}
3090
3091
3092