ip_fw2.c revision 142906
1/*-
2 * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 *
25 * $FreeBSD: head/sys/netinet/ip_fw2.c 142906 2005-03-01 12:01:17Z glebius $
26 */
27
28#define        DEB(x)
29#define        DDB(x) x
30
31/*
32 * Implement IP packet firewall (new version)
33 */
34
35#if !defined(KLD_MODULE)
36#include "opt_ipfw.h"
37#include "opt_ipdn.h"
38#include "opt_inet.h"
39#include "opt_ipsec.h"
40#ifndef INET
41#error IPFIREWALL requires INET.
42#endif /* INET */
43#endif
44
45#define IPFW2	1
46#if IPFW2
47#include <sys/param.h>
48#include <sys/systm.h>
49#include <sys/condvar.h>
50#include <sys/malloc.h>
51#include <sys/mbuf.h>
52#include <sys/kernel.h>
53#include <sys/jail.h>
54#include <sys/module.h>
55#include <sys/proc.h>
56#include <sys/socket.h>
57#include <sys/socketvar.h>
58#include <sys/sysctl.h>
59#include <sys/syslog.h>
60#include <sys/ucred.h>
61#include <net/if.h>
62#include <net/radix.h>
63#include <net/route.h>
64#include <netinet/in.h>
65#include <netinet/in_systm.h>
66#include <netinet/in_var.h>
67#include <netinet/in_pcb.h>
68#include <netinet/ip.h>
69#include <netinet/ip_var.h>
70#include <netinet/ip_icmp.h>
71#include <netinet/ip_fw.h>
72#include <netinet/ip_divert.h>
73#include <netinet/ip_dummynet.h>
74#include <netinet/tcp.h>
75#include <netinet/tcp_timer.h>
76#include <netinet/tcp_var.h>
77#include <netinet/tcpip.h>
78#include <netinet/udp.h>
79#include <netinet/udp_var.h>
80
81#include <netgraph/ng_ipfw.h>
82
83#include <altq/if_altq.h>
84
85#ifdef IPSEC
86#include <netinet6/ipsec.h>
87#endif
88
89#include <netinet/if_ether.h> /* XXX for ETHERTYPE_IP */
90
91#include <machine/in_cksum.h>	/* XXX for in_cksum */
92
93/*
94 * set_disable contains one bit per set value (0..31).
95 * If the bit is set, all rules with the corresponding set
96 * are disabled. Set RESVD_SET(31) is reserved for the default rule
97 * and rules that are not deleted by the flush command,
98 * and CANNOT be disabled.
99 * Rules in set RESVD_SET can only be deleted explicitly.
100 */
101static u_int32_t set_disable;
102
103static int fw_verbose;
104static int verbose_limit;
105
106static struct callout ipfw_timeout;
107static uma_zone_t ipfw_dyn_rule_zone;
108#define	IPFW_DEFAULT_RULE	65535
109
110/*
111 * Data structure to cache our ucred related
112 * information. This structure only gets used if
113 * the user specified UID/GID based constraints in
114 * a firewall rule.
115 */
116struct ip_fw_ugid {
117	gid_t		fw_groups[NGROUPS];
118	int		fw_ngroups;
119	uid_t		fw_uid;
120	int		fw_prid;
121};
122
123struct ip_fw_chain {
124	struct ip_fw	*rules;		/* list of rules */
125	struct ip_fw	*reap;		/* list of rules to reap */
126	struct mtx	mtx;		/* lock guarding rule list */
127	int		busy_count;	/* busy count for rw locks */
128	int		want_write;
129	struct cv	cv;
130};
131#define	IPFW_LOCK_INIT(_chain) \
132	mtx_init(&(_chain)->mtx, "IPFW static rules", NULL, \
133		MTX_DEF | MTX_RECURSE)
134#define	IPFW_LOCK_DESTROY(_chain)	mtx_destroy(&(_chain)->mtx)
135#define	IPFW_WLOCK_ASSERT(_chain)	do {				\
136	mtx_assert(&(_chain)->mtx, MA_OWNED);				\
137	NET_ASSERT_GIANT();						\
138} while (0)
139
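/*
 * Descriptive note (added): the routines below implement a small
 * hand-rolled reader/writer lock for the rule chain.  Readers bump
 * busy_count under the mutex and then drop the mutex while they scan;
 * writers raise want_write and sleep on the condvar until the reader
 * count drains to zero, keeping the mutex held while they update.
 */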
140static __inline void
141IPFW_RLOCK(struct ip_fw_chain *chain)
142{
143	mtx_lock(&chain->mtx);
144	chain->busy_count++;
145	mtx_unlock(&chain->mtx);
146}
147
148static __inline void
149IPFW_RUNLOCK(struct ip_fw_chain *chain)
150{
151	mtx_lock(&chain->mtx);
152	chain->busy_count--;
153	if (chain->busy_count == 0 && chain->want_write)
154		cv_signal(&chain->cv);
155	mtx_unlock(&chain->mtx);
156}
157
158static __inline void
159IPFW_WLOCK(struct ip_fw_chain *chain)
160{
161	mtx_lock(&chain->mtx);
162	chain->want_write++;
163	while (chain->busy_count > 0)
164		cv_wait(&chain->cv, &chain->mtx);
165}
166
167static __inline void
168IPFW_WUNLOCK(struct ip_fw_chain *chain)
169{
170	chain->want_write--;
171	cv_signal(&chain->cv);
172	mtx_unlock(&chain->mtx);
173}
174
175/*
176 * list of rules for layer 3
177 */
178static struct ip_fw_chain layer3_chain;
179
180MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chains");
181MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables");
182
183struct table_entry {
184	struct radix_node	rn[2];
185	struct sockaddr_in	addr, mask;
186	u_int32_t		value;
187};
188
189#define	IPFW_TABLES_MAX		128
190static struct {
191	struct radix_node_head	*rnh;
192	int			modified;
193} ipfw_tables[IPFW_TABLES_MAX];
194
195static int fw_debug = 1;
196static int autoinc_step = 100; /* bounded to 1..1000 in add_rule() */
197
198#ifdef SYSCTL_NODE
199SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
200SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, enable,
201    CTLFLAG_RW | CTLFLAG_SECURE3,
202    &fw_enable, 0, "Enable ipfw");
203SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_RW,
204    &autoinc_step, 0, "Rule number auto-increment step");
205SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
206    CTLFLAG_RW | CTLFLAG_SECURE3,
207    &fw_one_pass, 0,
208    "Only do a single pass through ipfw when using dummynet(4)");
209SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW,
210    &fw_debug, 0, "Enable printing of debug ip_fw statements");
211SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose,
212    CTLFLAG_RW | CTLFLAG_SECURE3,
213    &fw_verbose, 0, "Log matches to ipfw rules");
214SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW,
215    &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged");
216
217/*
218 * Description of dynamic rules.
219 *
220 * Dynamic rules are stored in lists accessed through a hash table
221 * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can
222 * be modified through the sysctl variable dyn_buckets which is
223 * updated when the table becomes empty.
224 *
225 * XXX currently there is only one list, ipfw_dyn.
226 *
227 * When a packet is received, its address fields are first masked
228 * with the mask defined for the rule, then hashed, then matched
229 * against the entries in the corresponding list.
230 * Dynamic rules can be used for different purposes:
231 *  + stateful rules;
232 *  + enforcing limits on the number of sessions;
233 *  + in-kernel NAT (not implemented yet)
234 *
235 * The lifetime of dynamic rules is regulated by dyn_*_lifetime,
236 * measured in seconds and depending on the flags.
237 *
238 * The total number of dynamic rules is stored in dyn_count.
239 * The max number of dynamic rules is dyn_max. When we reach
240 * the maximum number of rules we do not create any more. This is
241 * done to avoid consuming too much memory, and also too much
242 * time when searching on each packet (ideally, we should try instead
243 * to put a limit on the length of the list on each bucket...).
244 *
245 * Each dynamic rule holds a pointer to the parent ipfw rule so
246 * we know what action to perform. Dynamic rules are removed when
247 * the parent rule is deleted. XXX we should make them survive.
248 *
249 * There are some limitations with dynamic rules -- we do not
250 * obey the 'randomized match', and we do not do multiple
251 * passes through the firewall. XXX check the latter!!!
252 */
253static ipfw_dyn_rule **ipfw_dyn_v = NULL;
254static u_int32_t dyn_buckets = 256; /* must be power of 2 */
255static u_int32_t curr_dyn_buckets = 256; /* must be power of 2 */
256
257static struct mtx ipfw_dyn_mtx;		/* mutex guarding dynamic rules */
258#define	IPFW_DYN_LOCK_INIT() \
259	mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF)
260#define	IPFW_DYN_LOCK_DESTROY()	mtx_destroy(&ipfw_dyn_mtx)
261#define	IPFW_DYN_LOCK()		mtx_lock(&ipfw_dyn_mtx)
262#define	IPFW_DYN_UNLOCK()	mtx_unlock(&ipfw_dyn_mtx)
263#define	IPFW_DYN_LOCK_ASSERT()	mtx_assert(&ipfw_dyn_mtx, MA_OWNED)
264
265/*
266 * Timeouts for various events in handling dynamic rules.
267 */
268static u_int32_t dyn_ack_lifetime = 300;
269static u_int32_t dyn_syn_lifetime = 20;
270static u_int32_t dyn_fin_lifetime = 1;
271static u_int32_t dyn_rst_lifetime = 1;
272static u_int32_t dyn_udp_lifetime = 10;
273static u_int32_t dyn_short_lifetime = 5;
274
275/*
276 * Keepalives are sent if dyn_keepalive is set. They are sent every
277 * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
278 * seconds of lifetime of a rule.
279 * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower
280 * than dyn_keepalive_period.
281 */
282
283static u_int32_t dyn_keepalive_interval = 20;
284static u_int32_t dyn_keepalive_period = 5;
285static u_int32_t dyn_keepalive = 1;	/* do send keepalives */
286
287static u_int32_t static_count;	/* # of static rules */
288static u_int32_t static_len;	/* size in bytes of static rules */
289static u_int32_t dyn_count;		/* # of dynamic rules */
290static u_int32_t dyn_max = 4096;	/* max # of dynamic rules */
291
292SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW,
293    &dyn_buckets, 0, "Number of dyn. buckets");
294SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_RD,
295    &curr_dyn_buckets, 0, "Current Number of dyn. buckets");
296SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_RD,
297    &dyn_count, 0, "Number of dyn. rules");
298SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_RW,
299    &dyn_max, 0, "Max number of dyn. rules");
300SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD,
301    &static_count, 0, "Number of static rules");
302SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW,
303    &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
304SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW,
305    &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
306SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW,
307    &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
308SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW,
309    &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
310SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW,
311    &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
312SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW,
313    &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations");
314SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW,
315    &dyn_keepalive, 0, "Enable keepalives for dyn. rules");
316
317#endif /* SYSCTL_NODE */
318
319
320/*
321 * This macro maps an ip pointer into a layer3 header pointer of type T
322 */
323#define	L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
324
325static __inline int
326icmptype_match(struct ip *ip, ipfw_insn_u32 *cmd)
327{
328	int type = L3HDR(struct icmp,ip)->icmp_type;
329
330	return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<<type)) );
331}
332
333#define TT	( (1 << ICMP_ECHO) | (1 << ICMP_ROUTERSOLICIT) | \
334    (1 << ICMP_TSTAMP) | (1 << ICMP_IREQ) | (1 << ICMP_MASKREQ) )
335
336static int
337is_icmp_query(struct ip *ip)
338{
339	int type = L3HDR(struct icmp, ip)->icmp_type;
340	return (type <= ICMP_MAXTYPE && (TT & (1<<type)) );
341}
342#undef TT
343
344/*
345 * The following checks use two arrays of 8 or 16 bits to store the
346 * bits that we want set or clear, respectively. They are in the
347 * low and high half of cmd->arg1 or cmd->d[0].
348 *
349 * We scan options and store the bits we find set. We succeed if
350 *
351 *	(want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
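 *
 *	For example (illustrative values only), want_set = IP_FW_IPOPT_LSRR
 *	and want_clear = IP_FW_IPOPT_RR match a packet that carries a loose
 *	source route option but no record route option.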
352 *
353 * The code is sometimes optimized not to store additional variables.
354 */
355
356static int
357flags_match(ipfw_insn *cmd, u_int8_t bits)
358{
359	u_char want_clear;
360	bits = ~bits;
361
362	if ( ((cmd->arg1 & 0xff) & bits) != 0)
363		return 0; /* some bits we want set were clear */
364	want_clear = (cmd->arg1 >> 8) & 0xff;
365	if ( (want_clear & bits) != want_clear)
366		return 0; /* some bits we want clear were set */
367	return 1;
368}
369
370static int
371ipopts_match(struct ip *ip, ipfw_insn *cmd)
372{
373	int optlen, bits = 0;
374	u_char *cp = (u_char *)(ip + 1);
375	int x = (ip->ip_hl << 2) - sizeof (struct ip);
376
377	for (; x > 0; x -= optlen, cp += optlen) {
378		int opt = cp[IPOPT_OPTVAL];
379
380		if (opt == IPOPT_EOL)
381			break;
382		if (opt == IPOPT_NOP)
383			optlen = 1;
384		else {
385			optlen = cp[IPOPT_OLEN];
386			if (optlen <= 0 || optlen > x)
387				return 0; /* invalid or truncated */
388		}
389		switch (opt) {
390
391		default:
392			break;
393
394		case IPOPT_LSRR:
395			bits |= IP_FW_IPOPT_LSRR;
396			break;
397
398		case IPOPT_SSRR:
399			bits |= IP_FW_IPOPT_SSRR;
400			break;
401
402		case IPOPT_RR:
403			bits |= IP_FW_IPOPT_RR;
404			break;
405
406		case IPOPT_TS:
407			bits |= IP_FW_IPOPT_TS;
408			break;
409		}
410	}
411	return (flags_match(cmd, bits));
412}
413
414static int
415tcpopts_match(struct ip *ip, ipfw_insn *cmd)
416{
417	int optlen, bits = 0;
418	struct tcphdr *tcp = L3HDR(struct tcphdr,ip);
419	u_char *cp = (u_char *)(tcp + 1);
420	int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
421
422	for (; x > 0; x -= optlen, cp += optlen) {
423		int opt = cp[0];
424		if (opt == TCPOPT_EOL)
425			break;
426		if (opt == TCPOPT_NOP)
427			optlen = 1;
428		else {
429			optlen = cp[1];
430			if (optlen <= 0)
431				break;
432		}
433
434		switch (opt) {
435
436		default:
437			break;
438
439		case TCPOPT_MAXSEG:
440			bits |= IP_FW_TCPOPT_MSS;
441			break;
442
443		case TCPOPT_WINDOW:
444			bits |= IP_FW_TCPOPT_WINDOW;
445			break;
446
447		case TCPOPT_SACK_PERMITTED:
448		case TCPOPT_SACK:
449			bits |= IP_FW_TCPOPT_SACK;
450			break;
451
452		case TCPOPT_TIMESTAMP:
453			bits |= IP_FW_TCPOPT_TS;
454			break;
455
456		}
457	}
458	return (flags_match(cmd, bits));
459}
460
461static int
462iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
463{
464	if (ifp == NULL)	/* no iface with this packet, match fails */
465		return 0;
466	/* Check by name or by IP address */
467	if (cmd->name[0] != '\0') { /* match by name */
468		/* Check name */
469		if (cmd->p.glob) {
470			if (fnmatch(cmd->name, ifp->if_xname, 0) == 0)
471				return(1);
472		} else {
473			if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
474				return(1);
475		}
476	} else {
477		struct ifaddr *ia;
478
479		/* XXX lock? */
480		TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) {
481			if (ia->ifa_addr == NULL)
482				continue;
483			if (ia->ifa_addr->sa_family != AF_INET)
484				continue;
485			if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
486			    (ia->ifa_addr))->sin_addr.s_addr)
487				return(1);	/* match */
488		}
489	}
490	return(0);	/* no match, fail ... */
491}
492
493/*
494 * The verify_path function checks if a route to the src exists and
495 * if it is reachable via ifp (when provided).
496 *
497 * The 'verrevpath' option checks that the interface that an IP packet
498 * arrives on is the same interface that traffic destined for the
499 * packet's source address would be routed out of.  The 'versrcreach'
500 * option just checks that the source address is reachable via any route
501 * (except default) in the routing table.  These two are a measure to block
502 * forged packets.  This is also commonly known as "anti-spoofing" or Unicast
503 * Reverse Path Forwarding (Unicast RPF) in Cisco-ese. The names of the
504 * knobs are purposely reminiscent of the Cisco IOS commands,
505 *
506 *   ip verify unicast reverse-path
507 *   ip verify unicast source reachable-via any
508 *
509 * which implement the same functionality. Note that the syntax is
510 * misleading: the check may be performed on all IP packets, whether unicast,
511 * multicast, or broadcast.
512 */
513static int
514verify_path(struct in_addr src, struct ifnet *ifp)
515{
516	struct route ro;
517	struct sockaddr_in *dst;
518
519	bzero(&ro, sizeof(ro));
520
521	dst = (struct sockaddr_in *)&(ro.ro_dst);
522	dst->sin_family = AF_INET;
523	dst->sin_len = sizeof(*dst);
524	dst->sin_addr = src;
525	rtalloc_ign(&ro, RTF_CLONING);
526
527	if (ro.ro_rt == NULL)
528		return 0;
529
530	/* if ifp is provided, check for equality with rtentry */
531	if (ifp != NULL && ro.ro_rt->rt_ifp != ifp) {
532		RTFREE(ro.ro_rt);
533		return 0;
534	}
535
536	/* if no ifp provided, check if rtentry is not default route */
537	if (ifp == NULL &&
538	     satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) {
539		RTFREE(ro.ro_rt);
540		return 0;
541	}
542
543	/* or if this is a blackhole/reject route */
544	if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
545		RTFREE(ro.ro_rt);
546		return 0;
547	}
548
549	/* found valid route */
550	RTFREE(ro.ro_rt);
551	return 1;
552}
553
554
555static u_int64_t norule_counter;	/* counter for ipfw_log(NULL...) */
556
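/*
 * Added note: SNPARGS(buf, len) expands to the buffer position and the
 * space remaining when appending at offset 'len' with snprintf(), e.g.
 * len += snprintf(SNPARGS(buf, len), ...); SNP(buf) covers the whole buffer.
 */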
557#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
558#define SNP(buf) buf, sizeof(buf)
559
560/*
561 * We enter here when we have a rule with O_LOG.
562 * XXX this function alone takes about 2Kbytes of code!
563 */
564static void
565ipfw_log(struct ip_fw *f, u_int hlen, struct ether_header *eh,
566	struct mbuf *m, struct ifnet *oif)
567{
568	char *action;
569	int limit_reached = 0;
570	char action2[40], proto[48], fragment[28];
571
572	fragment[0] = '\0';
573	proto[0] = '\0';
574
575	if (f == NULL) {	/* bogus pkt */
576		if (verbose_limit != 0 && norule_counter >= verbose_limit)
577			return;
578		norule_counter++;
579		if (norule_counter == verbose_limit)
580			limit_reached = verbose_limit;
581		action = "Refuse";
582	} else {	/* O_LOG is the first action, find the real one */
583		ipfw_insn *cmd = ACTION_PTR(f);
584		ipfw_insn_log *l = (ipfw_insn_log *)cmd;
585
586		if (l->max_log != 0 && l->log_left == 0)
587			return;
588		l->log_left--;
589		if (l->log_left == 0)
590			limit_reached = l->max_log;
591		cmd += F_LEN(cmd);	/* point to first action */
592		if (cmd->opcode == O_ALTQ) {
593			ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
594
595			snprintf(SNPARGS(action2, 0), "Altq %d",
596				altq->qid);
597			cmd += F_LEN(cmd);
598		}
599		if (cmd->opcode == O_PROB)
600			cmd += F_LEN(cmd);
601
602		action = action2;
603		switch (cmd->opcode) {
604		case O_DENY:
605			action = "Deny";
606			break;
607
608		case O_REJECT:
609			if (cmd->arg1==ICMP_REJECT_RST)
610				action = "Reset";
611			else if (cmd->arg1==ICMP_UNREACH_HOST)
612				action = "Reject";
613			else
614				snprintf(SNPARGS(action2, 0), "Unreach %d",
615					cmd->arg1);
616			break;
617
618		case O_ACCEPT:
619			action = "Accept";
620			break;
621		case O_COUNT:
622			action = "Count";
623			break;
624		case O_DIVERT:
625			snprintf(SNPARGS(action2, 0), "Divert %d",
626				cmd->arg1);
627			break;
628		case O_TEE:
629			snprintf(SNPARGS(action2, 0), "Tee %d",
630				cmd->arg1);
631			break;
632		case O_SKIPTO:
633			snprintf(SNPARGS(action2, 0), "SkipTo %d",
634				cmd->arg1);
635			break;
636		case O_PIPE:
637			snprintf(SNPARGS(action2, 0), "Pipe %d",
638				cmd->arg1);
639			break;
640		case O_QUEUE:
641			snprintf(SNPARGS(action2, 0), "Queue %d",
642				cmd->arg1);
643			break;
644		case O_FORWARD_IP: {
645			ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
646			int len;
647
648			len = snprintf(SNPARGS(action2, 0), "Forward to %s",
649				inet_ntoa(sa->sa.sin_addr));
650			if (sa->sa.sin_port)
651				snprintf(SNPARGS(action2, len), ":%d",
652				    sa->sa.sin_port);
653			}
654			break;
655		case O_NETGRAPH:
656			snprintf(SNPARGS(action2, 0), "Netgraph %d",
657				cmd->arg1);
658			break;
659		case O_NGTEE:
660			snprintf(SNPARGS(action2, 0), "Ngtee %d",
661				cmd->arg1);
662			break;
663		default:
664			action = "UNKNOWN";
665			break;
666		}
667	}
668
669	if (hlen == 0) {	/* non-ip */
670		snprintf(SNPARGS(proto, 0), "MAC");
671	} else {
672		struct ip *ip = mtod(m, struct ip *);
673		/* these three are all aliases to the same thing */
674		struct icmp *const icmp = L3HDR(struct icmp, ip);
675		struct tcphdr *const tcp = (struct tcphdr *)icmp;
676		struct udphdr *const udp = (struct udphdr *)icmp;
677
678		int ip_off, offset, ip_len;
679
680		int len;
681
682		if (eh != NULL) { /* layer 2 packets are as on the wire */
683			ip_off = ntohs(ip->ip_off);
684			ip_len = ntohs(ip->ip_len);
685		} else {
686			ip_off = ip->ip_off;
687			ip_len = ip->ip_len;
688		}
689		offset = ip_off & IP_OFFMASK;
690		switch (ip->ip_p) {
691		case IPPROTO_TCP:
692			len = snprintf(SNPARGS(proto, 0), "TCP %s",
693			    inet_ntoa(ip->ip_src));
694			if (offset == 0)
695				snprintf(SNPARGS(proto, len), ":%d %s:%d",
696				    ntohs(tcp->th_sport),
697				    inet_ntoa(ip->ip_dst),
698				    ntohs(tcp->th_dport));
699			else
700				snprintf(SNPARGS(proto, len), " %s",
701				    inet_ntoa(ip->ip_dst));
702			break;
703
704		case IPPROTO_UDP:
705			len = snprintf(SNPARGS(proto, 0), "UDP %s",
706				inet_ntoa(ip->ip_src));
707			if (offset == 0)
708				snprintf(SNPARGS(proto, len), ":%d %s:%d",
709				    ntohs(udp->uh_sport),
710				    inet_ntoa(ip->ip_dst),
711				    ntohs(udp->uh_dport));
712			else
713				snprintf(SNPARGS(proto, len), " %s",
714				    inet_ntoa(ip->ip_dst));
715			break;
716
717		case IPPROTO_ICMP:
718			if (offset == 0)
719				len = snprintf(SNPARGS(proto, 0),
720				    "ICMP:%u.%u ",
721				    icmp->icmp_type, icmp->icmp_code);
722			else
723				len = snprintf(SNPARGS(proto, 0), "ICMP ");
724			len += snprintf(SNPARGS(proto, len), "%s",
725			    inet_ntoa(ip->ip_src));
726			snprintf(SNPARGS(proto, len), " %s",
727			    inet_ntoa(ip->ip_dst));
728			break;
729
730		default:
731			len = snprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p,
732			    inet_ntoa(ip->ip_src));
733			snprintf(SNPARGS(proto, len), " %s",
734			    inet_ntoa(ip->ip_dst));
735			break;
736		}
737
738		if (ip_off & (IP_MF | IP_OFFMASK))
739			snprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)",
740			     ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
741			     offset << 3,
742			     (ip_off & IP_MF) ? "+" : "");
743	}
744	if (oif || m->m_pkthdr.rcvif)
745		log(LOG_SECURITY | LOG_INFO,
746		    "ipfw: %d %s %s %s via %s%s\n",
747		    f ? f->rulenum : -1,
748		    action, proto, oif ? "out" : "in",
749		    oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
750		    fragment);
751	else
752		log(LOG_SECURITY | LOG_INFO,
753		    "ipfw: %d %s %s [no if info]%s\n",
754		    f ? f->rulenum : -1,
755		    action, proto, fragment);
756	if (limit_reached)
757		log(LOG_SECURITY | LOG_NOTICE,
758		    "ipfw: limit %d reached on entry %d\n",
759		    limit_reached, f ? f->rulenum : -1);
760}
761
762/*
763 * IMPORTANT: the hash function for dynamic rules must be commutative
764 * in source and destination (ip,port), because rules are bidirectional
765 * and we want to find both in the same bucket.
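 * For example, the forward and reverse halves of a flow,
 * (a.ip, a.port) -> (b.ip, b.port) and (b.ip, b.port) -> (a.ip, a.port),
 * XOR to the same value and therefore land in the same bucket.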
766 */
767static __inline int
768hash_packet(struct ipfw_flow_id *id)
769{
770	u_int32_t i;
771
772	i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port);
773	i &= (curr_dyn_buckets - 1);
774	return i;
775}
776
777/**
778 * unlink a dynamic rule from a chain. prev is a pointer to
779 * the previous one, q is a pointer to the rule to delete,
780 * head is a pointer to the head of the queue.
781 * Modifies q and potentially also head.
782 */
783#define UNLINK_DYN_RULE(prev, head, q) {				\
784	ipfw_dyn_rule *old_q = q;					\
785									\
786	/* remove a refcount to the parent */				\
787	if (q->dyn_type == O_LIMIT)					\
788		q->parent->count--;					\
789	DEB(printf("ipfw: unlink entry 0x%08x %d -> 0x%08x %d, %d left\n",\
790		(q->id.src_ip), (q->id.src_port),			\
791		(q->id.dst_ip), (q->id.dst_port), dyn_count-1 ); )	\
792	if (prev != NULL)						\
793		prev->next = q = q->next;				\
794	else								\
795		head = q = q->next;					\
796	dyn_count--;							\
797	uma_zfree(ipfw_dyn_rule_zone, old_q); }
798
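/* Wraparound-safe time comparison: true if 'a' is not later than 'b'. */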
799#define TIME_LEQ(a,b)       ((int)((a)-(b)) <= 0)
800
801/**
802 * Remove dynamic rules pointing to "rule", or all of them if rule == NULL.
803 *
804 * If keep_me == NULL, rules are deleted even if not expired,
805 * otherwise only expired rules are removed.
806 *
807 * The value of the second parameter is also used to identify
808 * a rule we absolutely do not want to remove (e.g. because we are
809 * holding a reference to it -- this is the case with O_LIMIT_PARENT
810 * rules). The pointer is only used for comparison, so any non-null
811 * value will do.
812 */
813static void
814remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me)
815{
816	static u_int32_t last_remove = 0;
817
818#define FORCE (keep_me == NULL)
819
820	ipfw_dyn_rule *prev, *q;
821	int i, pass = 0, max_pass = 0;
822
823	IPFW_DYN_LOCK_ASSERT();
824
825	if (ipfw_dyn_v == NULL || dyn_count == 0)
826		return;
827	/* do not expire more than once per second, it is useless */
828	if (!FORCE && last_remove == time_second)
829		return;
830	last_remove = time_second;
831
832	/*
833	 * Because O_LIMIT rules refer to parent rules, the first pass only
834	 * removes child rules and marks any pending LIMIT_PARENT rules,
835	 * which are then removed in a second pass.
836	 */
837next_pass:
838	for (i = 0 ; i < curr_dyn_buckets ; i++) {
839		for (prev=NULL, q = ipfw_dyn_v[i] ; q ; ) {
840			/*
841			 * Logic can become complex here, so we split tests.
842			 */
843			if (q == keep_me)
844				goto next;
845			if (rule != NULL && rule != q->rule)
846				goto next; /* not the one we are looking for */
847			if (q->dyn_type == O_LIMIT_PARENT) {
848				/*
849				 * handle parent in the second pass,
850				 * record we need one.
851				 */
852				max_pass = 1;
853				if (pass == 0)
854					goto next;
855				if (FORCE && q->count != 0 ) {
856					/* XXX should not happen! */
857					printf("ipfw: OUCH! cannot remove rule,"
858					     " count %d\n", q->count);
859				}
860			} else {
861				if (!FORCE &&
862				    !TIME_LEQ( q->expire, time_second ))
863					goto next;
864			}
865			if (q->dyn_type != O_LIMIT_PARENT || !q->count) {
866				UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q);
867				continue;
868			}
869next:
870			prev = q;
871			q = q->next;
872		}
873	}
874	if (pass++ < max_pass)
875		goto next_pass;
876}
877
878
879/**
880 * lookup a dynamic rule.
881 */
882static ipfw_dyn_rule *
883lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction,
884	struct tcphdr *tcp)
885{
886	/*
887	 * stateful ipfw extensions.
888	 * Lookup into dynamic session queue
889	 */
890#define MATCH_REVERSE	0
891#define MATCH_FORWARD	1
892#define MATCH_NONE	2
893#define MATCH_UNKNOWN	3
894	int i, dir = MATCH_NONE;
895	ipfw_dyn_rule *prev, *q=NULL;
896
897	IPFW_DYN_LOCK_ASSERT();
898
899	if (ipfw_dyn_v == NULL)
900		goto done;	/* not found */
901	i = hash_packet( pkt );
902	for (prev=NULL, q = ipfw_dyn_v[i] ; q != NULL ; ) {
903		if (q->dyn_type == O_LIMIT_PARENT && q->count)
904			goto next;
905		if (TIME_LEQ( q->expire, time_second)) { /* expire entry */
906			UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q);
907			continue;
908		}
909		if (pkt->proto == q->id.proto &&
910		    q->dyn_type != O_LIMIT_PARENT) {
911			if (pkt->src_ip == q->id.src_ip &&
912			    pkt->dst_ip == q->id.dst_ip &&
913			    pkt->src_port == q->id.src_port &&
914			    pkt->dst_port == q->id.dst_port ) {
915				dir = MATCH_FORWARD;
916				break;
917			}
918			if (pkt->src_ip == q->id.dst_ip &&
919			    pkt->dst_ip == q->id.src_ip &&
920			    pkt->src_port == q->id.dst_port &&
921			    pkt->dst_port == q->id.src_port ) {
922				dir = MATCH_REVERSE;
923				break;
924			}
925		}
926next:
927		prev = q;
928		q = q->next;
929	}
930	if (q == NULL)
931		goto done; /* q = NULL, not found */
932
933	if ( prev != NULL) { /* found and not in front */
934		prev->next = q->next;
935		q->next = ipfw_dyn_v[i];
936		ipfw_dyn_v[i] = q;
937	}
938	if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
939		u_char flags = pkt->flags & (TH_FIN|TH_SYN|TH_RST);
940
941#define BOTH_SYN	(TH_SYN | (TH_SYN << 8))
942#define BOTH_FIN	(TH_FIN | (TH_FIN << 8))
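		/*
		 * q->state accumulates the TCP flags seen on this flow:
		 * forward-direction flags in the low byte, reverse-direction
		 * flags in the high byte, so BOTH_SYN means a SYN has been
		 * seen in each direction.
		 */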
943		q->state |= (dir == MATCH_FORWARD ) ? flags : (flags << 8);
944		switch (q->state) {
945		case TH_SYN:				/* opening */
946			q->expire = time_second + dyn_syn_lifetime;
947			break;
948
949		case BOTH_SYN:			/* move to established */
950		case BOTH_SYN | TH_FIN :	/* one side tries to close */
951		case BOTH_SYN | (TH_FIN << 8) :
952 			if (tcp) {
953#define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0)
954			    u_int32_t ack = ntohl(tcp->th_ack);
955			    if (dir == MATCH_FORWARD) {
956				if (q->ack_fwd == 0 || _SEQ_GE(ack, q->ack_fwd))
957				    q->ack_fwd = ack;
958				else { /* ignore out-of-sequence */
959				    break;
960				}
961			    } else {
962				if (q->ack_rev == 0 || _SEQ_GE(ack, q->ack_rev))
963				    q->ack_rev = ack;
964				else { /* ignore out-of-sequence */
965				    break;
966				}
967			    }
968			}
969			q->expire = time_second + dyn_ack_lifetime;
970			break;
971
972		case BOTH_SYN | BOTH_FIN:	/* both sides closed */
973			if (dyn_fin_lifetime >= dyn_keepalive_period)
974				dyn_fin_lifetime = dyn_keepalive_period - 1;
975			q->expire = time_second + dyn_fin_lifetime;
976			break;
977
978		default:
979#if 0
980			/*
981			 * reset or some invalid combination, but can also
982			 * occur if we use keep-state the wrong way.
983			 */
984			if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0)
985				printf("invalid state: 0x%x\n", q->state);
986#endif
987			if (dyn_rst_lifetime >= dyn_keepalive_period)
988				dyn_rst_lifetime = dyn_keepalive_period - 1;
989			q->expire = time_second + dyn_rst_lifetime;
990			break;
991		}
992	} else if (pkt->proto == IPPROTO_UDP) {
993		q->expire = time_second + dyn_udp_lifetime;
994	} else {
995		/* other protocols */
996		q->expire = time_second + dyn_short_lifetime;
997	}
998done:
999	if (match_direction)
1000		*match_direction = dir;
1001	return q;
1002}
1003
1004static ipfw_dyn_rule *
1005lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction,
1006	struct tcphdr *tcp)
1007{
1008	ipfw_dyn_rule *q;
1009
1010	IPFW_DYN_LOCK();
1011	q = lookup_dyn_rule_locked(pkt, match_direction, tcp);
1012	if (q == NULL)
1013		IPFW_DYN_UNLOCK();
1014	/* NB: return table locked when q is not NULL */
1015	return q;
1016}
1017
1018static void
1019realloc_dynamic_table(void)
1020{
1021	IPFW_DYN_LOCK_ASSERT();
1022
1023	/*
1024	 * Try reallocation, make sure we have a power of 2 and do
1025	 * not allow more than 64k entries. In case of overflow,
1026	 * default to 1024.
1027	 */
1028
1029	if (dyn_buckets > 65536)
1030		dyn_buckets = 1024;
1031	if ((dyn_buckets & (dyn_buckets-1)) != 0) { /* not a power of 2 */
1032		dyn_buckets = curr_dyn_buckets; /* reset */
1033		return;
1034	}
1035	curr_dyn_buckets = dyn_buckets;
1036	if (ipfw_dyn_v != NULL)
1037		free(ipfw_dyn_v, M_IPFW);
1038	for (;;) {
1039		ipfw_dyn_v = malloc(curr_dyn_buckets * sizeof(ipfw_dyn_rule *),
1040		       M_IPFW, M_NOWAIT | M_ZERO);
1041		if (ipfw_dyn_v != NULL || curr_dyn_buckets <= 2)
1042			break;
1043		curr_dyn_buckets /= 2;
1044	}
1045}
1046
1047/**
1048 * Install state of type 'type' for a dynamic session.
1049 * The hash table contains three types of rules:
1050 * - regular rules (O_KEEP_STATE)
1051 * - rules enforcing a limit on the number of sessions per user
1052 *   (O_LIMIT). When one is created, the parent's count is
1053 *   increased by 1, and decreased on delete. In this case,
1054 *   the third parameter is the parent rule and not the chain.
1055 * - "parent" rules for the above (O_LIMIT_PARENT).
1056 */
1057static ipfw_dyn_rule *
1058add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule)
1059{
1060	ipfw_dyn_rule *r;
1061	int i;
1062
1063	IPFW_DYN_LOCK_ASSERT();
1064
1065	if (ipfw_dyn_v == NULL ||
1066	    (dyn_count == 0 && dyn_buckets != curr_dyn_buckets)) {
1067		realloc_dynamic_table();
1068		if (ipfw_dyn_v == NULL)
1069			return NULL; /* failed ! */
1070	}
1071	i = hash_packet(id);
1072
1073	r = uma_zalloc(ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO);
1074	if (r == NULL) {
1075		printf ("ipfw: sorry cannot allocate state\n");
1076		return NULL;
1077	}
1078
1079	/* increase refcount on parent, and set pointer */
1080	if (dyn_type == O_LIMIT) {
1081		ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule;
1082		if ( parent->dyn_type != O_LIMIT_PARENT)
1083			panic("invalid parent");
1084		parent->count++;
1085		r->parent = parent;
1086		rule = parent->rule;
1087	}
1088
1089	r->id = *id;
1090	r->expire = time_second + dyn_syn_lifetime;
1091	r->rule = rule;
1092	r->dyn_type = dyn_type;
1093	r->pcnt = r->bcnt = 0;
1094	r->count = 0;
1095
1096	r->bucket = i;
1097	r->next = ipfw_dyn_v[i];
1098	ipfw_dyn_v[i] = r;
1099	dyn_count++;
1100	DEB(printf("ipfw: add dyn entry ty %d 0x%08x %d -> 0x%08x %d, total %d\n",
1101	   dyn_type,
1102	   (r->id.src_ip), (r->id.src_port),
1103	   (r->id.dst_ip), (r->id.dst_port),
1104	   dyn_count ); )
1105	return r;
1106}
1107
1108/**
1109 * lookup dynamic parent rule using pkt and rule as search keys.
1110 * If the lookup fails, then install one.
1111 */
1112static ipfw_dyn_rule *
1113lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule)
1114{
1115	ipfw_dyn_rule *q;
1116	int i;
1117
1118	IPFW_DYN_LOCK_ASSERT();
1119
1120	if (ipfw_dyn_v) {
1121		i = hash_packet( pkt );
1122		for (q = ipfw_dyn_v[i] ; q != NULL ; q=q->next)
1123			if (q->dyn_type == O_LIMIT_PARENT &&
1124			    rule== q->rule &&
1125			    pkt->proto == q->id.proto &&
1126			    pkt->src_ip == q->id.src_ip &&
1127			    pkt->dst_ip == q->id.dst_ip &&
1128			    pkt->src_port == q->id.src_port &&
1129			    pkt->dst_port == q->id.dst_port) {
1130				q->expire = time_second + dyn_short_lifetime;
1131				DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",q);)
1132				return q;
1133			}
1134	}
1135	return add_dyn_rule(pkt, O_LIMIT_PARENT, rule);
1136}
1137
1138/**
1139 * Install dynamic state for rule type cmd->o.opcode
1140 *
1141 * Returns 1 (failure) if state is not installed because of errors or because
1142 * session limitations are enforced.
1143 */
1144static int
1145install_state(struct ip_fw *rule, ipfw_insn_limit *cmd,
1146	struct ip_fw_args *args)
1147{
1148	static int last_log;
1149
1150	ipfw_dyn_rule *q;
1151
1152	DEB(printf("ipfw: install state type %d 0x%08x %u -> 0x%08x %u\n",
1153	    cmd->o.opcode,
1154	    (args->f_id.src_ip), (args->f_id.src_port),
1155	    (args->f_id.dst_ip), (args->f_id.dst_port) );)
1156
1157	IPFW_DYN_LOCK();
1158
1159	q = lookup_dyn_rule_locked(&args->f_id, NULL, NULL);
1160
1161	if (q != NULL) { /* should never occur */
1162		if (last_log != time_second) {
1163			last_log = time_second;
1164			printf("ipfw: install_state: entry already present, done\n");
1165		}
1166		IPFW_DYN_UNLOCK();
1167		return 0;
1168	}
1169
1170	if (dyn_count >= dyn_max)
1171		/*
1172		 * Run out of slots, try to remove any expired rule.
1173		 */
1174		remove_dyn_rule(NULL, (ipfw_dyn_rule *)1);
1175
1176	if (dyn_count >= dyn_max) {
1177		if (last_log != time_second) {
1178			last_log = time_second;
1179			printf("ipfw: install_state: Too many dynamic rules\n");
1180		}
1181		IPFW_DYN_UNLOCK();
1182		return 1; /* cannot install, notify caller */
1183	}
1184
1185	switch (cmd->o.opcode) {
1186	case O_KEEP_STATE: /* bidir rule */
1187		add_dyn_rule(&args->f_id, O_KEEP_STATE, rule);
1188		break;
1189
1190	case O_LIMIT: /* limit number of sessions */
1191	    {
1192		u_int16_t limit_mask = cmd->limit_mask;
1193		struct ipfw_flow_id id;
1194		ipfw_dyn_rule *parent;
1195
1196		DEB(printf("ipfw: installing dyn-limit rule %d\n",
1197		    cmd->conn_limit);)
1198
1199		id.dst_ip = id.src_ip = 0;
1200		id.dst_port = id.src_port = 0;
1201		id.proto = args->f_id.proto;
1202
1203		if (limit_mask & DYN_SRC_ADDR)
1204			id.src_ip = args->f_id.src_ip;
1205		if (limit_mask & DYN_DST_ADDR)
1206			id.dst_ip = args->f_id.dst_ip;
1207		if (limit_mask & DYN_SRC_PORT)
1208			id.src_port = args->f_id.src_port;
1209		if (limit_mask & DYN_DST_PORT)
1210			id.dst_port = args->f_id.dst_port;
1211		parent = lookup_dyn_parent(&id, rule);
1212		if (parent == NULL) {
1213			printf("ipfw: add parent failed\n");
			IPFW_DYN_UNLOCK();	/* do not return with the lock held */
1214			return 1;
1215		}
1216		if (parent->count >= cmd->conn_limit) {
1217			/*
1218			 * See if we can remove some expired rule.
1219			 */
1220			remove_dyn_rule(rule, parent);
1221			if (parent->count >= cmd->conn_limit) {
1222				if (fw_verbose && last_log != time_second) {
1223					last_log = time_second;
1224					log(LOG_SECURITY | LOG_DEBUG,
1225					    "drop session, too many entries\n");
1226				}
1227				IPFW_DYN_UNLOCK();
1228				return 1;
1229			}
1230		}
1231		add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent);
1232	    }
1233		break;
1234	default:
1235		printf("ipfw: unknown dynamic rule type %u\n", cmd->o.opcode);
1236		IPFW_DYN_UNLOCK();
1237		return 1;
1238	}
1239	lookup_dyn_rule_locked(&args->f_id, NULL, NULL); /* XXX just set lifetime */
1240	IPFW_DYN_UNLOCK();
1241	return 0;
1242}
1243
1244/*
1245 * Transmit a TCP packet, containing either a RST or a keepalive.
1246 * When flags & TH_RST is set, we are sending a RST packet because a
1247 * "reset" action matched the packet.  Otherwise we are sending a
1248 * keepalive, and flags & TH_SYN selects its direction (forward if set).
1249 */
1250static void
1251send_pkt(struct ipfw_flow_id *id, u_int32_t seq, u_int32_t ack, int flags)
1252{
1253	struct mbuf *m;
1254	struct ip *ip;
1255	struct tcphdr *tcp;
1256
1257	MGETHDR(m, M_DONTWAIT, MT_HEADER);
1258	if (m == 0)
1259		return;
1260	m->m_pkthdr.rcvif = (struct ifnet *)0;
1261	m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
1262	m->m_data += max_linkhdr;
1263
1264	ip = mtod(m, struct ip *);
1265	bzero(ip, m->m_len);
1266	tcp = (struct tcphdr *)(ip + 1); /* no IP options */
1267	ip->ip_p = IPPROTO_TCP;
1268	tcp->th_off = 5;
1269	/*
1270	 * Assume we are sending a RST (or a keepalive in the reverse
1271	 * direction), swap src and destination addresses and ports.
1272	 */
1273	ip->ip_src.s_addr = htonl(id->dst_ip);
1274	ip->ip_dst.s_addr = htonl(id->src_ip);
1275	tcp->th_sport = htons(id->dst_port);
1276	tcp->th_dport = htons(id->src_port);
1277	if (flags & TH_RST) {	/* we are sending a RST */
1278		if (flags & TH_ACK) {
1279			tcp->th_seq = htonl(ack);
1280			tcp->th_ack = htonl(0);
1281			tcp->th_flags = TH_RST;
1282		} else {
1283			if (flags & TH_SYN)
1284				seq++;
1285			tcp->th_seq = htonl(0);
1286			tcp->th_ack = htonl(seq);
1287			tcp->th_flags = TH_RST | TH_ACK;
1288		}
1289	} else {
1290		/*
1291		 * We are sending a keepalive. flags & TH_SYN determines
1292		 * the direction, forward if set, reverse if clear.
1293		 * NOTE: seq and ack are always assumed to be correct
1294		 * as set by the caller. This may be confusing...
1295		 */
1296		if (flags & TH_SYN) {
1297			/*
1298			 * we have to rewrite the correct addresses!
1299			 */
1300			ip->ip_dst.s_addr = htonl(id->dst_ip);
1301			ip->ip_src.s_addr = htonl(id->src_ip);
1302			tcp->th_dport = htons(id->dst_port);
1303			tcp->th_sport = htons(id->src_port);
1304		}
1305		tcp->th_seq = htonl(seq);
1306		tcp->th_ack = htonl(ack);
1307		tcp->th_flags = TH_ACK;
1308	}
1309	/*
1310	 * set ip_len to the payload size so we can compute
1311	 * the tcp checksum on the pseudoheader
1312	 * XXX check this, could save a couple of words ?
1313	 */
1314	ip->ip_len = htons(sizeof(struct tcphdr));
1315	tcp->th_sum = in_cksum(m, m->m_pkthdr.len);
1316	/*
1317	 * now fill fields left out earlier
1318	 */
1319	ip->ip_ttl = ip_defttl;
1320	ip->ip_len = m->m_pkthdr.len;
1321	m->m_flags |= M_SKIP_FIREWALL;
1322	ip_output(m, NULL, NULL, 0, NULL, NULL);
1323}
1324
1325/*
1326 * sends a reject message, consuming the mbuf passed as an argument.
1327 */
1328static void
1329send_reject(struct ip_fw_args *args, int code, int offset, int ip_len)
1330{
1331
1332	if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
1333		/* We need the IP header in host order for icmp_error(). */
1334		if (args->eh != NULL) {
1335			struct ip *ip = mtod(args->m, struct ip *);
1336			ip->ip_len = ntohs(ip->ip_len);
1337			ip->ip_off = ntohs(ip->ip_off);
1338		}
1339		icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
1340	} else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) {
1341		struct tcphdr *const tcp =
1342		    L3HDR(struct tcphdr, mtod(args->m, struct ip *));
1343		if ( (tcp->th_flags & TH_RST) == 0)
1344			send_pkt(&(args->f_id), ntohl(tcp->th_seq),
1345				ntohl(tcp->th_ack),
1346				tcp->th_flags | TH_RST);
1347		m_freem(args->m);
1348	} else
1349		m_freem(args->m);
1350	args->m = NULL;
1351}
1352
1353/**
1354 *
1355 * Given an ip_fw *, lookup_next_rule will return a pointer
1356 * to the next rule, which can be either the jump
1357 * target (for skipto instructions) or the next one in the list (in
1358 * all other cases including a missing jump target).
1359 * The result is also written in the "next_rule" field of the rule.
1360 * Backward jumps are not allowed, so start looking from the next
1361 * rule...
1362 *
1363 * This never returns NULL -- in case we do not have an exact match,
1364 * the next rule is returned. When the ruleset is changed,
1365 * pointers are flushed so we are always correct.
1366 */
1367
1368static struct ip_fw *
1369lookup_next_rule(struct ip_fw *me)
1370{
1371	struct ip_fw *rule = NULL;
1372	ipfw_insn *cmd;
1373
1374	/* look for action, in case it is a skipto */
1375	cmd = ACTION_PTR(me);
1376	if (cmd->opcode == O_LOG)
1377		cmd += F_LEN(cmd);
1378	if (cmd->opcode == O_ALTQ)
1379		cmd += F_LEN(cmd);
1380	if ( cmd->opcode == O_SKIPTO )
1381		for (rule = me->next; rule ; rule = rule->next)
1382			if (rule->rulenum >= cmd->arg1)
1383				break;
1384	if (rule == NULL)			/* failure or not a skipto */
1385		rule = me->next;
1386	me->next_rule = rule;
1387	return rule;
1388}
1389
1390static void
1391init_tables(void)
1392{
1393	int i;
1394
1395	for (i = 0; i < IPFW_TABLES_MAX; i++) {
1396		rn_inithead((void **)&ipfw_tables[i].rnh, 32);
1397		ipfw_tables[i].modified = 1;
1398	}
1399}
1400
1401static int
1402add_table_entry(u_int16_t tbl, in_addr_t addr, u_int8_t mlen, u_int32_t value)
1403{
1404	struct radix_node_head *rnh;
1405	struct table_entry *ent;
1406
1407	if (tbl >= IPFW_TABLES_MAX)
1408		return (EINVAL);
1409	rnh = ipfw_tables[tbl].rnh;
1410	ent = malloc(sizeof(*ent), M_IPFW_TBL, M_NOWAIT | M_ZERO);
1411	if (ent == NULL)
1412		return (ENOMEM);
1413	ent->value = value;
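	/*
	 * The radix code treats the key as sin_len bytes; 8 bytes reach
	 * through sin_addr (len, family, port, address), which is all we
	 * need here.
	 */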
1414	ent->addr.sin_len = ent->mask.sin_len = 8;
1415	ent->mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
1416	ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr;
1417	RADIX_NODE_HEAD_LOCK(rnh);
1418	if (rnh->rnh_addaddr(&ent->addr, &ent->mask, rnh, (void *)ent) ==
1419	    NULL) {
1420		RADIX_NODE_HEAD_UNLOCK(rnh);
1421		free(ent, M_IPFW_TBL);
1422		return (EEXIST);
1423	}
1424	ipfw_tables[tbl].modified = 1;
1425	RADIX_NODE_HEAD_UNLOCK(rnh);
1426	return (0);
1427}
1428
1429static int
1430del_table_entry(u_int16_t tbl, in_addr_t addr, u_int8_t mlen)
1431{
1432	struct radix_node_head *rnh;
1433	struct table_entry *ent;
1434	struct sockaddr_in sa, mask;
1435
1436	if (tbl >= IPFW_TABLES_MAX)
1437		return (EINVAL);
1438	rnh = ipfw_tables[tbl].rnh;
1439	sa.sin_len = mask.sin_len = 8;
1440	mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
1441	sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr;
1442	RADIX_NODE_HEAD_LOCK(rnh);
1443	ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh);
1444	if (ent == NULL) {
1445		RADIX_NODE_HEAD_UNLOCK(rnh);
1446		return (ESRCH);
1447	}
1448	ipfw_tables[tbl].modified = 1;
1449	RADIX_NODE_HEAD_UNLOCK(rnh);
1450	free(ent, M_IPFW_TBL);
1451	return (0);
1452}
1453
1454static int
1455flush_table_entry(struct radix_node *rn, void *arg)
1456{
1457	struct radix_node_head * const rnh = arg;
1458	struct table_entry *ent;
1459
1460	ent = (struct table_entry *)
1461	    rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh);
1462	if (ent != NULL)
1463		free(ent, M_IPFW_TBL);
1464	return (0);
1465}
1466
1467static int
1468flush_table(u_int16_t tbl)
1469{
1470	struct radix_node_head *rnh;
1471
1472	if (tbl >= IPFW_TABLES_MAX)
1473		return (EINVAL);
1474	rnh = ipfw_tables[tbl].rnh;
1475	RADIX_NODE_HEAD_LOCK(rnh);
1476	rnh->rnh_walktree(rnh, flush_table_entry, rnh);
1477	ipfw_tables[tbl].modified = 1;
1478	RADIX_NODE_HEAD_UNLOCK(rnh);
1479	return (0);
1480}
1481
1482static void
1483flush_tables(void)
1484{
1485	u_int16_t tbl;
1486
1487	for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++)
1488		flush_table(tbl);
1489}
1490
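/*
 * Look an address up in a table.  A one-entry result cache
 * (last_tbl/last_addr/last_match/last_value) short-circuits repeated
 * lookups of the same address; it is invalidated through the per-table
 * 'modified' flag whenever the table contents change.
 */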
1491static int
1492lookup_table(u_int16_t tbl, in_addr_t addr, u_int32_t *val)
1493{
1494	struct radix_node_head *rnh;
1495	struct table_entry *ent;
1496	struct sockaddr_in sa;
1497	static in_addr_t last_addr;
1498	static int last_tbl;
1499	static int last_match;
1500	static u_int32_t last_value;
1501
1502	if (tbl >= IPFW_TABLES_MAX)
1503		return (0);
1504	if (tbl == last_tbl && addr == last_addr &&
1505	    !ipfw_tables[tbl].modified) {
1506		if (last_match)
1507			*val = last_value;
1508		return (last_match);
1509	}
1510	rnh = ipfw_tables[tbl].rnh;
1511	sa.sin_len = 8;
1512	sa.sin_addr.s_addr = addr;
1513	RADIX_NODE_HEAD_LOCK(rnh);
1514	ipfw_tables[tbl].modified = 0;
1515	ent = (struct table_entry *)(rnh->rnh_lookup(&sa, NULL, rnh));
1516	RADIX_NODE_HEAD_UNLOCK(rnh);
1517	last_addr = addr;
1518	last_tbl = tbl;
1519	if (ent != NULL) {
1520		last_value = *val = ent->value;
1521		last_match = 1;
1522		return (1);
1523	}
1524	last_match = 0;
1525	return (0);
1526}
1527
1528static int
1529count_table_entry(struct radix_node *rn, void *arg)
1530{
1531	u_int32_t * const cnt = arg;
1532
1533	(*cnt)++;
1534	return (0);
1535}
1536
1537static int
1538count_table(u_int32_t tbl, u_int32_t *cnt)
1539{
1540	struct radix_node_head *rnh;
1541
1542	if (tbl >= IPFW_TABLES_MAX)
1543		return (EINVAL);
1544	rnh = ipfw_tables[tbl].rnh;
1545	*cnt = 0;
1546	RADIX_NODE_HEAD_LOCK(rnh);
1547	rnh->rnh_walktree(rnh, count_table_entry, cnt);
1548	RADIX_NODE_HEAD_UNLOCK(rnh);
1549	return (0);
1550}
1551
1552static int
1553dump_table_entry(struct radix_node *rn, void *arg)
1554{
1555	struct table_entry * const n = (struct table_entry *)rn;
1556	ipfw_table * const tbl = arg;
1557	ipfw_table_entry *ent;
1558
1559	if (tbl->cnt == tbl->size)
1560		return (1);
1561	ent = &tbl->ent[tbl->cnt];
1562	ent->tbl = tbl->tbl;
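	/*
	 * Recover the prefix length from the contiguous netmask: ffs()
	 * gives the 1-based position of the lowest set bit, which for a
	 * mask of N leading ones is 33 - N, so the prefix is 33 - ffs().
	 */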
1563	if (in_nullhost(n->mask.sin_addr))
1564		ent->masklen = 0;
1565	else
1566		ent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr));
1567	ent->addr = n->addr.sin_addr.s_addr;
1568	ent->value = n->value;
1569	tbl->cnt++;
1570	return (0);
1571}
1572
1573static int
1574dump_table(ipfw_table *tbl)
1575{
1576	struct radix_node_head *rnh;
1577
1578	if (tbl->tbl >= IPFW_TABLES_MAX)
1579		return (EINVAL);
1580	rnh = ipfw_tables[tbl->tbl].rnh;
1581	tbl->cnt = 0;
1582	RADIX_NODE_HEAD_LOCK(rnh);
1583	rnh->rnh_walktree(rnh, dump_table_entry, tbl);
1584	RADIX_NODE_HEAD_UNLOCK(rnh);
1585	return (0);
1586}
1587
1588static void
1589fill_ugid_cache(struct inpcb *inp, struct ip_fw_ugid *ugp)
1590{
1591	struct ucred *cr;
1592
1593	if (inp->inp_socket != NULL) {
1594		cr = inp->inp_socket->so_cred;
1595		ugp->fw_prid = jailed(cr) ?
1596		    cr->cr_prison->pr_id : -1;
1597		ugp->fw_uid = cr->cr_uid;
1598		ugp->fw_ngroups = cr->cr_ngroups;
1599		bcopy(cr->cr_groups, ugp->fw_groups,
1600		    sizeof(ugp->fw_groups));
1601	}
1602}
1603
1604static int
1605check_uidgid(ipfw_insn_u32 *insn,
1606	int proto, struct ifnet *oif,
1607	struct in_addr dst_ip, u_int16_t dst_port,
1608	struct in_addr src_ip, u_int16_t src_port,
1609	struct ip_fw_ugid *ugp, int *lookup, struct inpcb *inp)
1610{
1611	struct inpcbinfo *pi;
1612	int wildcard;
1613	struct inpcb *pcb;
1614	int match;
1615	gid_t *gp;
1616
1617	/*
1618	 * Check to see if the UDP or TCP stack supplied us with
1619	 * the PCB. If so, rather than holding a lock and looking
1620	 * up the PCB, we can use the one that was supplied.
1621	 */
1622	if (inp && *lookup == 0) {
1623		INP_LOCK_ASSERT(inp);
1624		if (inp->inp_socket != NULL) {
1625			fill_ugid_cache(inp, ugp);
1626			*lookup = 1;
1627		}
1628	}
1629	/*
1630	 * If we have already been here and the packet has no
1631	 * PCB entry associated with it, then we can safely
1632	 * assume that this is a no match.
1633	 * assume that there is no match.
1634	if (*lookup == -1)
1635		return (0);
1636	if (proto == IPPROTO_TCP) {
1637		wildcard = 0;
1638		pi = &tcbinfo;
1639	} else if (proto == IPPROTO_UDP) {
1640		wildcard = 1;
1641		pi = &udbinfo;
1642	} else
1643		return 0;
1644	match = 0;
1645	if (*lookup == 0) {
1646		INP_INFO_RLOCK(pi);
1647		pcb =  (oif) ?
1648			in_pcblookup_hash(pi,
1649				dst_ip, htons(dst_port),
1650				src_ip, htons(src_port),
1651				wildcard, oif) :
1652			in_pcblookup_hash(pi,
1653				src_ip, htons(src_port),
1654				dst_ip, htons(dst_port),
1655				wildcard, NULL);
1656		if (pcb != NULL) {
1657			INP_LOCK(pcb);
1658			if (pcb->inp_socket != NULL) {
1659				fill_ugid_cache(pcb, ugp);
1660				*lookup = 1;
1661			}
1662			INP_UNLOCK(pcb);
1663		}
1664		INP_INFO_RUNLOCK(pi);
1665		if (*lookup == 0) {
1666			/*
1667			 * If the lookup did not yield any results, there
1668			 * is no sense in coming back and trying again. So
1669			 * we can set lookup to -1 and ensure that we won't
1670			 * bother the pcb system again.
1671			 */
1672			*lookup = -1;
1673			return (0);
1674		}
1675	}
1676	if (insn->o.opcode == O_UID)
1677		match = (ugp->fw_uid == (uid_t)insn->d[0]);
1678	else if (insn->o.opcode == O_GID) {
1679		for (gp = ugp->fw_groups;
1680			gp < &ugp->fw_groups[ugp->fw_ngroups]; gp++)
1681			if (*gp == (gid_t)insn->d[0]) {
1682				match = 1;
1683				break;
1684			}
1685	} else if (insn->o.opcode == O_JAIL)
1686		match = (ugp->fw_prid == (int)insn->d[0]);
1687	return match;
1688}
1689
1690/*
1691 * The main check routine for the firewall.
1692 *
1693 * All arguments are in args so we can modify them and return them
1694 * back to the caller.
1695 *
1696 * Parameters:
1697 *
1698 *	args->m	(in/out) The packet; we set to NULL when/if we nuke it.
1699 *		Starts with the IP header.
1700 *	args->eh (in)	MAC header if present, or NULL for a layer-3 packet.
1701 *	args->oif	Outgoing interface, or NULL if packet is incoming.
1702 *		The incoming interface is in the mbuf. (in)
1703 *	args->divert_rule (in/out)
1704 *		Skip up to the first rule past this rule number;
1705 *		upon return, non-zero port number for divert or tee.
1706 *
1707 *	args->rule	Pointer to the last matching rule (in/out)
1708 *	args->next_hop	Socket we are forwarding to (out).
1709 *	args->f_id	Addresses grabbed from the packet (out)
1710 * 	args->cookie	a cookie depending on rule action
1711 *
1712 * Return value:
1713 *
1714 *	IP_FW_PASS	the packet must be accepted
1715 *	IP_FW_DENY	the packet must be dropped
1716 *	IP_FW_DIVERT	divert packet, port in m_tag
1717 *	IP_FW_TEE	tee packet, port in m_tag
1718 *	IP_FW_DUMMYNET	to dummynet, pipe in args->cookie
1719 *	IP_FW_NETGRAPH	into netgraph, cookie args->cookie
1720 *
1721 */
1722
1723int
1724ipfw_chk(struct ip_fw_args *args)
1725{
1726	/*
1727	 * Local variables hold state during the processing of a packet.
1728	 *
1729	 * IMPORTANT NOTE: to speed up the processing of rules, there
1730	 * are some assumptions on the values of the variables, which
1731	 * are documented here. Should you change them, please check
1732	 * the implementation of the various instructions to make sure
1733	 * that they still work.
1734	 *
1735	 * args->eh	The MAC header. It is non-null for a layer-2
1736	 *	packet and NULL for a layer-3 packet.
1737	 *
1738	 * m | args->m	Pointer to the mbuf, as received from the caller.
1739	 *	It may change if ipfw_chk() does an m_pullup, or if it
1740	 *	consumes the packet because it calls send_reject().
1741	 *	XXX This has to change, so that ipfw_chk() never modifies
1742	 *	or consumes the buffer.
1743	 * ip	is simply an alias of the value of m, and it is kept
1744	 *	in sync with it (the packet is	supposed to start with
1745	 *	the ip header).
1746	 */
1747	struct mbuf *m = args->m;
1748	struct ip *ip = mtod(m, struct ip *);
1749
1750	/*
1751	 * For rules which contain uid/gid or jail constraints, cache
1752	 * a copy of the users credentials after the pcb lookup has been
1753	 * executed. This will speed up the processing of rules with
1754	 * these types of constraints, as well as decrease contention
1755	 * on pcb related locks.
1756	 */
1757	struct ip_fw_ugid fw_ugid_cache;
1758	int ugid_lookup = 0;
1759
1760	/*
1761	 * divinput_flags	If non-zero, set to the IP_FW_DIVERT_*_FLAG
1762	 *	associated with a packet input on a divert socket.  This
1763	 *	will let us distinguish traffic and its direction when
1764	 *	it originates from a divert socket.
1765	 */
1766	u_int divinput_flags = 0;
1767
1768	/*
1769	 * oif | args->oif	If NULL, ipfw_chk has been called on the
1770	 *	inbound path (ether_input, bdg_forward, ip_input).
1771	 *	If non-NULL, ipfw_chk has been called on the outbound path
1772	 *	(ether_output, ip_output).
1773	 */
1774	struct ifnet *oif = args->oif;
1775
1776	struct ip_fw *f = NULL;		/* matching rule */
1777	int retval = 0;
1778
1779	/*
1780	 * hlen	The length of the IPv4 header.
1781	 *	hlen >0 means we have an IPv4 packet.
1782	 */
1783	u_int hlen = 0;		/* hlen >0 means we have an IP pkt */
1784
1785	/*
1786	 * offset	The offset of a fragment. offset != 0 means that
1787	 *	we have a fragment at this offset of an IPv4 packet.
1788	 *	offset == 0 means that (if this is an IPv4 packet)
1789	 *	this is the first or only fragment.
1790	 */
1791	u_short offset = 0;
1792
1793	/*
1794	 * Local copies of addresses. They are only valid if we have
1795	 * an IP packet.
1796	 *
1797	 * proto	The protocol. Set to 0 for non-ip packets,
1798	 *	or to the protocol read from the packet otherwise.
1799	 *	proto != 0 means that we have an IPv4 packet.
1800	 *
1801	 * src_port, dst_port	port numbers, in HOST format. Only
1802	 *	valid for TCP and UDP packets.
1803	 *
1804	 * src_ip, dst_ip	ip addresses, in NETWORK format.
1805	 *	Only valid for IPv4 packets.
1806	 */
1807	u_int8_t proto;
1808	u_int16_t src_port = 0, dst_port = 0;	/* NOTE: host format	*/
1809	struct in_addr src_ip, dst_ip;		/* NOTE: network format	*/
1810	u_int16_t ip_len=0;
1811	int pktlen;
1812	int dyn_dir = MATCH_UNKNOWN;
1813	ipfw_dyn_rule *q = NULL;
1814	struct ip_fw_chain *chain = &layer3_chain;
1815	struct m_tag *mtag;
1816
1817	if (m->m_flags & M_SKIP_FIREWALL)
1818		return (IP_FW_PASS);	/* accept */
1819	/*
1820	 * dyn_dir = MATCH_UNKNOWN when rules unchecked,
1821	 * 	MATCH_NONE when checked and not matched (q = NULL),
1822	 *	MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL)
1823	 */
1824
1825	pktlen = m->m_pkthdr.len;
1826	if (args->eh == NULL ||		/* layer 3 packet */
1827		( m->m_pkthdr.len >= sizeof(struct ip) &&
1828		    ntohs(args->eh->ether_type) == ETHERTYPE_IP))
1829			hlen = ip->ip_hl << 2;
1830
1831	/*
1832	 * Collect parameters into local variables for faster matching.
1833	 */
1834	if (hlen == 0) {	/* do not grab addresses for non-ip pkts */
1835		proto = args->f_id.proto = 0;	/* mark f_id invalid */
1836		goto after_ip_checks;
1837	}
1838
1839	proto = args->f_id.proto = ip->ip_p;
1840	src_ip = ip->ip_src;
1841	dst_ip = ip->ip_dst;
1842	if (args->eh != NULL) { /* layer 2 packets are as on the wire */
1843		offset = ntohs(ip->ip_off) & IP_OFFMASK;
1844		ip_len = ntohs(ip->ip_len);
1845	} else {
1846		offset = ip->ip_off & IP_OFFMASK;
1847		ip_len = ip->ip_len;
1848	}
1849	pktlen = ip_len < pktlen ? ip_len : pktlen;
1850
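/*
 * PULLUP_TO(len) makes sure the first 'len' bytes of the packet are
 * contiguous in the first mbuf, refreshing the local 'm', 'args->m' and
 * 'ip' pointers; if m_pullup() fails it bails out to pullup_failed.
 */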
1851#define PULLUP_TO(len)						\
1852		do {						\
1853			if ((m)->m_len < (len)) {		\
1854			    args->m = m = m_pullup(m, (len));	\
1855			    if (m == 0)				\
1856				goto pullup_failed;		\
1857			    ip = mtod(m, struct ip *);		\
1858			}					\
1859		} while (0)
1860
1861	if (offset == 0) {
1862		switch (proto) {
1863		case IPPROTO_TCP:
1864		    {
1865			struct tcphdr *tcp;
1866
1867			PULLUP_TO(hlen + sizeof(struct tcphdr));
1868			tcp = L3HDR(struct tcphdr, ip);
1869			dst_port = tcp->th_dport;
1870			src_port = tcp->th_sport;
1871			args->f_id.flags = tcp->th_flags;
1872			}
1873			break;
1874
1875		case IPPROTO_UDP:
1876		    {
1877			struct udphdr *udp;
1878
1879			PULLUP_TO(hlen + sizeof(struct udphdr));
1880			udp = L3HDR(struct udphdr, ip);
1881			dst_port = udp->uh_dport;
1882			src_port = udp->uh_sport;
1883			}
1884			break;
1885
1886		case IPPROTO_ICMP:
1887			PULLUP_TO(hlen + 4);	/* type, code and checksum. */
1888			args->f_id.flags = L3HDR(struct icmp, ip)->icmp_type;
1889			break;
1890
1891		default:
1892			break;
1893		}
1894#undef PULLUP_TO
1895	}
1896
1897	args->f_id.src_ip = ntohl(src_ip.s_addr);
1898	args->f_id.dst_ip = ntohl(dst_ip.s_addr);
1899	args->f_id.src_port = src_port = ntohs(src_port);
1900	args->f_id.dst_port = dst_port = ntohs(dst_port);
1901
1902after_ip_checks:
1903	IPFW_RLOCK(chain);
1904	mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL);
1905	if (args->rule) {
1906		/*
1907		 * Packet has already been tagged. Look for the next rule
1908		 * to restart processing.
1909		 *
1910		 * If fw_one_pass != 0 then just accept it.
1911		 * XXX should not happen here, but optimized out in
1912		 * the caller.
1913		 */
1914		if (fw_one_pass) {
1915			IPFW_RUNLOCK(chain);
1916			return (IP_FW_PASS);
1917		}
1918
1919		f = args->rule->next_rule;
1920		if (f == NULL)
1921			f = lookup_next_rule(args->rule);
1922	} else {
1923		/*
1924		 * Find the starting rule. It can be either the first
1925		 * one, or the one after divert_rule if asked so.
1926		 */
1927		int skipto = mtag ? divert_cookie(mtag) : 0;
1928
1929		f = chain->rules;
1930		if (args->eh == NULL && skipto != 0) {
1931			if (skipto >= IPFW_DEFAULT_RULE) {
1932				IPFW_RUNLOCK(chain);
1933				return (IP_FW_DENY); /* invalid */
1934			}
1935			while (f && f->rulenum <= skipto)
1936				f = f->next;
1937			if (f == NULL) {	/* drop packet */
1938				IPFW_RUNLOCK(chain);
1939				return (IP_FW_DENY);
1940			}
1941		}
1942	}
1943	/* reset divert rule to avoid confusion later */
1944	if (mtag) {
1945		divinput_flags = divert_info(mtag) &
1946		    (IP_FW_DIVERT_OUTPUT_FLAG | IP_FW_DIVERT_LOOPBACK_FLAG);
1947		m_tag_delete(m, mtag);
1948	}
1949
1950	/*
1951	 * Now scan the rules, and parse microinstructions for each rule.
1952	 */
1953	for (; f; f = f->next) {
1954		int l, cmdlen;
1955		ipfw_insn *cmd;
1956		int skip_or; /* skip rest of OR block */
1957
1958again:
1959		if (set_disable & (1 << f->set) )
1960			continue;
1961
1962		skip_or = 0;
1963		for (l = f->cmd_len, cmd = f->cmd ; l > 0 ;
1964		    l -= cmdlen, cmd += cmdlen) {
1965			int match;
1966
1967			/*
1968			 * check_body is a jump target used when we find a
1969			 * CHECK_STATE, and need to jump to the body of
1970			 * the target rule.
1971			 */
1972
1973check_body:
1974			cmdlen = F_LEN(cmd);
1975			/*
1976			 * An OR block (insn_1 || .. || insn_n) has the
1977			 * F_OR bit set in all but the last instruction.
1978			 * The first match will set "skip_or", and cause
1979			 * the following instructions to be skipped until
1980			 * past the one with the F_OR bit clear.
1981			 */
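			/*
			 * For instance, an OR block of three instructions is
			 * laid out as [insn|F_OR][insn|F_OR][insn]; once one
			 * of them matches, skip_or stays set until the last
			 * instruction (the one without F_OR) has been passed.
			 */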
1982			if (skip_or) {		/* skip this instruction */
1983				if ((cmd->len & F_OR) == 0)
1984					skip_or = 0;	/* next one is good */
1985				continue;
1986			}
1987			match = 0; /* set to 1 if we succeed */
1988
1989			switch (cmd->opcode) {
1990			/*
1991			 * The first set of opcodes compares the packet's
1992			 * fields with some pattern, setting 'match' if a
1993			 * match is found. At the end of the loop there is
1994			 * logic to deal with F_NOT and F_OR flags associated
1995			 * with the opcode.
1996			 */
1997			case O_NOP:
1998				match = 1;
1999				break;
2000
2001			case O_FORWARD_MAC:
2002				printf("ipfw: opcode %d unimplemented\n",
2003				    cmd->opcode);
2004				break;
2005
2006			case O_GID:
2007			case O_UID:
2008			case O_JAIL:
2009				/*
2010				 * We only check offset == 0 && proto != 0,
2011				 * as this ensures that we have an IPv4
2012				 * packet with the ports info.
2013				 */
2014				if (offset!=0)
2015					break;
2016				if (proto == IPPROTO_TCP ||
2017				    proto == IPPROTO_UDP)
2018					match = check_uidgid(
2019						    (ipfw_insn_u32 *)cmd,
2020						    proto, oif,
2021						    dst_ip, dst_port,
2022						    src_ip, src_port, &fw_ugid_cache,
2023						    &ugid_lookup, args->inp);
2024				break;
2025
2026			case O_RECV:
2027				match = iface_match(m->m_pkthdr.rcvif,
2028				    (ipfw_insn_if *)cmd);
2029				break;
2030
2031			case O_XMIT:
2032				match = iface_match(oif, (ipfw_insn_if *)cmd);
2033				break;
2034
2035			case O_VIA:
2036				match = iface_match(oif ? oif :
2037				    m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
2038				break;
2039
2040			case O_MACADDR2:
2041				if (args->eh != NULL) {	/* have MAC header */
2042					u_int32_t *want = (u_int32_t *)
2043						((ipfw_insn_mac *)cmd)->addr;
2044					u_int32_t *mask = (u_int32_t *)
2045						((ipfw_insn_mac *)cmd)->mask;
2046					u_int32_t *hdr = (u_int32_t *)args->eh;
2047
2048					match =
2049					    ( want[0] == (hdr[0] & mask[0]) &&
2050					      want[1] == (hdr[1] & mask[1]) &&
2051					      want[2] == (hdr[2] & mask[2]) );
2052				}
2053				break;
2054
2055			case O_MAC_TYPE:
2056				if (args->eh != NULL) {
2057					u_int16_t t =
2058					    ntohs(args->eh->ether_type);
2059					u_int16_t *p =
2060					    ((ipfw_insn_u16 *)cmd)->ports;
2061					int i;
2062
2063					for (i = cmdlen - 1; !match && i>0;
2064					    i--, p += 2)
2065						match = (t>=p[0] && t<=p[1]);
2066				}
2067				break;
2068
2069			case O_FRAG:
2070				match = (hlen > 0 && offset != 0);
2071				break;
2072
2073			case O_IN:	/* "out" is "not in" */
2074				match = (oif == NULL);
2075				break;
2076
2077			case O_LAYER2:
2078				match = (args->eh != NULL);
2079				break;
2080
2081			case O_DIVERTED:
2082				match = (cmd->arg1 & 1 && divinput_flags &
2083				    IP_FW_DIVERT_LOOPBACK_FLAG) ||
2084					(cmd->arg1 & 2 && divinput_flags &
2085				    IP_FW_DIVERT_OUTPUT_FLAG);
2086				break;
2087
2088			case O_PROTO:
2089				/*
2090				 * We do not allow an arg of 0, so the
2091				 * check of "proto" alone suffices.
2092				 */
2093				match = (proto == cmd->arg1);
2094				break;
2095
2096			case O_IP_SRC:
2097				match = (hlen > 0 &&
2098				    ((ipfw_insn_ip *)cmd)->addr.s_addr ==
2099				    src_ip.s_addr);
2100				break;
2101
2102			case O_IP_SRC_LOOKUP:
2103			case O_IP_DST_LOOKUP:
2104				if (hlen > 0) {
2105				    uint32_t a =
2106					(cmd->opcode == O_IP_DST_LOOKUP) ?
2107					    dst_ip.s_addr : src_ip.s_addr;
2108				    uint32_t v;
2109
2110				    match = lookup_table(cmd->arg1, a, &v);
2111				    if (!match)
2112					break;
2113				    if (cmdlen == F_INSN_SIZE(ipfw_insn_u32))
2114					match =
2115					    ((ipfw_insn_u32 *)cmd)->d[0] == v;
2116				}
2117				break;
2118
2119			case O_IP_SRC_MASK:
2120			case O_IP_DST_MASK:
2121				if (hlen > 0) {
2122				    uint32_t a =
2123					(cmd->opcode == O_IP_DST_MASK) ?
2124					    dst_ip.s_addr : src_ip.s_addr;
2125				    uint32_t *p = ((ipfw_insn_u32 *)cmd)->d;
2126				    int i = cmdlen-1;
2127
2128				    for (; !match && i>0; i-= 2, p+= 2)
2129					match = (p[0] == (a & p[1]));
2130				}
2131				break;
2132
2133			case O_IP_SRC_ME:
2134				if (hlen > 0) {
2135					struct ifnet *tif;
2136
2137					INADDR_TO_IFP(src_ip, tif);
2138					match = (tif != NULL);
2139				}
2140				break;
2141
2142			case O_IP_DST_SET:
2143			case O_IP_SRC_SET:
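				/*
				 * The set is encoded as a base address in
				 * d[0], a width of cmd->arg1 addresses, and a
				 * bitmap in d[1..] with one bit per address
				 * starting at the base.
				 */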
2144				if (hlen > 0) {
2145					u_int32_t *d = (u_int32_t *)(cmd+1);
2146					u_int32_t addr =
2147					    cmd->opcode == O_IP_DST_SET ?
2148						args->f_id.dst_ip :
2149						args->f_id.src_ip;
2150
2151					    if (addr < d[0])
2152						    break;
2153					    addr -= d[0]; /* subtract base */
2154					    match = (addr < cmd->arg1) &&
2155						( d[ 1 + (addr>>5)] &
2156						  (1<<(addr & 0x1f)) );
2157				}
2158				break;
2159
2160			case O_IP_DST:
2161				match = (hlen > 0 &&
2162				    ((ipfw_insn_ip *)cmd)->addr.s_addr ==
2163				    dst_ip.s_addr);
2164				break;
2165
2166			case O_IP_DST_ME:
2167				if (hlen > 0) {
2168					struct ifnet *tif;
2169
2170					INADDR_TO_IFP(dst_ip, tif);
2171					match = (tif != NULL);
2172				}
2173				break;
2174
2175			case O_IP_SRCPORT:
2176			case O_IP_DSTPORT:
2177				/*
2178				 * offset == 0 && proto != 0 is enough
2179				 * to guarantee that we have an IPv4
2180				 * packet with port info.
2181				 */
2182				if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP)
2183				    && offset == 0) {
2184					u_int16_t x =
2185					    (cmd->opcode == O_IP_SRCPORT) ?
2186						src_port : dst_port ;
2187					u_int16_t *p =
2188					    ((ipfw_insn_u16 *)cmd)->ports;
2189					int i;
2190
2191					for (i = cmdlen - 1; !match && i>0;
2192					    i--, p += 2)
2193						match = (x>=p[0] && x<=p[1]);
2194				}
2195				break;
2196
2197			case O_ICMPTYPE:
2198				match = (offset == 0 && proto==IPPROTO_ICMP &&
2199				    icmptype_match(ip, (ipfw_insn_u32 *)cmd) );
2200				break;
2201
2202			case O_IPOPT:
2203				match = (hlen > 0 && ipopts_match(ip, cmd) );
2204				break;
2205
2206			case O_IPVER:
2207				match = (hlen > 0 && cmd->arg1 == ip->ip_v);
2208				break;
2209
2210			case O_IPID:
2211			case O_IPLEN:
2212			case O_IPTTL:
2213				if (hlen > 0) {	/* only for IP packets */
2214				    uint16_t x;
2215				    uint16_t *p;
2216				    int i;
2217
2218				    if (cmd->opcode == O_IPLEN)
2219					x = ip_len;
2220				    else if (cmd->opcode == O_IPTTL)
2221					x = ip->ip_ttl;
2222				    else /* must be IPID */
2223					x = ntohs(ip->ip_id);
2224				    if (cmdlen == 1) {
2225					match = (cmd->arg1 == x);
2226					break;
2227				    }
2228				    /* otherwise we have ranges */
2229				    p = ((ipfw_insn_u16 *)cmd)->ports;
2230				    i = cmdlen - 1;
2231				    for (; !match && i>0; i--, p += 2)
2232					match = (x >= p[0] && x <= p[1]);
2233				}
2234				break;
2235
2236			case O_IPPRECEDENCE:
2237				match = (hlen > 0 &&
2238				    (cmd->arg1 == (ip->ip_tos & 0xe0)) );
2239				break;
2240
2241			case O_IPTOS:
2242				match = (hlen > 0 &&
2243				    flags_match(cmd, ip->ip_tos));
2244				break;
2245
2246			case O_TCPDATALEN:
2247				if (proto == IPPROTO_TCP && offset == 0) {
2248				    struct tcphdr *tcp;
2249				    uint16_t x;
2250				    uint16_t *p;
2251				    int i;
2252
2253				    tcp = L3HDR(struct tcphdr,ip);
2254				    x = ip_len -
2255					((ip->ip_hl + tcp->th_off) << 2);
2256				    if (cmdlen == 1) {
2257					match = (cmd->arg1 == x);
2258					break;
2259				    }
2260				    /* otherwise we have ranges */
2261				    p = ((ipfw_insn_u16 *)cmd)->ports;
2262				    i = cmdlen - 1;
2263				    for (; !match && i>0; i--, p += 2)
2264					match = (x >= p[0] && x <= p[1]);
2265				}
2266				break;
2267
2268			case O_TCPFLAGS:
2269				match = (proto == IPPROTO_TCP && offset == 0 &&
2270				    flags_match(cmd,
2271					L3HDR(struct tcphdr,ip)->th_flags));
2272				break;
2273
2274			case O_TCPOPTS:
2275				match = (proto == IPPROTO_TCP && offset == 0 &&
2276				    tcpopts_match(ip, cmd));
2277				break;
2278
2279			case O_TCPSEQ:
2280				match = (proto == IPPROTO_TCP && offset == 0 &&
2281				    ((ipfw_insn_u32 *)cmd)->d[0] ==
2282					L3HDR(struct tcphdr,ip)->th_seq);
2283				break;
2284
2285			case O_TCPACK:
2286				match = (proto == IPPROTO_TCP && offset == 0 &&
2287				    ((ipfw_insn_u32 *)cmd)->d[0] ==
2288					L3HDR(struct tcphdr,ip)->th_ack);
2289				break;
2290
2291			case O_TCPWIN:
2292				match = (proto == IPPROTO_TCP && offset == 0 &&
2293				    cmd->arg1 ==
2294					L3HDR(struct tcphdr,ip)->th_win);
2295				break;
2296
2297			case O_ESTAB:
2298				/* reject packets which have SYN only */
2299				/* XXX should I also check for TH_ACK ? */
2300				match = (proto == IPPROTO_TCP && offset == 0 &&
2301				    (L3HDR(struct tcphdr,ip)->th_flags &
2302				     (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
2303				break;
2304
2305			case O_ALTQ: {
2306				struct altq_tag *at;
2307				ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
2308
2309				match = 1;
2310				mtag = m_tag_get(PACKET_TAG_PF_QID,
2311						sizeof(struct altq_tag),
2312						M_NOWAIT);
2313				if (mtag == NULL) {
2314					/*
2315					 * Let the packet fall back to the
2316					 * default ALTQ.
2317					 */
2318					break;
2319				}
2320				at = (struct altq_tag *)(mtag+1);
2321				at->qid = altq->qid;
2322				if (hlen != 0)
2323					at->af = AF_INET;
2324				else
2325					at->af = AF_LINK;
2326				at->hdr = ip;
2327				m_tag_prepend(m, mtag);
2328				break;
2329			}
2330
2331			case O_LOG:
2332				if (fw_verbose)
2333					ipfw_log(f, hlen, args->eh, m, oif);
2334				match = 1;
2335				break;
2336
2337			case O_PROB:
2338				match = (random()<((ipfw_insn_u32 *)cmd)->d[0]);
2339				break;
2340
2341			case O_VERREVPATH:
2342				/* Outgoing packets automatically pass/match */
2343				match = (hlen > 0 && ((oif != NULL) ||
2344				    (m->m_pkthdr.rcvif == NULL) ||
2345				    verify_path(src_ip, m->m_pkthdr.rcvif)));
2346				break;
2347
2348			case O_VERSRCREACH:
2349				/* Outgoing packets automatically pass/match */
2350				match = (hlen > 0 && ((oif != NULL) ||
2351				     verify_path(src_ip, NULL)));
2352				break;
2353
2354			case O_ANTISPOOF:
2355				/* Outgoing packets automatically pass/match */
2356				if (oif == NULL && hlen > 0 &&
2357				    in_localaddr(src_ip))
2358					match = verify_path(src_ip,
2359							m->m_pkthdr.rcvif);
2360				else
2361					match = 1;
2362				break;
2363
2364			case O_IPSEC:
2365#ifdef FAST_IPSEC
2366				match = (m_tag_find(m,
2367				    PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL);
2368#endif
2369#ifdef IPSEC
2370				match = (ipsec_getnhist(m) != 0);
2371#endif
2372				/* otherwise no match */
2373				break;
2374
2375			/*
2376			 * The second set of opcodes represents 'actions',
2377			 * i.e. the terminal part of a rule once the packet
2378			 * matches all previous patterns.
2379			 * Typically there is only one action for each rule,
2380			 * and the opcode is stored at the end of the rule
2381			 * (but there are exceptions -- see below).
2382			 *
2383			 * In general, here we set retval and terminate the
2384			 * outer loop (would be a 'break 3' in some language,
2385			 * but we need to do a 'goto done').
2386			 *
2387			 * Exceptions:
2388			 * O_COUNT and O_SKIPTO actions:
2389			 *   instead of terminating, we jump to the next rule
2390			 *   ('goto next_rule', equivalent to a 'break 2'),
2391			 *   or to the SKIPTO target ('goto again' after
2392			 *   having set f, cmd and l), respectively.
2393			 *
2394			 * O_LOG and O_ALTQ action parameters:
2395			 *   perform some action and set match = 1;
2396			 *
2397			 * O_LIMIT and O_KEEP_STATE: these opcodes are
2398			 *   not real 'actions', and are stored right
2399			 *   before the 'action' part of the rule.
2400			 *   These opcodes try to install an entry in the
2401			 *   state tables; if successful, we continue with
2402			 *   the next opcode (match=1; break;), otherwise
2403			 *   the packet must be dropped
2404			 *   ('goto done' after setting retval);
2405			 *
2406			 * O_PROBE_STATE and O_CHECK_STATE: these opcodes
2407			 *   cause a lookup of the state table, and a jump
2408			 *   to the 'action' part of the parent rule
2409			 *   ('goto check_body') if an entry is found, or
2410			 *   (CHECK_STATE only) a jump to the next rule if
2411			 *   the entry is not found ('goto next_rule').
2412			 *   The result of the lookup is cached so that
2413			 *   further instances of these opcodes are
2414			 *   effectively NOPs.
2415			 */
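			/*
			 * Illustrative example (hypothetical rule, not part
			 * of this file): "count ip from 10.0.0.0/8 to any"
			 * would typically compile to an O_IP_SRC_MASK match
			 * instruction followed by the O_COUNT action.
			 */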
2416			case O_LIMIT:
2417			case O_KEEP_STATE:
2418				if (install_state(f,
2419				    (ipfw_insn_limit *)cmd, args)) {
2420					retval = IP_FW_DENY;
2421					goto done; /* error/limit violation */
2422				}
2423				match = 1;
2424				break;
2425
2426			case O_PROBE_STATE:
2427			case O_CHECK_STATE:
2428				/*
2429				 * dynamic rules are checked at the first
2430				 * keep-state or check-state occurrence,
2431				 * with the result being stored in dyn_dir.
2432				 * The compiler introduces a PROBE_STATE
2433				 * instruction for us when we have a
2434				 * KEEP_STATE (because PROBE_STATE needs
2435				 * to be run first).
2436				 */
2437				if (dyn_dir == MATCH_UNKNOWN &&
2438				    (q = lookup_dyn_rule(&args->f_id,
2439				     &dyn_dir, proto == IPPROTO_TCP ?
2440					L3HDR(struct tcphdr, ip) : NULL))
2441					!= NULL) {
2442					/*
2443					 * Found dynamic entry, update stats
2444					 * and jump to the 'action' part of
2445					 * the parent rule.
2446					 */
2447					q->pcnt++;
2448					q->bcnt += pktlen;
2449					f = q->rule;
2450					cmd = ACTION_PTR(f);
2451					l = f->cmd_len - f->act_ofs;
2452					IPFW_DYN_UNLOCK();
2453					goto check_body;
2454				}
2455				/*
2456				 * Dynamic entry not found. If CHECK_STATE,
2457				 * skip to next rule, if PROBE_STATE just
2458				 * ignore and continue with next opcode.
2459				 */
2460				if (cmd->opcode == O_CHECK_STATE)
2461					goto next_rule;
2462				match = 1;
2463				break;
2464
2465			case O_ACCEPT:
2466				retval = 0;	/* accept */
2467				goto done;
2468
2469			case O_PIPE:
2470			case O_QUEUE:
2471				args->rule = f; /* report matching rule */
2472				args->cookie = cmd->arg1;
2473				retval = IP_FW_DUMMYNET;
2474				goto done;
2475
2476			case O_DIVERT:
2477			case O_TEE: {
2478				struct divert_tag *dt;
2479
2480				if (args->eh) /* not on layer 2 */
2481					break;
2482				mtag = m_tag_get(PACKET_TAG_DIVERT,
2483						sizeof(struct divert_tag),
2484						M_NOWAIT);
2485				if (mtag == NULL) {
2486					/* XXX statistic */
2487					/* drop packet */
2488					IPFW_RUNLOCK(chain);
2489					return (IP_FW_DENY);
2490				}
2491				dt = (struct divert_tag *)(mtag+1);
2492				dt->cookie = f->rulenum;
2493				dt->info = cmd->arg1;
2494				m_tag_prepend(m, mtag);
2495				retval = (cmd->opcode == O_DIVERT) ?
2496				    IP_FW_DIVERT : IP_FW_TEE;
2497				goto done;
2498			}
2499
2500			case O_COUNT:
2501			case O_SKIPTO:
2502				f->pcnt++;	/* update stats */
2503				f->bcnt += pktlen;
2504				f->timestamp = time_second;
2505				if (cmd->opcode == O_COUNT)
2506					goto next_rule;
2507				/* handle skipto */
2508				if (f->next_rule == NULL)
2509					lookup_next_rule(f);
2510				f = f->next_rule;
2511				goto again;
2512
2513			case O_REJECT:
2514				/*
2515				 * Drop the packet and send a reject notice
2516				 * if the packet is not ICMP (or is an ICMP
2517				 * query), and it is not multicast/broadcast.
2518				 */
2519				if (hlen > 0 &&
2520				    (proto != IPPROTO_ICMP ||
2521				     is_icmp_query(ip)) &&
2522				    !(m->m_flags & (M_BCAST|M_MCAST)) &&
2523				    !IN_MULTICAST(ntohl(dst_ip.s_addr))) {
2524					send_reject(args, cmd->arg1,
2525					    offset,ip_len);
2526					m = args->m;
2527				}
2528				/* FALLTHROUGH */
2529			case O_DENY:
2530				retval = IP_FW_DENY;
2531				goto done;
2532
2533			case O_FORWARD_IP:
2534				if (args->eh)	/* not valid on layer2 pkts */
2535					break;
2536				if (!q || dyn_dir == MATCH_FORWARD)
2537					args->next_hop =
2538					    &((ipfw_insn_sa *)cmd)->sa;
2539				retval = IP_FW_PASS;
2540				goto done;
2541
2542			case O_NETGRAPH:
2543			case O_NGTEE:
2544				args->rule = f;	/* report matching rule */
2545				args->cookie = cmd->arg1;
2546				retval = (cmd->opcode == O_NETGRAPH) ?
2547				    IP_FW_NETGRAPH : IP_FW_NGTEE;
2548				goto done;
2549
2550			default:
2551				panic("-- unknown opcode %d\n", cmd->opcode);
2552			} /* end of switch() on opcodes */
2553
2554			if (cmd->len & F_NOT)
2555				match = !match;
2556
2557			if (match) {
2558				if (cmd->len & F_OR)
2559					skip_or = 1;
2560			} else {
2561				if (!(cmd->len & F_OR)) /* not an OR block, */
2562					break;		/* try next rule    */
2563			}
2564
2565		}	/* end of inner for, scan opcodes */
2566
2567next_rule:;		/* try next rule		*/
2568
2569	}		/* end of outer for, scan rules */
2570	printf("ipfw: ouch!, skip past end of rules, denying packet\n");
2571	IPFW_RUNLOCK(chain);
2572	return (IP_FW_DENY);
2573
2574done:
2575	/* Update statistics */
2576	f->pcnt++;
2577	f->bcnt += pktlen;
2578	f->timestamp = time_second;
2579	IPFW_RUNLOCK(chain);
2580	return (retval);
2581
2582pullup_failed:
2583	if (fw_verbose)
2584		printf("ipfw: pullup failed\n");
2585	return (IP_FW_DENY);
2586}
2587
2588/*
2589 * When a rule is added/deleted, clear the next_rule pointers in all rules.
2590 * These will be reconstructed on the fly as packets are matched.
2591 */
2592static void
2593flush_rule_ptrs(struct ip_fw_chain *chain)
2594{
2595	struct ip_fw *rule;
2596
2597	IPFW_WLOCK_ASSERT(chain);
2598
2599	for (rule = chain->rules; rule; rule = rule->next)
2600		rule->next_rule = NULL;
2601}
2602
2603/*
2604 * When pipes/queues are deleted, clear the "pipe_ptr" pointer to a given
2605 * pipe/queue, or to all of them (match == NULL).
2606 */
2607void
2608flush_pipe_ptrs(struct dn_flow_set *match)
2609{
2610	struct ip_fw *rule;
2611
2612	IPFW_WLOCK(&layer3_chain);
2613	for (rule = layer3_chain.rules; rule; rule = rule->next) {
2614		ipfw_insn_pipe *cmd = (ipfw_insn_pipe *)ACTION_PTR(rule);
2615
2616		if (cmd->o.opcode != O_PIPE && cmd->o.opcode != O_QUEUE)
2617			continue;
2618		/*
2619		 * XXX Use bcmp/bzero to handle pipe_ptr to overcome
2620		 * possible alignment problems on 64-bit architectures.
2621		 * This code is seldom used so we do not worry too
2622		 * much about efficiency.
2623		 */
2624		if (match == NULL ||
2625		    !bcmp(&cmd->pipe_ptr, &match, sizeof(match)) )
2626			bzero(&cmd->pipe_ptr, sizeof(cmd->pipe_ptr));
2627	}
2628	IPFW_WUNLOCK(&layer3_chain);
2629}
2630
2631/*
2632 * Add a new rule to the list. Copy the rule into a malloc'ed area, then
2633 * possibly create a rule number and add the rule to the list.
2634 * Update the rulenum in the input struct so the caller knows it as well.
2635 */
2636static int
2637add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
2638{
2639	struct ip_fw *rule, *f, *prev;
2640	int l = RULESIZE(input_rule);
2641
2642	if (chain->rules == NULL && input_rule->rulenum != IPFW_DEFAULT_RULE)
2643		return (EINVAL);
2644
2645	rule = malloc(l, M_IPFW, M_NOWAIT | M_ZERO);
2646	if (rule == NULL)
2647		return (ENOSPC);
2648
2649	bcopy(input_rule, rule, l);
2650
2651	rule->next = NULL;
2652	rule->next_rule = NULL;
2653
2654	rule->pcnt = 0;
2655	rule->bcnt = 0;
2656	rule->timestamp = 0;
2657
2658	IPFW_WLOCK(chain);
2659
2660	if (chain->rules == NULL) {	/* default rule */
2661		chain->rules = rule;
2662		goto done;
2663	}
2664
2665	/*
2666	 * If rulenum is 0, find highest numbered rule before the
2667	 * default rule, and add autoinc_step
2668	 */
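	/*
	 * Illustrative example: with autoinc_step = 100 and existing rules
	 * 100 and 200, a rule submitted with rulenum 0 is installed as
	 * rule 300.
	 */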
2669	if (autoinc_step < 1)
2670		autoinc_step = 1;
2671	else if (autoinc_step > 1000)
2672		autoinc_step = 1000;
2673	if (rule->rulenum == 0) {
2674		/*
2675		 * locate the highest numbered rule before default
2676		 */
2677		for (f = chain->rules; f; f = f->next) {
2678			if (f->rulenum == IPFW_DEFAULT_RULE)
2679				break;
2680			rule->rulenum = f->rulenum;
2681		}
2682		if (rule->rulenum < IPFW_DEFAULT_RULE - autoinc_step)
2683			rule->rulenum += autoinc_step;
2684		input_rule->rulenum = rule->rulenum;
2685	}
2686
2687	/*
2688	 * Now insert the new rule in the right place in the sorted list.
2689	 */
2690	for (prev = NULL, f = chain->rules; f; prev = f, f = f->next) {
2691		if (f->rulenum > rule->rulenum) { /* found the location */
2692			if (prev) {
2693				rule->next = f;
2694				prev->next = rule;
2695			} else { /* head insert */
2696				rule->next = chain->rules;
2697				chain->rules = rule;
2698			}
2699			break;
2700		}
2701	}
2702	flush_rule_ptrs(chain);
2703done:
2704	static_count++;
2705	static_len += l;
2706	IPFW_WUNLOCK(chain);
2707	DEB(printf("ipfw: installed rule %d, static count now %d\n",
2708		rule->rulenum, static_count);)
2709	return (0);
2710}
2711
2712/**
2713 * Remove a static rule (including derived dynamic rules)
2714 * and place it on the ``reap list'' for later reclamation.
2715 * The caller is in charge of clearing rule pointers to avoid
2716 * dangling pointers.
2717 * @return a pointer to the next entry.
2718 * Arguments are not checked, so they better be correct.
2719 */
2720static struct ip_fw *
2721remove_rule(struct ip_fw_chain *chain, struct ip_fw *rule, struct ip_fw *prev)
2722{
2723	struct ip_fw *n;
2724	int l = RULESIZE(rule);
2725
2726	IPFW_WLOCK_ASSERT(chain);
2727
2728	n = rule->next;
2729	IPFW_DYN_LOCK();
2730	remove_dyn_rule(rule, NULL /* force removal */);
2731	IPFW_DYN_UNLOCK();
2732	if (prev == NULL)
2733		chain->rules = n;
2734	else
2735		prev->next = n;
2736	static_count--;
2737	static_len -= l;
2738
2739	rule->next = chain->reap;
2740	chain->reap = rule;
2741
2742	return n;
2743}
2744
2745/**
2746 * Reclaim storage associated with a list of rules.  This is
2747 * typically the list created using remove_rule.
2748 */
2749static void
2750reap_rules(struct ip_fw *head)
2751{
2752	struct ip_fw *rule;
2753
2754	while ((rule = head) != NULL) {
2755		head = head->next;
2756		if (DUMMYNET_LOADED)
2757			ip_dn_ruledel_ptr(rule);
2758		free(rule, M_IPFW);
2759	}
2760}
2761
2762/*
2763 * Remove all rules from a chain (except rules in set RESVD_SET
2764 * unless kill_default = 1).  The caller is responsible for
2765 * reclaiming storage for the rules left in chain->reap.
2766 */
2767static void
2768free_chain(struct ip_fw_chain *chain, int kill_default)
2769{
2770	struct ip_fw *prev, *rule;
2771
2772	IPFW_WLOCK_ASSERT(chain);
2773
2774	flush_rule_ptrs(chain); /* more efficient to do outside the loop */
2775	for (prev = NULL, rule = chain->rules; rule ; )
2776		if (kill_default || rule->set != RESVD_SET)
2777			rule = remove_rule(chain, rule, prev);
2778		else {
2779			prev = rule;
2780			rule = rule->next;
2781		}
2782}
2783
2784/**
2785 * Remove all rules with given number, and also do set manipulation.
2786 * Assumes chain != NULL && *chain != NULL.
2787 *
2788 * The argument is a u_int32_t. The low 16 bits are the rule or set number,
2789 * the next 8 bits are the new set, the top 8 bits are the command:
2790 *
2791 *	0	delete rules with given number
2792 *	1	delete rules with given set number
2793 *	2	move rules with given number to new set
2794 *	3	move rules with given set number to new set
2795 *	4	swap sets with given numbers
2796 */
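/*
 * Illustrative example: to move rule 100 to set 5, userland would pass
 * arg = (2 << 24) | (5 << 16) | 100.
 */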
2797static int
2798del_entry(struct ip_fw_chain *chain, u_int32_t arg)
2799{
2800	struct ip_fw *prev = NULL, *rule;
2801	u_int16_t rulenum;	/* rule or old_set */
2802	u_int8_t cmd, new_set;
2803
2804	rulenum = arg & 0xffff;
2805	cmd = (arg >> 24) & 0xff;
2806	new_set = (arg >> 16) & 0xff;
2807
2808	if (cmd > 4)
2809		return EINVAL;
2810	if (new_set > RESVD_SET)
2811		return EINVAL;
2812	if (cmd == 0 || cmd == 2) {
2813		if (rulenum >= IPFW_DEFAULT_RULE)
2814			return EINVAL;
2815	} else {
2816		if (rulenum > RESVD_SET)	/* old_set */
2817			return EINVAL;
2818	}
2819
2820	IPFW_WLOCK(chain);
2821	rule = chain->rules;
2822	chain->reap = NULL;
2823	switch (cmd) {
2824	case 0:	/* delete rules with given number */
2825		/*
2826		 * locate first rule to delete
2827		 */
2828		for (; rule->rulenum < rulenum; prev = rule, rule = rule->next)
2829			;
2830		if (rule->rulenum != rulenum) {
2831			IPFW_WUNLOCK(chain);
2832			return EINVAL;
2833		}
2834
2835		/*
2836		 * flush pointers outside the loop, then delete all matching
2837		 * rules. prev remains the same throughout the cycle.
2838		 */
2839		flush_rule_ptrs(chain);
2840		while (rule->rulenum == rulenum)
2841			rule = remove_rule(chain, rule, prev);
2842		break;
2843
2844	case 1:	/* delete all rules with given set number */
2845		flush_rule_ptrs(chain);
2846		rule = chain->rules;
2847		while (rule->rulenum < IPFW_DEFAULT_RULE)
2848			if (rule->set == rulenum)
2849				rule = remove_rule(chain, rule, prev);
2850			else {
2851				prev = rule;
2852				rule = rule->next;
2853			}
2854		break;
2855
2856	case 2:	/* move rules with given number to new set */
2857		rule = chain->rules;
2858		for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next)
2859			if (rule->rulenum == rulenum)
2860				rule->set = new_set;
2861		break;
2862
2863	case 3: /* move rules with given set number to new set */
2864		for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next)
2865			if (rule->set == rulenum)
2866				rule->set = new_set;
2867		break;
2868
2869	case 4: /* swap two sets */
2870		for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next)
2871			if (rule->set == rulenum)
2872				rule->set = new_set;
2873			else if (rule->set == new_set)
2874				rule->set = rulenum;
2875		break;
2876	}
2877	/*
2878	 * Look for rules to reclaim.  We grab the list before
2879	 * releasing the lock then reclaim them w/o the lock to
2880	 * avoid a LOR with dummynet.
2881	 */
2882	rule = chain->reap;
2883	chain->reap = NULL;
2884	IPFW_WUNLOCK(chain);
2885	if (rule)
2886		reap_rules(rule);
2887	return 0;
2888}
2889
2890/*
2891 * Clear counters for a specific rule.
2892 * The enclosing "table" is assumed locked.
2893 */
2894static void
2895clear_counters(struct ip_fw *rule, int log_only)
2896{
2897	ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
2898
2899	if (log_only == 0) {
2900		rule->bcnt = rule->pcnt = 0;
2901		rule->timestamp = 0;
2902	}
2903	if (l->o.opcode == O_LOG)
2904		l->log_left = l->max_log;
2905}
2906
2907/**
2908 * Reset some or all counters on firewall rules.
2909 * @arg rulenum is 0 to clear all entries, or contains a specific
2910 * rule number.
2911 * @arg log_only is 1 if we only want to reset logs, zero otherwise.
2912 */
2913static int
2914zero_entry(struct ip_fw_chain *chain, int rulenum, int log_only)
2915{
2916	struct ip_fw *rule;
2917	char *msg;
2918
2919	IPFW_WLOCK(chain);
2920	if (rulenum == 0) {
2921		norule_counter = 0;
2922		for (rule = chain->rules; rule; rule = rule->next)
2923			clear_counters(rule, log_only);
2924		msg = log_only ? "ipfw: All logging counts reset.\n" :
2925				"ipfw: Accounting cleared.\n";
2926	} else {
2927		int cleared = 0;
2928		/*
2929		 * We can have multiple rules with the same number, so we
2930		 * need to clear them all.
2931		 */
2932		for (rule = chain->rules; rule; rule = rule->next)
2933			if (rule->rulenum == rulenum) {
2934				while (rule && rule->rulenum == rulenum) {
2935					clear_counters(rule, log_only);
2936					rule = rule->next;
2937				}
2938				cleared = 1;
2939				break;
2940			}
2941		if (!cleared) {	/* we did not find any matching rules */
2942			IPFW_WUNLOCK(chain);
2943			return (EINVAL);
2944		}
2945		msg = log_only ? "ipfw: Entry %d logging count reset.\n" :
2946				"ipfw: Entry %d cleared.\n";
2947	}
2948	IPFW_WUNLOCK(chain);
2949
2950	if (fw_verbose)
2951		log(LOG_SECURITY | LOG_NOTICE, msg, rulenum);
2952	return (0);
2953}
2954
2955/*
2956 * Check validity of the structure before insert.
2957 * Fortunately rules are simple, so this mostly needs to check rule sizes.
2958 */
2959static int
2960check_ipfw_struct(struct ip_fw *rule, int size)
2961{
2962	int l, cmdlen = 0;
2963	int have_action=0;
2964	ipfw_insn *cmd;
2965
2966	if (size < sizeof(*rule)) {
2967		printf("ipfw: rule too short\n");
2968		return (EINVAL);
2969	}
2970	/* first, check for valid size */
2971	l = RULESIZE(rule);
2972	if (l != size) {
2973		printf("ipfw: size mismatch (have %d want %d)\n", size, l);
2974		return (EINVAL);
2975	}
2976	if (rule->act_ofs >= rule->cmd_len) {
2977		printf("ipfw: bogus action offset (%u > %u)\n",
2978		    rule->act_ofs, rule->cmd_len - 1);
2979		return (EINVAL);
2980	}
2981	/*
2982	 * Now go for the individual checks. Very simple ones, basically only
2983	 * instruction sizes.
2984	 */
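	/*
	 * Note: cmd_len and F_LEN() are expressed in 32-bit words, and
	 * ipfw_insn is one word, so "cmd += cmdlen" advances to the next
	 * microinstruction.
	 */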
2985	for (l = rule->cmd_len, cmd = rule->cmd ;
2986			l > 0 ; l -= cmdlen, cmd += cmdlen) {
2987		cmdlen = F_LEN(cmd);
2988		if (cmdlen > l) {
2989			printf("ipfw: opcode %d size truncated\n",
2990			    cmd->opcode);
2991			return EINVAL;
2992		}
2993		DEB(printf("ipfw: opcode %d\n", cmd->opcode);)
2994		switch (cmd->opcode) {
2995		case O_PROBE_STATE:
2996		case O_KEEP_STATE:
2997		case O_PROTO:
2998		case O_IP_SRC_ME:
2999		case O_IP_DST_ME:
3000		case O_LAYER2:
3001		case O_IN:
3002		case O_FRAG:
3003		case O_DIVERTED:
3004		case O_IPOPT:
3005		case O_IPTOS:
3006		case O_IPPRECEDENCE:
3007		case O_IPVER:
3008		case O_TCPWIN:
3009		case O_TCPFLAGS:
3010		case O_TCPOPTS:
3011		case O_ESTAB:
3012		case O_VERREVPATH:
3013		case O_VERSRCREACH:
3014		case O_ANTISPOOF:
3015		case O_IPSEC:
3016			if (cmdlen != F_INSN_SIZE(ipfw_insn))
3017				goto bad_size;
3018			break;
3019
3020		case O_UID:
3021		case O_GID:
3022		case O_JAIL:
3023		case O_IP_SRC:
3024		case O_IP_DST:
3025		case O_TCPSEQ:
3026		case O_TCPACK:
3027		case O_PROB:
3028		case O_ICMPTYPE:
3029			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
3030				goto bad_size;
3031			break;
3032
3033		case O_LIMIT:
3034			if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
3035				goto bad_size;
3036			break;
3037
3038		case O_LOG:
3039			if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
3040				goto bad_size;
3041
3042			((ipfw_insn_log *)cmd)->log_left =
3043			    ((ipfw_insn_log *)cmd)->max_log;
3044
3045			break;
3046
3047		case O_IP_SRC_MASK:
3048		case O_IP_DST_MASK:
3049			/* only odd command lengths */
3050			if ( !(cmdlen & 1) || cmdlen > 31)
3051				goto bad_size;
3052			break;
3053
3054		case O_IP_SRC_SET:
3055		case O_IP_DST_SET:
3056			if (cmd->arg1 == 0 || cmd->arg1 > 256) {
3057				printf("ipfw: invalid set size %d\n",
3058					cmd->arg1);
3059				return EINVAL;
3060			}
3061			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
3062			    (cmd->arg1+31)/32 )
3063				goto bad_size;
3064			break;
3065
3066		case O_IP_SRC_LOOKUP:
3067		case O_IP_DST_LOOKUP:
3068			if (cmd->arg1 >= IPFW_TABLES_MAX) {
3069				printf("ipfw: invalid table number %d\n",
3070				    cmd->arg1);
3071				return (EINVAL);
3072			}
3073			if (cmdlen != F_INSN_SIZE(ipfw_insn) &&
3074			    cmdlen != F_INSN_SIZE(ipfw_insn_u32))
3075				goto bad_size;
3076			break;
3077
3078		case O_MACADDR2:
3079			if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
3080				goto bad_size;
3081			break;
3082
3083		case O_NOP:
3084		case O_IPID:
3085		case O_IPTTL:
3086		case O_IPLEN:
3087		case O_TCPDATALEN:
3088			if (cmdlen < 1 || cmdlen > 31)
3089				goto bad_size;
3090			break;
3091
3092		case O_MAC_TYPE:
3093		case O_IP_SRCPORT:
3094		case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
3095			if (cmdlen < 2 || cmdlen > 31)
3096				goto bad_size;
3097			break;
3098
3099		case O_RECV:
3100		case O_XMIT:
3101		case O_VIA:
3102			if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
3103				goto bad_size;
3104			break;
3105
3106		case O_ALTQ:
3107			if (cmdlen != F_INSN_SIZE(ipfw_insn_altq))
3108				goto bad_size;
3109			break;
3110
3111		case O_PIPE:
3112		case O_QUEUE:
3113			if (cmdlen != F_INSN_SIZE(ipfw_insn_pipe))
3114				goto bad_size;
3115			goto check_action;
3116
3117		case O_FORWARD_IP:
3118#ifdef	IPFIREWALL_FORWARD
3119			if (cmdlen != F_INSN_SIZE(ipfw_insn_sa))
3120				goto bad_size;
3121			goto check_action;
3122#else
3123			return EINVAL;
3124#endif
3125
3126		case O_DIVERT:
3127		case O_TEE:
3128			if (ip_divert_ptr == NULL)
3129				return EINVAL;
3130			else
3131				goto check_size;
3132		case O_NETGRAPH:
3133		case O_NGTEE:
3134			if (!NG_IPFW_LOADED)
3135				return EINVAL;
3136			else
3137				goto check_size;
3138		case O_FORWARD_MAC: /* XXX not implemented yet */
3139		case O_CHECK_STATE:
3140		case O_COUNT:
3141		case O_ACCEPT:
3142		case O_DENY:
3143		case O_REJECT:
3144		case O_SKIPTO:
3145check_size:
3146			if (cmdlen != F_INSN_SIZE(ipfw_insn))
3147				goto bad_size;
3148check_action:
3149			if (have_action) {
3150				printf("ipfw: opcode %d, multiple actions"
3151					" not allowed\n",
3152					cmd->opcode);
3153				return EINVAL;
3154			}
3155			have_action = 1;
3156			if (l != cmdlen) {
3157				printf("ipfw: opcode %d, action must be"
3158					" last opcode\n",
3159					cmd->opcode);
3160				return EINVAL;
3161			}
3162			break;
3163		default:
3164			printf("ipfw: opcode %d, unknown opcode\n",
3165				cmd->opcode);
3166			return EINVAL;
3167		}
3168	}
3169	if (have_action == 0) {
3170		printf("ipfw: missing action\n");
3171		return EINVAL;
3172	}
3173	return 0;
3174
3175bad_size:
3176	printf("ipfw: opcode %d size %d wrong\n",
3177		cmd->opcode, cmdlen);
3178	return EINVAL;
3179}
3180
3181/*
3182 * Copy the static and dynamic rules to the supplied buffer
3183 * and return the amount of space actually used.
3184 */
3185static size_t
3186ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
3187{
3188	char *bp = buf;
3189	char *ep = bp + space;
3190	struct ip_fw *rule;
3191	int i;
3192
3193	/* XXX this can take a long time and locking will block packet flow */
3194	IPFW_RLOCK(chain);
3195	for (rule = chain->rules; rule ; rule = rule->next) {
3196		/*
3197		 * Verify the entry fits in the buffer in case the
3198		 * rules changed between calculating buffer space and
3199		 * now.  This would be better done using a generation
3200		 * number but should suffice for now.
3201		 */
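		/*
		 * Note that set_disable is copied over the next_rule field
		 * of each rule passed to userland, which is expected to
		 * recover it from there.
		 */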
3202		i = RULESIZE(rule);
3203		if (bp + i <= ep) {
3204			bcopy(rule, bp, i);
3205			bcopy(&set_disable, &(((struct ip_fw *)bp)->next_rule),
3206			    sizeof(set_disable));
3207			bp += i;
3208		}
3209	}
3210	IPFW_RUNLOCK(chain);
3211	if (ipfw_dyn_v) {
3212		ipfw_dyn_rule *p, *last = NULL;
3213
3214		IPFW_DYN_LOCK();
3215		for (i = 0 ; i < curr_dyn_buckets; i++)
3216			for (p = ipfw_dyn_v[i] ; p != NULL; p = p->next) {
3217				if (bp + sizeof *p <= ep) {
3218					ipfw_dyn_rule *dst =
3219						(ipfw_dyn_rule *)bp;
3220					bcopy(p, dst, sizeof *p);
3221					bcopy(&(p->rule->rulenum), &(dst->rule),
3222					    sizeof(p->rule->rulenum));
3223					/*
3224					 * store a non-null value in "next".
3225					 * The userland code will interpret a
3226					 * NULL here as a marker
3227					 * for the last dynamic rule.
3228					 */
3229					bcopy(&dst, &dst->next, sizeof(dst));
3230					last = dst;
3231					dst->expire =
3232					    TIME_LEQ(dst->expire, time_second) ?
3233						0 : dst->expire - time_second ;
3234					bp += sizeof(ipfw_dyn_rule);
3235				}
3236			}
3237		IPFW_DYN_UNLOCK();
3238		if (last != NULL) /* mark last dynamic rule */
3239			bzero(&last->next, sizeof(last));
3240	}
3241	return (bp - (char *)buf);
3242}
3243
3244
3245/**
3246 * {set|get}sockopt parser.
3247 */
3248static int
3249ipfw_ctl(struct sockopt *sopt)
3250{
3251#define	RULE_MAXSIZE	(256*sizeof(u_int32_t))
3252	int error, rule_num;
3253	size_t size;
3254	struct ip_fw *buf, *rule;
3255	u_int32_t rulenum[2];
3256
3257	error = suser(sopt->sopt_td);
3258	if (error)
3259		return (error);
3260
3261	/*
3262	 * Disallow modifications in really-really secure mode, but still allow
3263	 * the logging counters to be reset.
3264	 */
3265	if (sopt->sopt_name == IP_FW_ADD ||
3266	    (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) {
3267#if __FreeBSD_version >= 500034
3268		error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
3269		if (error)
3270			return (error);
3271#else /* FreeBSD 4.x */
3272		if (securelevel >= 3)
3273			return (EPERM);
3274#endif
3275	}
3276
3277	error = 0;
3278
3279	switch (sopt->sopt_name) {
3280	case IP_FW_GET:
3281		/*
3282		 * pass up a copy of the current rules. Static rules
3283		 * come first (the last of which has number IPFW_DEFAULT_RULE),
3284		 * followed by a possibly empty list of dynamic rules.
3285		 * The last dynamic rule has NULL in the "next" field.
3286		 *
3287		 * Note that the calculated size is used to bound the
3288		 * amount of data returned to the user.  The rule set may
3289		 * change between calculating the size and returning the
3290		 * data in which case we'll just return what fits.
3291		 */
3292		size = static_len;	/* size of static rules */
3293		if (ipfw_dyn_v)		/* add size of dyn.rules */
3294			size += (dyn_count * sizeof(ipfw_dyn_rule));
3295
3296		/*
3297		 * XXX todo: if the user passes a short length just to know
3298		 * how much room is needed, do not bother filling up the
3299		 * buffer, just jump to the sooptcopyout.
3300		 */
3301		buf = malloc(size, M_TEMP, M_WAITOK);
3302		error = sooptcopyout(sopt, buf,
3303				ipfw_getrules(&layer3_chain, buf, size));
3304		free(buf, M_TEMP);
3305		break;
3306
3307	case IP_FW_FLUSH:
3308		/*
3309		 * Normally we cannot release the lock on each iteration.
3310		 * We could do it here only because we start from the head every
3311		 * time, so there is no risk of missing some entries.
3312		 * On the other hand, the risk is that we end up with
3313		 * a very inconsistent ruleset, so it is better to keep the lock
3314		 * around the whole cycle.
3315		 *
3316		 * XXX this code can be improved by resetting the head of
3317		 * the list to point to the default rule, and then freeing
3318		 * the old list without the need for a lock.
3319		 */
3320
3321		IPFW_WLOCK(&layer3_chain);
3322		layer3_chain.reap = NULL;
3323		free_chain(&layer3_chain, 0 /* keep default rule */);
3324		rule = layer3_chain.reap, layer3_chain.reap = NULL;
3325		IPFW_WUNLOCK(&layer3_chain);
3326		if (rule != NULL)
3327			reap_rules(rule);
3328		break;
3329
3330	case IP_FW_ADD:
3331		rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK);
3332		error = sooptcopyin(sopt, rule, RULE_MAXSIZE,
3333			sizeof(struct ip_fw) );
3334		if (error == 0)
3335			error = check_ipfw_struct(rule, sopt->sopt_valsize);
3336		if (error == 0) {
3337			error = add_rule(&layer3_chain, rule);
3338			size = RULESIZE(rule);
3339			if (!error && sopt->sopt_dir == SOPT_GET)
3340				error = sooptcopyout(sopt, rule, size);
3341		}
3342		free(rule, M_TEMP);
3343		break;
3344
3345	case IP_FW_DEL:
3346		/*
3347		 * IP_FW_DEL is used for deleting single rules or sets,
3348		 * and (ab)used to atomically manipulate sets. Argument size
3349		 * is used to distinguish between the two:
3350		 *    sizeof(u_int32_t)
3351		 *	delete single rule or set of rules,
3352		 *	or reassign rules (or sets) to a different set.
3353		 *    2*sizeof(u_int32_t)
3354		 *	atomic disable/enable sets.
3355		 *	first u_int32_t contains sets to be disabled,
3356		 *	second u_int32_t contains sets to be enabled.
3357		 */
3358		error = sooptcopyin(sopt, rulenum,
3359			2*sizeof(u_int32_t), sizeof(u_int32_t));
3360		if (error)
3361			break;
3362		size = sopt->sopt_valsize;
3363		if (size == sizeof(u_int32_t))	/* delete or reassign */
3364			error = del_entry(&layer3_chain, rulenum[0]);
3365		else if (size == 2*sizeof(u_int32_t)) /* set enable/disable */
3366			set_disable =
3367			    (set_disable | rulenum[0]) & ~rulenum[1] &
3368			    ~(1<<RESVD_SET); /* set RESVD_SET always enabled */
3369		else
3370			error = EINVAL;
3371		break;
3372
3373	case IP_FW_ZERO:
3374	case IP_FW_RESETLOG: /* argument is an int, the rule number */
3375		rule_num = 0;
3376		if (sopt->sopt_val != 0) {
3377		    error = sooptcopyin(sopt, &rule_num,
3378			    sizeof(int), sizeof(int));
3379		    if (error)
3380			break;
3381		}
3382		error = zero_entry(&layer3_chain, rule_num,
3383			sopt->sopt_name == IP_FW_RESETLOG);
3384		break;
3385
3386	case IP_FW_TABLE_ADD:
3387		{
3388			ipfw_table_entry ent;
3389
3390			error = sooptcopyin(sopt, &ent,
3391			    sizeof(ent), sizeof(ent));
3392			if (error)
3393				break;
3394			error = add_table_entry(ent.tbl, ent.addr,
3395			    ent.masklen, ent.value);
3396		}
3397		break;
3398
3399	case IP_FW_TABLE_DEL:
3400		{
3401			ipfw_table_entry ent;
3402
3403			error = sooptcopyin(sopt, &ent,
3404			    sizeof(ent), sizeof(ent));
3405			if (error)
3406				break;
3407			error = del_table_entry(ent.tbl, ent.addr, ent.masklen);
3408		}
3409		break;
3410
3411	case IP_FW_TABLE_FLUSH:
3412		{
3413			u_int16_t tbl;
3414
3415			error = sooptcopyin(sopt, &tbl,
3416			    sizeof(tbl), sizeof(tbl));
3417			if (error)
3418				break;
3419			error = flush_table(tbl);
3420		}
3421		break;
3422
3423	case IP_FW_TABLE_GETSIZE:
3424		{
3425			u_int32_t tbl, cnt;
3426
3427			if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl),
3428			    sizeof(tbl))))
3429				break;
3430			if ((error = count_table(tbl, &cnt)))
3431				break;
3432			error = sooptcopyout(sopt, &cnt, sizeof(cnt));
3433		}
3434		break;
3435
3436	case IP_FW_TABLE_LIST:
3437		{
3438			ipfw_table *tbl;
3439
3440			if (sopt->sopt_valsize < sizeof(*tbl)) {
3441				error = EINVAL;
3442				break;
3443			}
3444			size = sopt->sopt_valsize;
3445			tbl = malloc(size, M_TEMP, M_WAITOK);
3446			if (tbl == NULL) {
3447				error = ENOMEM;
3448				break;
3449			}
3450			error = sooptcopyin(sopt, tbl, size, sizeof(*tbl));
3451			if (error) {
3452				free(tbl, M_TEMP);
3453				break;
3454			}
3455			tbl->size = (size - sizeof(*tbl)) /
3456			    sizeof(ipfw_table_entry);
3457			error = dump_table(tbl);
3458			if (error) {
3459				free(tbl, M_TEMP);
3460				break;
3461			}
3462			error = sooptcopyout(sopt, tbl, size);
3463			free(tbl, M_TEMP);
3464		}
3465		break;
3466
3467	default:
3468		printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name);
3469		error = EINVAL;
3470	}
3471
3472	return (error);
3473#undef RULE_MAXSIZE
3474}
3475
3476/**
3477 * dummynet needs a reference to the default rule, because rules can be
3478 * deleted while packets hold a reference to them. When this happens,
3479 * dummynet changes the reference to the default rule (it could well be a
3480 * NULL pointer, but this way we do not need to check for the special
3481 * case, plus here we have info on the default behaviour).
3482 */
3483struct ip_fw *ip_fw_default_rule;
3484
3485/*
3486 * This procedure is only used to handle keepalives. It is invoked
3487 * every dyn_keepalive_period seconds.
3488 */
3489static void
3490ipfw_tick(void * __unused unused)
3491{
3492	int i;
3493	ipfw_dyn_rule *q;
3494
3495	if (dyn_keepalive == 0 || ipfw_dyn_v == NULL || dyn_count == 0)
3496		goto done;
3497
3498	IPFW_DYN_LOCK();
3499	for (i = 0 ; i < curr_dyn_buckets ; i++) {
3500		for (q = ipfw_dyn_v[i] ; q ; q = q->next ) {
3501			if (q->dyn_type == O_LIMIT_PARENT)
3502				continue;
3503			if (q->id.proto != IPPROTO_TCP)
3504				continue;
3505			if ( (q->state & BOTH_SYN) != BOTH_SYN)
3506				continue;
3507			if (TIME_LEQ( time_second+dyn_keepalive_interval,
3508			    q->expire))
3509				continue;	/* too early */
3510			if (TIME_LEQ(q->expire, time_second))
3511				continue;	/* too late, rule expired */
3512
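			/*
			 * Send a keepalive probe in each direction of the
			 * flow, using the sequence numbers recorded in the
			 * dynamic rule.
			 */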
3513			send_pkt(&(q->id), q->ack_rev - 1, q->ack_fwd, TH_SYN);
3514			send_pkt(&(q->id), q->ack_fwd - 1, q->ack_rev, 0);
3515		}
3516	}
3517	IPFW_DYN_UNLOCK();
3518done:
3519	callout_reset(&ipfw_timeout, dyn_keepalive_period*hz, ipfw_tick, NULL);
3520}
3521
3522int
3523ipfw_init(void)
3524{
3525	struct ip_fw default_rule;
3526	int error;
3527
3528	layer3_chain.rules = NULL;
3529	layer3_chain.want_write = 0;
3530	layer3_chain.busy_count = 0;
3531	cv_init(&layer3_chain.cv, "Condition variable for IPFW rw locks");
3532	IPFW_LOCK_INIT(&layer3_chain);
3533	ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule zone",
3534	    sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL,
3535	    UMA_ALIGN_PTR, 0);
3536	IPFW_DYN_LOCK_INIT();
3537	callout_init(&ipfw_timeout, NET_CALLOUT_MPSAFE);
3538
3539	bzero(&default_rule, sizeof default_rule);
3540
3541	default_rule.act_ofs = 0;
3542	default_rule.rulenum = IPFW_DEFAULT_RULE;
3543	default_rule.cmd_len = 1;
3544	default_rule.set = RESVD_SET;
3545
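	/*
	 * The default rule is a single one-word instruction: O_ACCEPT if
	 * the kernel is built with IPFIREWALL_DEFAULT_TO_ACCEPT, O_DENY
	 * otherwise (the "1 ?" below selects the opcode at compile time).
	 */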
3546	default_rule.cmd[0].len = 1;
3547	default_rule.cmd[0].opcode =
3548#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
3549				1 ? O_ACCEPT :
3550#endif
3551				O_DENY;
3552
3553	error = add_rule(&layer3_chain, &default_rule);
3554	if (error != 0) {
3555		printf("ipfw2: error %u initializing default rule "
3556			"(support disabled)\n", error);
3557		IPFW_DYN_LOCK_DESTROY();
3558		IPFW_LOCK_DESTROY(&layer3_chain);
3559		return (error);
3560	}
3561
3562	ip_fw_default_rule = layer3_chain.rules;
3563	printf("ipfw2 initialized, divert %s, "
3564		"rule-based forwarding "
3565#ifdef IPFIREWALL_FORWARD
3566		"enabled, "
3567#else
3568		"disabled, "
3569#endif
3570		"default to %s, logging ",
3571#ifdef IPDIVERT
3572		"enabled",
3573#else
3574		"loadable",
3575#endif
3576		default_rule.cmd[0].opcode == O_ACCEPT ? "accept" : "deny");
3577
3578#ifdef IPFIREWALL_VERBOSE
3579	fw_verbose = 1;
3580#endif
3581#ifdef IPFIREWALL_VERBOSE_LIMIT
3582	verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
3583#endif
3584	if (fw_verbose == 0)
3585		printf("disabled\n");
3586	else if (verbose_limit == 0)
3587		printf("unlimited\n");
3588	else
3589		printf("limited to %d packets/entry by default\n",
3590		    verbose_limit);
3591
3592	init_tables();
3593	ip_fw_ctl_ptr = ipfw_ctl;
3594	ip_fw_chk_ptr = ipfw_chk;
3595	callout_reset(&ipfw_timeout, hz, ipfw_tick, NULL);
3596
3597	return (0);
3598}
3599
3600void
3601ipfw_destroy(void)
3602{
3603	struct ip_fw *reap;
3604
3605	ip_fw_chk_ptr = NULL;
3606	ip_fw_ctl_ptr = NULL;
3607	callout_drain(&ipfw_timeout);
3608	IPFW_WLOCK(&layer3_chain);
3609	layer3_chain.reap = NULL;
3610	free_chain(&layer3_chain, 1 /* kill default rule */);
3611	reap = layer3_chain.reap, layer3_chain.reap = NULL;
3612	IPFW_WUNLOCK(&layer3_chain);
3613	if (reap != NULL)
3614		reap_rules(reap);
3615	flush_tables();
3616	IPFW_DYN_LOCK_DESTROY();
3617	uma_zdestroy(ipfw_dyn_rule_zone);
3618	IPFW_LOCK_DESTROY(&layer3_chain);
3619	printf("IP firewall unloaded\n");
3620}
3621
3622#endif /* IPFW2 */
3623