ip_fw2.c revision 123000
1112918Sjeff/*
2112918Sjeff * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
3112918Sjeff *
4112918Sjeff * Redistribution and use in source and binary forms, with or without
5112918Sjeff * modification, are permitted provided that the following conditions
6112918Sjeff * are met:
7112918Sjeff * 1. Redistributions of source code must retain the above copyright
8112918Sjeff *    notice, this list of conditions and the following disclaimer.
9112918Sjeff * 2. Redistributions in binary form must reproduce the above copyright
10112918Sjeff *    notice, this list of conditions and the following disclaimer in the
11112918Sjeff *    documentation and/or other materials provided with the distribution.
12112918Sjeff *
13165967Simp * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14112918Sjeff * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15112918Sjeff * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16112918Sjeff * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17112918Sjeff * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18112918Sjeff * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19112918Sjeff * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20112918Sjeff * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21112918Sjeff * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22112918Sjeff * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23112918Sjeff * SUCH DAMAGE.
24112918Sjeff *
25112918Sjeff * $FreeBSD: head/sys/netinet/ip_fw2.c 123000 2003-11-27 09:40:13Z andre $
26112918Sjeff */
27112918Sjeff
/*
 * Debug wrappers: DEB() discards its argument entirely, DDB() expands to
 * its argument unchanged.
 */
#define        DEB(x)
#define        DDB(x) x
30112918Sjeff
31157457Sdavidxu/*
32112918Sjeff * Implement IP packet firewall (new version)
33112918Sjeff */
34157457Sdavidxu
35112918Sjeff#if !defined(KLD_MODULE)
36112918Sjeff#include "opt_ipfw.h"
37112918Sjeff#include "opt_ipdn.h"
38112918Sjeff#include "opt_ipdivert.h"
39112918Sjeff#include "opt_inet.h"
40112918Sjeff#ifndef INET
41112918Sjeff#error IPFIREWALL requires INET.
42112918Sjeff#endif /* INET */
43112918Sjeff#endif
44112918Sjeff
45157457Sdavidxu#define IPFW2	1
46112918Sjeff#if IPFW2
47112918Sjeff#include <sys/param.h>
48112918Sjeff#include <sys/systm.h>
49112918Sjeff#include <sys/malloc.h>
50112918Sjeff#include <sys/mbuf.h>
51112918Sjeff#include <sys/kernel.h>
52112918Sjeff#include <sys/proc.h>
53112918Sjeff#include <sys/socket.h>
54112918Sjeff#include <sys/socketvar.h>
55112918Sjeff#include <sys/sysctl.h>
56#include <sys/syslog.h>
57#include <sys/ucred.h>
58#include <net/if.h>
59#include <net/route.h>
60#include <netinet/in.h>
61#include <netinet/in_systm.h>
62#include <netinet/in_var.h>
63#include <netinet/in_pcb.h>
64#include <netinet/ip.h>
65#include <netinet/ip_var.h>
66#include <netinet/ip_icmp.h>
67#include <netinet/ip_fw.h>
68#include <netinet/ip_dummynet.h>
69#include <netinet/tcp.h>
70#include <netinet/tcp_timer.h>
71#include <netinet/tcp_var.h>
72#include <netinet/tcpip.h>
73#include <netinet/udp.h>
74#include <netinet/udp_var.h>
75
76#ifdef IPSEC
77#include <netinet6/ipsec.h>
78#endif
79
80#include <netinet/if_ether.h> /* XXX for ETHERTYPE_IP */
81
82#include <machine/in_cksum.h>	/* XXX for in_cksum */
83
84/*
85 * This is used to avoid that a firewall-generated packet
86 * loops forever through the firewall.  Note that it must
87 * be a flag that is unused by other protocols that might
88 * be called from ip_output (e.g. IPsec) and it must be
89 * listed in M_COPYFLAGS in mbuf.h so that if the mbuf chain
90 * is altered on the way through ip_output it is not lost.
 * It might be better to add an m_tag since this happens
 * infrequently.
93 */
94#define M_SKIP_FIREWALL         M_PROTO6
95
96/*
97 * set_disable contains one bit per set value (0..31).
98 * If the bit is set, all rules with the corresponding set
99 * are disabled. Set RESVD_SET(31) is reserved for the default rule
100 * and rules that are not deleted by the flush command,
101 * and CANNOT be disabled.
102 * Rules in set RESVD_SET can only be deleted explicitly.
103 */
104static u_int32_t set_disable;
105
/* Exported as sysctl net.inet.ip.fw.verbose ("Log matches to ipfw rules"). */
static int fw_verbose;
/*
 * Exported as sysctl net.inet.ip.fw.verbose_limit. ipfw_log() treats 0 as
 * "no limit" for packets logged without a rule.
 */
static int verbose_limit;

/* NOTE(review): presumably drives the periodic ipfw timer -- usage not visible here. */
static struct callout ipfw_timeout;
/* NOTE(review): presumably the number of the default rule -- confirm against users. */
#define	IPFW_DEFAULT_RULE	65535
111
/*
 * A chain of static firewall rules together with the mutex that guards it.
 */
struct ip_fw_chain {
	struct ip_fw	*rules;		/* list of rules */
	struct ip_fw	*reap;		/* list of rules to reap */
	struct mtx	mtx;		/* lock guarding rule list */
};
/*
 * Locking helpers; each takes a pointer to a struct ip_fw_chain.
 * The mutex is initialized with MTX_DEF | MTX_RECURSE.
 */
#define	IPFW_LOCK_INIT(_chain) \
	mtx_init(&(_chain)->mtx, "IPFW static rules", NULL, \
		MTX_DEF | MTX_RECURSE)
#define	IPFW_LOCK_DESTROY(_chain)	mtx_destroy(&(_chain)->mtx)
#define	IPFW_LOCK(_chain)	mtx_lock(&(_chain)->mtx)
#define	IPFW_UNLOCK(_chain)	mtx_unlock(&(_chain)->mtx)
/* Assert that the calling thread owns the chain mutex. */
#define	IPFW_LOCK_ASSERT(_chain)	mtx_assert(&(_chain)->mtx, MA_OWNED)
124
125/*
126 * list of rules for layer 3
127 */
128static struct ip_fw_chain layer3_chain;
129
130MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
131
132static int fw_debug = 1;
133static int autoinc_step = 100; /* bounded to 1..1000 in add_rule() */
134
135#ifdef SYSCTL_NODE
136SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
137SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, enable,
138    CTLFLAG_RW | CTLFLAG_SECURE3,
139    &fw_enable, 0, "Enable ipfw");
140SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_RW,
141    &autoinc_step, 0, "Rule number autincrement step");
142SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
143    CTLFLAG_RW | CTLFLAG_SECURE3,
144    &fw_one_pass, 0,
145    "Only do a single pass through ipfw when using dummynet(4)");
146SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW,
147    &fw_debug, 0, "Enable printing of debug ip_fw statements");
148SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose,
149    CTLFLAG_RW | CTLFLAG_SECURE3,
150    &fw_verbose, 0, "Log matches to ipfw rules");
151SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW,
152    &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged");
153
154/*
155 * Description of dynamic rules.
156 *
157 * Dynamic rules are stored in lists accessed through a hash table
158 * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can
159 * be modified through the sysctl variable dyn_buckets which is
160 * updated when the table becomes empty.
161 *
162 * XXX currently there is only one list, ipfw_dyn.
163 *
164 * When a packet is received, its address fields are first masked
165 * with the mask defined for the rule, then hashed, then matched
166 * against the entries in the corresponding list.
167 * Dynamic rules can be used for different purposes:
168 *  + stateful rules;
169 *  + enforcing limits on the number of sessions;
170 *  + in-kernel NAT (not implemented yet)
171 *
172 * The lifetime of dynamic rules is regulated by dyn_*_lifetime,
173 * measured in seconds and depending on the flags.
174 *
175 * The total number of dynamic rules is stored in dyn_count.
176 * The max number of dynamic rules is dyn_max. When we reach
177 * the maximum number of rules we do not create anymore. This is
178 * done to avoid consuming too much memory, but also too much
179 * time when searching on each packet (ideally, we should try instead
180 * to put a limit on the length of the list on each bucket...).
181 *
182 * Each dynamic rule holds a pointer to the parent ipfw rule so
183 * we know what action to perform. Dynamic rules are removed when
184 * the parent rule is deleted. XXX we should make them survive.
185 *
186 * There are some limitations with dynamic rules -- we do not
187 * obey the 'randomized match', and we do not do multiple
188 * passes through the firewall. XXX check the latter!!!
189 */
/*
 * Hash table of dynamic rule lists. It stays NULL until
 * realloc_dynamic_table() allocates it.
 */
static ipfw_dyn_rule **ipfw_dyn_v = NULL;
/* Requested table size, settable through the dyn_buckets sysctl. */
static u_int32_t dyn_buckets = 256; /* must be power of 2 */
/*
 * Size of the currently allocated table; hash_packet() masks with
 * curr_dyn_buckets - 1.
 */
static u_int32_t curr_dyn_buckets = 256; /* must be power of 2 */

static struct mtx ipfw_dyn_mtx;		/* mutex guarding dynamic rules */
/* Helpers operating on ipfw_dyn_mtx. */
#define	IPFW_DYN_LOCK_INIT() \
	mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF)
#define	IPFW_DYN_LOCK_DESTROY()	mtx_destroy(&ipfw_dyn_mtx)
#define	IPFW_DYN_LOCK()		mtx_lock(&ipfw_dyn_mtx)
#define	IPFW_DYN_UNLOCK()	mtx_unlock(&ipfw_dyn_mtx)
/* Assert that the calling thread owns ipfw_dyn_mtx. */
#define	IPFW_DYN_LOCK_ASSERT()	mtx_assert(&ipfw_dyn_mtx, MA_OWNED)
201
202/*
 * Timeouts for various events in handling dynamic rules.
204 */
205static u_int32_t dyn_ack_lifetime = 300;
206static u_int32_t dyn_syn_lifetime = 20;
207static u_int32_t dyn_fin_lifetime = 1;
208static u_int32_t dyn_rst_lifetime = 1;
209static u_int32_t dyn_udp_lifetime = 10;
210static u_int32_t dyn_short_lifetime = 5;
211
212/*
213 * Keepalives are sent if dyn_keepalive is set. They are sent every
214 * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
215 * seconds of lifetime of a rule.
216 * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower
217 * than dyn_keepalive_period.
218 */
219
220static u_int32_t dyn_keepalive_interval = 20;
221static u_int32_t dyn_keepalive_period = 5;
222static u_int32_t dyn_keepalive = 1;	/* do send keepalives */
223
224static u_int32_t static_count;	/* # of static rules */
225static u_int32_t static_len;	/* size in bytes of static rules */
226static u_int32_t dyn_count;		/* # of dynamic rules */
227static u_int32_t dyn_max = 4096;	/* max # of dynamic rules */
228
229SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW,
230    &dyn_buckets, 0, "Number of dyn. buckets");
231SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_RD,
232    &curr_dyn_buckets, 0, "Current Number of dyn. buckets");
233SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_RD,
234    &dyn_count, 0, "Number of dyn. rules");
235SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_RW,
236    &dyn_max, 0, "Max number of dyn. rules");
237SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD,
238    &static_count, 0, "Number of static rules");
239SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW,
240    &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
241SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW,
242    &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
243SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW,
244    &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
245SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW,
246    &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
247SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW,
248    &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
249SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW,
250    &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations");
251SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW,
252    &dyn_keepalive, 0, "Enable keepalives for dyn. rules");
253
254#endif /* SYSCTL_NODE */
255
256
257static ip_fw_chk_t	ipfw_chk;
258
259ip_dn_ruledel_t *ip_dn_ruledel_ptr = NULL;	/* hook into dummynet */
260
/*
 * This macro maps an ip pointer into a layer3 header pointer of type T.
 * The result points ip_hl 32-bit words past the start of the IP header,
 * i.e. at the first byte following the IP header and its options.
 * Note that the 'ip' argument is evaluated twice.
 */
#define	L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
265
266static __inline int
267icmptype_match(struct ip *ip, ipfw_insn_u32 *cmd)
268{
269	int type = L3HDR(struct icmp,ip)->icmp_type;
270
271	return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<<type)) );
272}
273
274#define TT	( (1 << ICMP_ECHO) | (1 << ICMP_ROUTERSOLICIT) | \
275    (1 << ICMP_TSTAMP) | (1 << ICMP_IREQ) | (1 << ICMP_MASKREQ) )
276
277static int
278is_icmp_query(struct ip *ip)
279{
280	int type = L3HDR(struct icmp, ip)->icmp_type;
281	return (type <= ICMP_MAXTYPE && (TT & (1<<type)) );
282}
283#undef TT
284
285/*
286 * The following checks use two arrays of 8 or 16 bits to store the
287 * bits that we want set or clear, respectively. They are in the
288 * low and high half of cmd->arg1 or cmd->d[0].
289 *
290 * We scan options and store the bits we find set. We succeed if
291 *
292 *	(want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
293 *
294 * The code is sometimes optimized not to store additional variables.
295 */
296
297static int
298flags_match(ipfw_insn *cmd, u_int8_t bits)
299{
300	u_char want_clear;
301	bits = ~bits;
302
303	if ( ((cmd->arg1 & 0xff) & bits) != 0)
304		return 0; /* some bits we want set were clear */
305	want_clear = (cmd->arg1 >> 8) & 0xff;
306	if ( (want_clear & bits) != want_clear)
307		return 0; /* some bits we want clear were set */
308	return 1;
309}
310
311static int
312ipopts_match(struct ip *ip, ipfw_insn *cmd)
313{
314	int optlen, bits = 0;
315	u_char *cp = (u_char *)(ip + 1);
316	int x = (ip->ip_hl << 2) - sizeof (struct ip);
317
318	for (; x > 0; x -= optlen, cp += optlen) {
319		int opt = cp[IPOPT_OPTVAL];
320
321		if (opt == IPOPT_EOL)
322			break;
323		if (opt == IPOPT_NOP)
324			optlen = 1;
325		else {
326			optlen = cp[IPOPT_OLEN];
327			if (optlen <= 0 || optlen > x)
328				return 0; /* invalid or truncated */
329		}
330		switch (opt) {
331
332		default:
333			break;
334
335		case IPOPT_LSRR:
336			bits |= IP_FW_IPOPT_LSRR;
337			break;
338
339		case IPOPT_SSRR:
340			bits |= IP_FW_IPOPT_SSRR;
341			break;
342
343		case IPOPT_RR:
344			bits |= IP_FW_IPOPT_RR;
345			break;
346
347		case IPOPT_TS:
348			bits |= IP_FW_IPOPT_TS;
349			break;
350		}
351	}
352	return (flags_match(cmd, bits));
353}
354
355static int
356tcpopts_match(struct ip *ip, ipfw_insn *cmd)
357{
358	int optlen, bits = 0;
359	struct tcphdr *tcp = L3HDR(struct tcphdr,ip);
360	u_char *cp = (u_char *)(tcp + 1);
361	int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
362
363	for (; x > 0; x -= optlen, cp += optlen) {
364		int opt = cp[0];
365		if (opt == TCPOPT_EOL)
366			break;
367		if (opt == TCPOPT_NOP)
368			optlen = 1;
369		else {
370			optlen = cp[1];
371			if (optlen <= 0)
372				break;
373		}
374
375		switch (opt) {
376
377		default:
378			break;
379
380		case TCPOPT_MAXSEG:
381			bits |= IP_FW_TCPOPT_MSS;
382			break;
383
384		case TCPOPT_WINDOW:
385			bits |= IP_FW_TCPOPT_WINDOW;
386			break;
387
388		case TCPOPT_SACK_PERMITTED:
389		case TCPOPT_SACK:
390			bits |= IP_FW_TCPOPT_SACK;
391			break;
392
393		case TCPOPT_TIMESTAMP:
394			bits |= IP_FW_TCPOPT_TS;
395			break;
396
397		case TCPOPT_CC:
398		case TCPOPT_CCNEW:
399		case TCPOPT_CCECHO:
400			bits |= IP_FW_TCPOPT_CC;
401			break;
402		}
403	}
404	return (flags_match(cmd, bits));
405}
406
407static int
408iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
409{
410	if (ifp == NULL)	/* no iface with this packet, match fails */
411		return 0;
412	/* Check by name or by IP address */
413	if (cmd->name[0] != '\0') { /* match by name */
414		/* Check name */
415		if (cmd->p.glob) {
416			if (fnmatch(cmd->name, ifp->if_xname, 0) == 0)
417				return(1);
418		} else {
419			if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
420				return(1);
421		}
422	} else {
423		struct ifaddr *ia;
424
425		/* XXX lock? */
426		TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) {
427			if (ia->ifa_addr == NULL)
428				continue;
429			if (ia->ifa_addr->sa_family != AF_INET)
430				continue;
431			if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
432			    (ia->ifa_addr))->sin_addr.s_addr)
433				return(1);	/* match */
434		}
435	}
436	return(0);	/* no match, fail ... */
437}
438
439/*
440 * The 'verrevpath' option checks that the interface that an IP packet
441 * arrives on is the same interface that traffic destined for the
442 * packet's source address would be routed out of. This is a measure
443 * to block forged packets. This is also commonly known as "anti-spoofing"
444 * or Unicast Reverse Path Forwarding (Unicast RFP) in Cisco-ese. The
445 * name of the knob is purposely reminisent of the Cisco IOS command,
446 *
447 *   ip verify unicast reverse-path
448 *
449 * which implements the same functionality. But note that syntax is
450 * misleading. The check may be performed on all IP packets whether unicast,
451 * multicast, or broadcast.
452 */
453static int
454verify_rev_path(struct in_addr src, struct ifnet *ifp)
455{
456	struct route ro;
457	struct sockaddr_in *dst;
458
459	bzero(&ro, sizeof(ro));
460
461	dst = (struct sockaddr_in *)&(ro.ro_dst);
462	dst->sin_family = AF_INET;
463	dst->sin_len = sizeof(*dst);
464	dst->sin_addr = src;
465	rtalloc_ign(&ro, RTF_CLONING);
466
467	if (ro.ro_rt == NULL)
468		return 0;
469	if ((ifp == NULL) || (ro.ro_rt->rt_ifp->if_index != ifp->if_index)) {
470		RTFREE(ro.ro_rt);
471		return 0;
472	}
473	RTFREE(ro.ro_rt);
474	return 1;
475}
476
477
static u_int64_t norule_counter;	/* counter for ipfw_log(NULL...) */

/*
 * Helpers that expand to the first TWO arguments of snprintf().
 * SNPARGS(buf, len) yields a destination 'len' bytes into buf and the
 * space remaining after it, or a size of 0 once len has reached
 * sizeof(buf). SNP(buf) yields the whole buffer.
 * Both rely on sizeof(buf), so buf must be an array, not a pointer.
 */
#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
#define SNP(buf) buf, sizeof(buf)
482
/*
 * We enter here when we have a rule with O_LOG.
 * XXX this function alone takes about 2Kbytes of code!
 *
 * Emit one log(9) line describing the packet in mbuf 'm' and the action
 * taken on it.
 *
 *  f	 matching rule, or NULL for a packet logged without a rule; in
 *	 that case the action is reported as "Refuse" and the rule
 *	 number as -1.
 *  hlen IP header length; 0 marks a non-IP packet, reported as "MAC".
 *  eh	 non-NULL for layer 2 packets, whose ip_off/ip_len are still in
 *	 network byte order and are converted before use.
 *  m	 the packet; when hlen != 0 its data starts with the IP header.
 *  oif	 output interface, or NULL for an incoming packet.
 *
 * Logging is rate limited: by verbose_limit/norule_counter when f is NULL,
 * by the max_log/log_left fields of the rule's log instruction otherwise.
 * When the limit is hit by this call an additional notice is logged.
 *
 * NOTE(review): each snprintf() below uses at most one inet_ntoa() result,
 * presumably because inet_ntoa() returns a shared static buffer -- keep
 * the calls split when modifying this code.
 */
static void
ipfw_log(struct ip_fw *f, u_int hlen, struct ether_header *eh,
	struct mbuf *m, struct ifnet *oif)
{
	char *action;
	int limit_reached = 0;
	char action2[40], proto[48], fragment[28];

	fragment[0] = '\0';
	proto[0] = '\0';

	if (f == NULL) {	/* bogus pkt */
		/* a verbose_limit of 0 means no limit */
		if (verbose_limit != 0 && norule_counter >= verbose_limit)
			return;
		norule_counter++;
		if (norule_counter == verbose_limit)
			limit_reached = verbose_limit;
		action = "Refuse";
	} else {	/* O_LOG is the first action, find the real one */
		ipfw_insn *cmd = ACTION_PTR(f);
		ipfw_insn_log *l = (ipfw_insn_log *)cmd;

		/* a max_log of 0 means no limit for this rule */
		if (l->max_log != 0 && l->log_left == 0)
			return;
		l->log_left--;
		if (l->log_left == 0)
			limit_reached = l->max_log;
		cmd += F_LEN(cmd);	/* point to first action */
		if (cmd->opcode == O_PROB)
			cmd += F_LEN(cmd);

		/* actions with an argument are formatted into action2 */
		action = action2;
		switch (cmd->opcode) {
		case O_DENY:
			action = "Deny";
			break;

		case O_REJECT:
			if (cmd->arg1==ICMP_REJECT_RST)
				action = "Reset";
			else if (cmd->arg1==ICMP_UNREACH_HOST)
				action = "Reject";
			else
				snprintf(SNPARGS(action2, 0), "Unreach %d",
					cmd->arg1);
			break;

		case O_ACCEPT:
			action = "Accept";
			break;
		case O_COUNT:
			action = "Count";
			break;
		case O_DIVERT:
			snprintf(SNPARGS(action2, 0), "Divert %d",
				cmd->arg1);
			break;
		case O_TEE:
			snprintf(SNPARGS(action2, 0), "Tee %d",
				cmd->arg1);
			break;
		case O_SKIPTO:
			snprintf(SNPARGS(action2, 0), "SkipTo %d",
				cmd->arg1);
			break;
		case O_PIPE:
			snprintf(SNPARGS(action2, 0), "Pipe %d",
				cmd->arg1);
			break;
		case O_QUEUE:
			snprintf(SNPARGS(action2, 0), "Queue %d",
				cmd->arg1);
			break;
		case O_FORWARD_IP: {
			ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
			int len;

			len = snprintf(SNPARGS(action2, 0), "Forward to %s",
				inet_ntoa(sa->sa.sin_addr));
			/* append the port only when one was specified */
			if (sa->sa.sin_port)
				snprintf(SNPARGS(action2, len), ":%d",
				    sa->sa.sin_port);
			}
			break;
		default:
			action = "UNKNOWN";
			break;
		}
	}

	if (hlen == 0) {	/* non-ip */
		snprintf(SNPARGS(proto, 0), "MAC");
	} else {
		struct ip *ip = mtod(m, struct ip *);
		/* these three are all aliases to the same thing */
		struct icmp *const icmp = L3HDR(struct icmp, ip);
		struct tcphdr *const tcp = (struct tcphdr *)icmp;
		struct udphdr *const udp = (struct udphdr *)icmp;

		int ip_off, offset, ip_len;

		int len;

		if (eh != NULL) { /* layer 2 packets are as on the wire */
			ip_off = ntohs(ip->ip_off);
			ip_len = ntohs(ip->ip_len);
		} else {
			ip_off = ip->ip_off;
			ip_len = ip->ip_len;
		}
		/* ports and ICMP type/code are printed only when offset == 0 */
		offset = ip_off & IP_OFFMASK;
		switch (ip->ip_p) {
		case IPPROTO_TCP:
			len = snprintf(SNPARGS(proto, 0), "TCP %s",
			    inet_ntoa(ip->ip_src));
			if (offset == 0)
				snprintf(SNPARGS(proto, len), ":%d %s:%d",
				    ntohs(tcp->th_sport),
				    inet_ntoa(ip->ip_dst),
				    ntohs(tcp->th_dport));
			else
				snprintf(SNPARGS(proto, len), " %s",
				    inet_ntoa(ip->ip_dst));
			break;

		case IPPROTO_UDP:
			len = snprintf(SNPARGS(proto, 0), "UDP %s",
				inet_ntoa(ip->ip_src));
			if (offset == 0)
				snprintf(SNPARGS(proto, len), ":%d %s:%d",
				    ntohs(udp->uh_sport),
				    inet_ntoa(ip->ip_dst),
				    ntohs(udp->uh_dport));
			else
				snprintf(SNPARGS(proto, len), " %s",
				    inet_ntoa(ip->ip_dst));
			break;

		case IPPROTO_ICMP:
			if (offset == 0)
				len = snprintf(SNPARGS(proto, 0),
				    "ICMP:%u.%u ",
				    icmp->icmp_type, icmp->icmp_code);
			else
				len = snprintf(SNPARGS(proto, 0), "ICMP ");
			len += snprintf(SNPARGS(proto, len), "%s",
			    inet_ntoa(ip->ip_src));
			snprintf(SNPARGS(proto, len), " %s",
			    inet_ntoa(ip->ip_dst));
			break;

		default:
			len = snprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p,
			    inet_ntoa(ip->ip_src));
			snprintf(SNPARGS(proto, len), " %s",
			    inet_ntoa(ip->ip_dst));
			break;
		}

		/* describe fragments as id:payload-length@byte-offset[+] */
		if (ip_off & (IP_MF | IP_OFFMASK))
			snprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)",
			     ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
			     offset << 3,
			     (ip_off & IP_MF) ? "+" : "");
	}
	/* prefer the output interface, fall back to the receive interface */
	if (oif || m->m_pkthdr.rcvif)
		log(LOG_SECURITY | LOG_INFO,
		    "ipfw: %d %s %s %s via %s%s\n",
		    f ? f->rulenum : -1,
		    action, proto, oif ? "out" : "in",
		    oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
		    fragment);
	else
		log(LOG_SECURITY | LOG_INFO,
		    "ipfw: %d %s %s [no if info]%s\n",
		    f ? f->rulenum : -1,
		    action, proto, fragment);
	if (limit_reached)
		log(LOG_SECURITY | LOG_NOTICE,
		    "ipfw: limit %d reached on entry %d\n",
		    limit_reached, f ? f->rulenum : -1);
}
669
670/*
671 * IMPORTANT: the hash function for dynamic rules must be commutative
672 * in source and destination (ip,port), because rules are bidirectional
673 * and we want to find both in the same bucket.
674 */
675static __inline int
676hash_packet(struct ipfw_flow_id *id)
677{
678	u_int32_t i;
679
680	i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port);
681	i &= (curr_dyn_buckets - 1);
682	return i;
683}
684
/**
 * unlink a dynamic rule from a chain. prev is a pointer to
 * the previous one, q is a pointer to the rule to delete,
 * head is a pointer to the head of the queue.
 * Modifies q and potentially also head.
 *
 * On exit q points to the successor of the removed entry, prev is
 * unchanged, dyn_count has been decremented and the removed entry has
 * been freed. For an O_LIMIT entry the parent's count is decremented too.
 *
 * The macro expands to a brace-enclosed block (not a do/while(0)) and
 * evaluates its arguments several times, so pass plain lvalues only.
 */
#define UNLINK_DYN_RULE(prev, head, q) {				\
	ipfw_dyn_rule *old_q = q;					\
									\
	/* remove a refcount to the parent */				\
	if (q->dyn_type == O_LIMIT)					\
		q->parent->count--;					\
	DEB(printf("ipfw: unlink entry 0x%08x %d -> 0x%08x %d, %d left\n",\
		(q->id.src_ip), (q->id.src_port),			\
		(q->id.dst_ip), (q->id.dst_port), dyn_count-1 ); )	\
	if (prev != NULL)						\
		prev->next = q = q->next;				\
	else								\
		head = q = q->next;					\
	dyn_count--;							\
	free(old_q, M_IPFW); }

/*
 * True when time a is at or before time b. The difference is taken first
 * and then interpreted as a signed int.
 */
#define TIME_LEQ(a,b)       ((int)((a)-(b)) <= 0)
708
709/**
710 * Remove dynamic rules pointing to "rule", or all of them if rule == NULL.
711 *
712 * If keep_me == NULL, rules are deleted even if not expired,
713 * otherwise only expired rules are removed.
714 *
715 * The value of the second parameter is also used to point to identify
716 * a rule we absolutely do not want to remove (e.g. because we are
717 * holding a reference to it -- this is the case with O_LIMIT_PARENT
718 * rules). The pointer is only used for comparison, so any non-null
719 * value will do.
720 */
721static void
722remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me)
723{
724	static u_int32_t last_remove = 0;
725
726#define FORCE (keep_me == NULL)
727
728	ipfw_dyn_rule *prev, *q;
729	int i, pass = 0, max_pass = 0;
730
731	IPFW_DYN_LOCK_ASSERT();
732
733	if (ipfw_dyn_v == NULL || dyn_count == 0)
734		return;
735	/* do not expire more than once per second, it is useless */
736	if (!FORCE && last_remove == time_second)
737		return;
738	last_remove = time_second;
739
740	/*
741	 * because O_LIMIT refer to parent rules, during the first pass only
742	 * remove child and mark any pending LIMIT_PARENT, and remove
743	 * them in a second pass.
744	 */
745next_pass:
746	for (i = 0 ; i < curr_dyn_buckets ; i++) {
747		for (prev=NULL, q = ipfw_dyn_v[i] ; q ; ) {
748			/*
749			 * Logic can become complex here, so we split tests.
750			 */
751			if (q == keep_me)
752				goto next;
753			if (rule != NULL && rule != q->rule)
754				goto next; /* not the one we are looking for */
755			if (q->dyn_type == O_LIMIT_PARENT) {
756				/*
757				 * handle parent in the second pass,
758				 * record we need one.
759				 */
760				max_pass = 1;
761				if (pass == 0)
762					goto next;
763				if (FORCE && q->count != 0 ) {
764					/* XXX should not happen! */
765					printf("ipfw: OUCH! cannot remove rule,"
766					     " count %d\n", q->count);
767				}
768			} else {
769				if (!FORCE &&
770				    !TIME_LEQ( q->expire, time_second ))
771					goto next;
772			}
773             if (q->dyn_type != O_LIMIT_PARENT || !q->count) {
774                     UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q);
775                     continue;
776             }
777next:
778			prev=q;
779			q=q->next;
780		}
781	}
782	if (pass++ < max_pass)
783		goto next_pass;
784}
785
786
787/**
788 * lookup a dynamic rule.
789 */
790static ipfw_dyn_rule *
791lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction,
792	struct tcphdr *tcp)
793{
794	/*
795	 * stateful ipfw extensions.
796	 * Lookup into dynamic session queue
797	 */
798#define MATCH_REVERSE	0
799#define MATCH_FORWARD	1
800#define MATCH_NONE	2
801#define MATCH_UNKNOWN	3
802	int i, dir = MATCH_NONE;
803	ipfw_dyn_rule *prev, *q=NULL;
804
805	IPFW_DYN_LOCK_ASSERT();
806
807	if (ipfw_dyn_v == NULL)
808		goto done;	/* not found */
809	i = hash_packet( pkt );
810	for (prev=NULL, q = ipfw_dyn_v[i] ; q != NULL ; ) {
811		if (q->dyn_type == O_LIMIT_PARENT && q->count)
812			goto next;
813		if (TIME_LEQ( q->expire, time_second)) { /* expire entry */
814			UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q);
815			continue;
816		}
817		if (pkt->proto == q->id.proto &&
818		    q->dyn_type != O_LIMIT_PARENT) {
819			if (pkt->src_ip == q->id.src_ip &&
820			    pkt->dst_ip == q->id.dst_ip &&
821			    pkt->src_port == q->id.src_port &&
822			    pkt->dst_port == q->id.dst_port ) {
823				dir = MATCH_FORWARD;
824				break;
825			}
826			if (pkt->src_ip == q->id.dst_ip &&
827			    pkt->dst_ip == q->id.src_ip &&
828			    pkt->src_port == q->id.dst_port &&
829			    pkt->dst_port == q->id.src_port ) {
830				dir = MATCH_REVERSE;
831				break;
832			}
833		}
834next:
835		prev = q;
836		q = q->next;
837	}
838	if (q == NULL)
839		goto done; /* q = NULL, not found */
840
841	if ( prev != NULL) { /* found and not in front */
842		prev->next = q->next;
843		q->next = ipfw_dyn_v[i];
844		ipfw_dyn_v[i] = q;
845	}
846	if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
847		u_char flags = pkt->flags & (TH_FIN|TH_SYN|TH_RST);
848
849#define BOTH_SYN	(TH_SYN | (TH_SYN << 8))
850#define BOTH_FIN	(TH_FIN | (TH_FIN << 8))
851		q->state |= (dir == MATCH_FORWARD ) ? flags : (flags << 8);
852		switch (q->state) {
853		case TH_SYN:				/* opening */
854			q->expire = time_second + dyn_syn_lifetime;
855			break;
856
857		case BOTH_SYN:			/* move to established */
858		case BOTH_SYN | TH_FIN :	/* one side tries to close */
859		case BOTH_SYN | (TH_FIN << 8) :
860 			if (tcp) {
861#define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0)
862			    u_int32_t ack = ntohl(tcp->th_ack);
863			    if (dir == MATCH_FORWARD) {
864				if (q->ack_fwd == 0 || _SEQ_GE(ack, q->ack_fwd))
865				    q->ack_fwd = ack;
866				else { /* ignore out-of-sequence */
867				    break;
868				}
869			    } else {
870				if (q->ack_rev == 0 || _SEQ_GE(ack, q->ack_rev))
871				    q->ack_rev = ack;
872				else { /* ignore out-of-sequence */
873				    break;
874				}
875			    }
876			}
877			q->expire = time_second + dyn_ack_lifetime;
878			break;
879
880		case BOTH_SYN | BOTH_FIN:	/* both sides closed */
881			if (dyn_fin_lifetime >= dyn_keepalive_period)
882				dyn_fin_lifetime = dyn_keepalive_period - 1;
883			q->expire = time_second + dyn_fin_lifetime;
884			break;
885
886		default:
887#if 0
888			/*
889			 * reset or some invalid combination, but can also
890			 * occur if we use keep-state the wrong way.
891			 */
892			if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0)
893				printf("invalid state: 0x%x\n", q->state);
894#endif
895			if (dyn_rst_lifetime >= dyn_keepalive_period)
896				dyn_rst_lifetime = dyn_keepalive_period - 1;
897			q->expire = time_second + dyn_rst_lifetime;
898			break;
899		}
900	} else if (pkt->proto == IPPROTO_UDP) {
901		q->expire = time_second + dyn_udp_lifetime;
902	} else {
903		/* other protocols */
904		q->expire = time_second + dyn_short_lifetime;
905	}
906done:
907	if (match_direction)
908		*match_direction = dir;
909	return q;
910}
911
912static ipfw_dyn_rule *
913lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction,
914	struct tcphdr *tcp)
915{
916	ipfw_dyn_rule *q;
917
918	IPFW_DYN_LOCK();
919	q = lookup_dyn_rule_locked(pkt, match_direction, tcp);
920	if (q == NULL)
921		IPFW_DYN_UNLOCK();
922	/* NB: return table locked when q is not NULL */
923	return q;
924}
925
926static void
927realloc_dynamic_table(void)
928{
929	IPFW_DYN_LOCK_ASSERT();
930
931	/*
932	 * Try reallocation, make sure we have a power of 2 and do
933	 * not allow more than 64k entries. In case of overflow,
934	 * default to 1024.
935	 */
936
937	if (dyn_buckets > 65536)
938		dyn_buckets = 1024;
939	if ((dyn_buckets & (dyn_buckets-1)) != 0) { /* not a power of 2 */
940		dyn_buckets = curr_dyn_buckets; /* reset */
941		return;
942	}
943	curr_dyn_buckets = dyn_buckets;
944	if (ipfw_dyn_v != NULL)
945		free(ipfw_dyn_v, M_IPFW);
946	for (;;) {
947		ipfw_dyn_v = malloc(curr_dyn_buckets * sizeof(ipfw_dyn_rule *),
948		       M_IPFW, M_NOWAIT | M_ZERO);
949		if (ipfw_dyn_v != NULL || curr_dyn_buckets <= 2)
950			break;
951		curr_dyn_buckets /= 2;
952	}
953}
954
955/**
956 * Install state of type 'type' for a dynamic session.
957 * The hash table contains two type of rules:
958 * - regular rules (O_KEEP_STATE)
959 * - rules for sessions with limited number of sess per user
960 *   (O_LIMIT). When they are created, the parent is
961 *   increased by 1, and decreased on delete. In this case,
962 *   the third parameter is the parent rule and not the chain.
963 * - "parent" rules for the above (O_LIMIT_PARENT).
964 */
965static ipfw_dyn_rule *
966add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule)
967{
968	ipfw_dyn_rule *r;
969	int i;
970
971	IPFW_DYN_LOCK_ASSERT();
972
973	if (ipfw_dyn_v == NULL ||
974	    (dyn_count == 0 && dyn_buckets != curr_dyn_buckets)) {
975		realloc_dynamic_table();
976		if (ipfw_dyn_v == NULL)
977			return NULL; /* failed ! */
978	}
979	i = hash_packet(id);
980
981	r = malloc(sizeof *r, M_IPFW, M_NOWAIT | M_ZERO);
982	if (r == NULL) {
983		printf ("ipfw: sorry cannot allocate state\n");
984		return NULL;
985	}
986
987	/* increase refcount on parent, and set pointer */
988	if (dyn_type == O_LIMIT) {
989		ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule;
990		if ( parent->dyn_type != O_LIMIT_PARENT)
991			panic("invalid parent");
992		parent->count++;
993		r->parent = parent;
994		rule = parent->rule;
995	}
996
997	r->id = *id;
998	r->expire = time_second + dyn_syn_lifetime;
999	r->rule = rule;
1000	r->dyn_type = dyn_type;
1001	r->pcnt = r->bcnt = 0;
1002	r->count = 0;
1003
1004	r->bucket = i;
1005	r->next = ipfw_dyn_v[i];
1006	ipfw_dyn_v[i] = r;
1007	dyn_count++;
1008	DEB(printf("ipfw: add dyn entry ty %d 0x%08x %d -> 0x%08x %d, total %d\n",
1009	   dyn_type,
1010	   (r->id.src_ip), (r->id.src_port),
1011	   (r->id.dst_ip), (r->id.dst_port),
1012	   dyn_count ); )
1013	return r;
1014}
1015
1016/**
1017 * lookup dynamic parent rule using pkt and rule as search keys.
1018 * If the lookup fails, then install one.
1019 */
1020static ipfw_dyn_rule *
1021lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule)
1022{
1023	ipfw_dyn_rule *q;
1024	int i;
1025
1026	IPFW_DYN_LOCK_ASSERT();
1027
1028	if (ipfw_dyn_v) {
1029		i = hash_packet( pkt );
1030		for (q = ipfw_dyn_v[i] ; q != NULL ; q=q->next)
1031			if (q->dyn_type == O_LIMIT_PARENT &&
1032			    rule== q->rule &&
1033			    pkt->proto == q->id.proto &&
1034			    pkt->src_ip == q->id.src_ip &&
1035			    pkt->dst_ip == q->id.dst_ip &&
1036			    pkt->src_port == q->id.src_port &&
1037			    pkt->dst_port == q->id.dst_port) {
1038				q->expire = time_second + dyn_short_lifetime;
1039				DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",q);)
1040				return q;
1041			}
1042	}
1043	return add_dyn_rule(pkt, O_LIMIT_PARENT, rule);
1044}
1045
1046/**
1047 * Install dynamic state for rule type cmd->o.opcode
1048 *
1049 * Returns 1 (failure) if state is not installed because of errors or because
1050 * session limitations are enforced.
1051 */
1052static int
1053install_state(struct ip_fw *rule, ipfw_insn_limit *cmd,
1054	struct ip_fw_args *args)
1055{
1056	static int last_log;
1057
1058	ipfw_dyn_rule *q;
1059
1060	DEB(printf("ipfw: install state type %d 0x%08x %u -> 0x%08x %u\n",
1061	    cmd->o.opcode,
1062	    (args->f_id.src_ip), (args->f_id.src_port),
1063	    (args->f_id.dst_ip), (args->f_id.dst_port) );)
1064
1065	IPFW_DYN_LOCK();
1066
1067	q = lookup_dyn_rule_locked(&args->f_id, NULL, NULL);
1068
1069	if (q != NULL) { /* should never occur */
1070		if (last_log != time_second) {
1071			last_log = time_second;
1072			printf("ipfw: install_state: entry already present, done\n");
1073		}
1074		IPFW_DYN_UNLOCK();
1075		return 0;
1076	}
1077
1078	if (dyn_count >= dyn_max)
1079		/*
1080		 * Run out of slots, try to remove any expired rule.
1081		 */
1082		remove_dyn_rule(NULL, (ipfw_dyn_rule *)1);
1083
1084	if (dyn_count >= dyn_max) {
1085		if (last_log != time_second) {
1086			last_log = time_second;
1087			printf("ipfw: install_state: Too many dynamic rules\n");
1088		}
1089		IPFW_DYN_UNLOCK();
1090		return 1; /* cannot install, notify caller */
1091	}
1092
1093	switch (cmd->o.opcode) {
1094	case O_KEEP_STATE: /* bidir rule */
1095		add_dyn_rule(&args->f_id, O_KEEP_STATE, rule);
1096		break;
1097
1098	case O_LIMIT: /* limit number of sessions */
1099	    {
1100		u_int16_t limit_mask = cmd->limit_mask;
1101		struct ipfw_flow_id id;
1102		ipfw_dyn_rule *parent;
1103
1104		DEB(printf("ipfw: installing dyn-limit rule %d\n",
1105		    cmd->conn_limit);)
1106
1107		id.dst_ip = id.src_ip = 0;
1108		id.dst_port = id.src_port = 0;
1109		id.proto = args->f_id.proto;
1110
1111		if (limit_mask & DYN_SRC_ADDR)
1112			id.src_ip = args->f_id.src_ip;
1113		if (limit_mask & DYN_DST_ADDR)
1114			id.dst_ip = args->f_id.dst_ip;
1115		if (limit_mask & DYN_SRC_PORT)
1116			id.src_port = args->f_id.src_port;
1117		if (limit_mask & DYN_DST_PORT)
1118			id.dst_port = args->f_id.dst_port;
1119		parent = lookup_dyn_parent(&id, rule);
1120		if (parent == NULL) {
1121			printf("ipfw: add parent failed\n");
1122			return 1;
1123		}
1124		if (parent->count >= cmd->conn_limit) {
1125			/*
1126			 * See if we can remove some expired rule.
1127			 */
1128			remove_dyn_rule(rule, parent);
1129			if (parent->count >= cmd->conn_limit) {
1130				if (fw_verbose && last_log != time_second) {
1131					last_log = time_second;
1132					log(LOG_SECURITY | LOG_DEBUG,
1133					    "drop session, too many entries\n");
1134				}
1135				IPFW_DYN_UNLOCK();
1136				return 1;
1137			}
1138		}
1139		add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent);
1140	    }
1141		break;
1142	default:
1143		printf("ipfw: unknown dynamic rule type %u\n", cmd->o.opcode);
1144		IPFW_DYN_UNLOCK();
1145		return 1;
1146	}
1147	lookup_dyn_rule_locked(&args->f_id, NULL, NULL); /* XXX just set lifetime */
1148	IPFW_DYN_UNLOCK();
1149	return 0;
1150}
1151
1152/*
1153 * Transmit a TCP packet, containing either a RST or a keepalive.
1154 * When flags & TH_RST, we are sending a RST packet, because of a
1155 * "reset" action matched the packet.
1156 * Otherwise we are sending a keepalive, and flags & TH_
1157 */
1158static void
1159send_pkt(struct ipfw_flow_id *id, u_int32_t seq, u_int32_t ack, int flags)
1160{
1161	struct mbuf *m;
1162	struct ip *ip;
1163	struct tcphdr *tcp;
1164
1165	MGETHDR(m, M_DONTWAIT, MT_HEADER);
1166	if (m == 0)
1167		return;
1168	m->m_pkthdr.rcvif = (struct ifnet *)0;
1169	m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
1170	m->m_data += max_linkhdr;
1171
1172	ip = mtod(m, struct ip *);
1173	bzero(ip, m->m_len);
1174	tcp = (struct tcphdr *)(ip + 1); /* no IP options */
1175	ip->ip_p = IPPROTO_TCP;
1176	tcp->th_off = 5;
1177	/*
1178	 * Assume we are sending a RST (or a keepalive in the reverse
1179	 * direction), swap src and destination addresses and ports.
1180	 */
1181	ip->ip_src.s_addr = htonl(id->dst_ip);
1182	ip->ip_dst.s_addr = htonl(id->src_ip);
1183	tcp->th_sport = htons(id->dst_port);
1184	tcp->th_dport = htons(id->src_port);
1185	if (flags & TH_RST) {	/* we are sending a RST */
1186		if (flags & TH_ACK) {
1187			tcp->th_seq = htonl(ack);
1188			tcp->th_ack = htonl(0);
1189			tcp->th_flags = TH_RST;
1190		} else {
1191			if (flags & TH_SYN)
1192				seq++;
1193			tcp->th_seq = htonl(0);
1194			tcp->th_ack = htonl(seq);
1195			tcp->th_flags = TH_RST | TH_ACK;
1196		}
1197	} else {
1198		/*
1199		 * We are sending a keepalive. flags & TH_SYN determines
1200		 * the direction, forward if set, reverse if clear.
1201		 * NOTE: seq and ack are always assumed to be correct
1202		 * as set by the caller. This may be confusing...
1203		 */
1204		if (flags & TH_SYN) {
1205			/*
1206			 * we have to rewrite the correct addresses!
1207			 */
1208			ip->ip_dst.s_addr = htonl(id->dst_ip);
1209			ip->ip_src.s_addr = htonl(id->src_ip);
1210			tcp->th_dport = htons(id->dst_port);
1211			tcp->th_sport = htons(id->src_port);
1212		}
1213		tcp->th_seq = htonl(seq);
1214		tcp->th_ack = htonl(ack);
1215		tcp->th_flags = TH_ACK;
1216	}
1217	/*
1218	 * set ip_len to the payload size so we can compute
1219	 * the tcp checksum on the pseudoheader
1220	 * XXX check this, could save a couple of words ?
1221	 */
1222	ip->ip_len = htons(sizeof(struct tcphdr));
1223	tcp->th_sum = in_cksum(m, m->m_pkthdr.len);
1224	/*
1225	 * now fill fields left out earlier
1226	 */
1227	ip->ip_ttl = ip_defttl;
1228	ip->ip_len = m->m_pkthdr.len;
1229	m->m_flags |= M_SKIP_FIREWALL;
1230	ip_output(m, NULL, NULL, 0, NULL, NULL);
1231}
1232
1233/*
1234 * sends a reject message, consuming the mbuf passed as an argument.
1235 */
1236static void
1237send_reject(struct ip_fw_args *args, int code, int offset, int ip_len)
1238{
1239
1240	if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
1241		/* We need the IP header in host order for icmp_error(). */
1242		if (args->eh != NULL) {
1243			struct ip *ip = mtod(args->m, struct ip *);
1244			ip->ip_len = ntohs(ip->ip_len);
1245			ip->ip_off = ntohs(ip->ip_off);
1246		}
1247		icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
1248	} else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) {
1249		struct tcphdr *const tcp =
1250		    L3HDR(struct tcphdr, mtod(args->m, struct ip *));
1251		if ( (tcp->th_flags & TH_RST) == 0)
1252			send_pkt(&(args->f_id), ntohl(tcp->th_seq),
1253				ntohl(tcp->th_ack),
1254				tcp->th_flags | TH_RST);
1255		m_freem(args->m);
1256	} else
1257		m_freem(args->m);
1258	args->m = NULL;
1259}
1260
1261/**
1262 *
1263 * Given an ip_fw *, lookup_next_rule will return a pointer
1264 * to the next rule, which can be either the jump
1265 * target (for skipto instructions) or the next one in the list (in
1266 * all other cases including a missing jump target).
1267 * The result is also written in the "next_rule" field of the rule.
1268 * Backward jumps are not allowed, so start looking from the next
1269 * rule...
1270 *
1271 * This never returns NULL -- in case we do not have an exact match,
1272 * the next rule is returned. When the ruleset is changed,
1273 * pointers are flushed so we are always correct.
1274 */
1275
1276static struct ip_fw *
1277lookup_next_rule(struct ip_fw *me)
1278{
1279	struct ip_fw *rule = NULL;
1280	ipfw_insn *cmd;
1281
1282	/* look for action, in case it is a skipto */
1283	cmd = ACTION_PTR(me);
1284	if (cmd->opcode == O_LOG)
1285		cmd += F_LEN(cmd);
1286	if ( cmd->opcode == O_SKIPTO )
1287		for (rule = me->next; rule ; rule = rule->next)
1288			if (rule->rulenum >= cmd->arg1)
1289				break;
1290	if (rule == NULL)			/* failure or not a skipto */
1291		rule = me->next;
1292	me->next_rule = rule;
1293	return rule;
1294}
1295
1296static int
1297check_uidgid(ipfw_insn_u32 *insn,
1298	int proto, struct ifnet *oif,
1299	struct in_addr dst_ip, u_int16_t dst_port,
1300	struct in_addr src_ip, u_int16_t src_port)
1301{
1302	struct inpcbinfo *pi;
1303	int wildcard;
1304	struct inpcb *pcb;
1305	int match;
1306
1307	if (proto == IPPROTO_TCP) {
1308		wildcard = 0;
1309		pi = &tcbinfo;
1310	} else if (proto == IPPROTO_UDP) {
1311		wildcard = 1;
1312		pi = &udbinfo;
1313	} else
1314		return 0;
1315
1316	match = 0;
1317
1318	INP_INFO_RLOCK(pi);	/* XXX LOR with IPFW */
1319	pcb =  (oif) ?
1320		in_pcblookup_hash(pi,
1321		    dst_ip, htons(dst_port),
1322		    src_ip, htons(src_port),
1323		    wildcard, oif) :
1324		in_pcblookup_hash(pi,
1325		    src_ip, htons(src_port),
1326		    dst_ip, htons(dst_port),
1327		    wildcard, NULL);
1328	if (pcb != NULL) {
1329		INP_LOCK(pcb);
1330		if (pcb->inp_socket != NULL) {
1331#if __FreeBSD_version < 500034
1332#define socheckuid(a,b)	((a)->so_cred->cr_uid != (b))
1333#endif
1334			if (insn->o.opcode == O_UID) {
1335				match = !socheckuid(pcb->inp_socket,
1336				   (uid_t)insn->d[0]);
1337			} else  {
1338				match = groupmember((uid_t)insn->d[0],
1339				    pcb->inp_socket->so_cred);
1340			}
1341		}
1342		INP_UNLOCK(pcb);
1343	}
1344	INP_INFO_RUNLOCK(pi);
1345
1346	return match;
1347}
1348
1349/*
1350 * The main check routine for the firewall.
1351 *
1352 * All arguments are in args so we can modify them and return them
1353 * back to the caller.
1354 *
1355 * Parameters:
1356 *
1357 *	args->m	(in/out) The packet; we set to NULL when/if we nuke it.
1358 *		Starts with the IP header.
1359 *	args->eh (in)	Mac header if present, or NULL for layer3 packet.
1360 *	args->oif	Outgoing interface, or NULL if packet is incoming.
1361 *		The incoming interface is in the mbuf. (in)
1362 *	args->divert_rule (in/out)
1363 *		Skip up to the first rule past this rule number;
1364 *		upon return, non-zero port number for divert or tee.
1365 *
1366 *	args->rule	Pointer to the last matching rule (in/out)
1367 *	args->next_hop	Socket we are forwarding to (out).
1368 *	args->f_id	Addresses grabbed from the packet (out)
1369 *
1370 * Return value:
1371 *
1372 *	IP_FW_PORT_DENY_FLAG	the packet must be dropped.
1373 *	0	The packet is to be accepted and routed normally OR
1374 *      	the packet was denied/rejected and has been dropped;
1375 *		in the latter case, *m is equal to NULL upon return.
1376 *	port	Divert the packet to port, with these caveats:
1377 *
1378 *		- If IP_FW_PORT_TEE_FLAG is set, tee the packet instead
1379 *		  of diverting it (ie, 'ipfw tee').
1380 *
1381 *		- If IP_FW_PORT_DYNT_FLAG is set, interpret the lower
1382 *		  16 bits as a dummynet pipe number instead of diverting
1383 */
1384
1385static int
1386ipfw_chk(struct ip_fw_args *args)
1387{
1388	/*
1389	 * Local variables hold state during the processing of a packet.
1390	 *
1391	 * IMPORTANT NOTE: to speed up the processing of rules, there
1392	 * are some assumption on the values of the variables, which
1393	 * are documented here. Should you change them, please check
1394	 * the implementation of the various instructions to make sure
1395	 * that they still work.
1396	 *
1397	 * args->eh	The MAC header. It is non-null for a layer2
1398	 *	packet, it is NULL for a layer-3 packet.
1399	 *
1400	 * m | args->m	Pointer to the mbuf, as received from the caller.
1401	 *	It may change if ipfw_chk() does an m_pullup, or if it
1402	 *	consumes the packet because it calls send_reject().
1403	 *	XXX This has to change, so that ipfw_chk() never modifies
1404	 *	or consumes the buffer.
1405	 * ip	is simply an alias of the value of m, and it is kept
1406	 *	in sync with it (the packet is	supposed to start with
1407	 *	the ip header).
1408	 */
1409	struct mbuf *m = args->m;
1410	struct ip *ip = mtod(m, struct ip *);
1411
1412	/*
1413	 * oif | args->oif	If NULL, ipfw_chk has been called on the
1414	 *	inbound path (ether_input, bdg_forward, ip_input).
1415	 *	If non-NULL, ipfw_chk has been called on the outbound path
1416	 *	(ether_output, ip_output).
1417	 */
1418	struct ifnet *oif = args->oif;
1419
1420	struct ip_fw *f = NULL;		/* matching rule */
1421	int retval = 0;
1422
1423	/*
1424	 * hlen	The length of the IPv4 header.
1425	 *	hlen >0 means we have an IPv4 packet.
1426	 */
1427	u_int hlen = 0;		/* hlen >0 means we have an IP pkt */
1428
1429	/*
1430	 * offset	The offset of a fragment. offset != 0 means that
1431	 *	we have a fragment at this offset of an IPv4 packet.
1432	 *	offset == 0 means that (if this is an IPv4 packet)
1433	 *	this is the first or only fragment.
1434	 */
1435	u_short offset = 0;
1436
1437	/*
1438	 * Local copies of addresses. They are only valid if we have
1439	 * an IP packet.
1440	 *
1441	 * proto	The protocol. Set to 0 for non-ip packets,
1442	 *	or to the protocol read from the packet otherwise.
1443	 *	proto != 0 means that we have an IPv4 packet.
1444	 *
1445	 * src_port, dst_port	port numbers, in HOST format. Only
1446	 *	valid for TCP and UDP packets.
1447	 *
1448	 * src_ip, dst_ip	ip addresses, in NETWORK format.
1449	 *	Only valid for IPv4 packets.
1450	 */
1451	u_int8_t proto;
1452	u_int16_t src_port = 0, dst_port = 0;	/* NOTE: host format	*/
1453	struct in_addr src_ip, dst_ip;		/* NOTE: network format	*/
1454	u_int16_t ip_len=0;
1455	int pktlen;
1456	int dyn_dir = MATCH_UNKNOWN;
1457	ipfw_dyn_rule *q = NULL;
1458	struct ip_fw_chain *chain = &layer3_chain;
1459
1460	if (m->m_flags & M_SKIP_FIREWALL)
1461		return 0;	/* accept */
1462	/*
1463	 * dyn_dir = MATCH_UNKNOWN when rules unchecked,
1464	 * 	MATCH_NONE when checked and not matched (q = NULL),
1465	 *	MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL)
1466	 */
1467
1468	pktlen = m->m_pkthdr.len;
1469	if (args->eh == NULL ||		/* layer 3 packet */
1470		( m->m_pkthdr.len >= sizeof(struct ip) &&
1471		    ntohs(args->eh->ether_type) == ETHERTYPE_IP))
1472			hlen = ip->ip_hl << 2;
1473
1474	/*
1475	 * Collect parameters into local variables for faster matching.
1476	 */
1477	if (hlen == 0) {	/* do not grab addresses for non-ip pkts */
1478		proto = args->f_id.proto = 0;	/* mark f_id invalid */
1479		goto after_ip_checks;
1480	}
1481
1482	proto = args->f_id.proto = ip->ip_p;
1483	src_ip = ip->ip_src;
1484	dst_ip = ip->ip_dst;
1485	if (args->eh != NULL) { /* layer 2 packets are as on the wire */
1486		offset = ntohs(ip->ip_off) & IP_OFFMASK;
1487		ip_len = ntohs(ip->ip_len);
1488	} else {
1489		offset = ip->ip_off & IP_OFFMASK;
1490		ip_len = ip->ip_len;
1491	}
1492	pktlen = ip_len < pktlen ? ip_len : pktlen;
1493
1494#define PULLUP_TO(len)						\
1495		do {						\
1496			if ((m)->m_len < (len)) {		\
1497			    args->m = m = m_pullup(m, (len));	\
1498			    if (m == 0)				\
1499				goto pullup_failed;		\
1500			    ip = mtod(m, struct ip *);		\
1501			}					\
1502		} while (0)
1503
1504	if (offset == 0) {
1505		switch (proto) {
1506		case IPPROTO_TCP:
1507		    {
1508			struct tcphdr *tcp;
1509
1510			PULLUP_TO(hlen + sizeof(struct tcphdr));
1511			tcp = L3HDR(struct tcphdr, ip);
1512			dst_port = tcp->th_dport;
1513			src_port = tcp->th_sport;
1514			args->f_id.flags = tcp->th_flags;
1515			}
1516			break;
1517
1518		case IPPROTO_UDP:
1519		    {
1520			struct udphdr *udp;
1521
1522			PULLUP_TO(hlen + sizeof(struct udphdr));
1523			udp = L3HDR(struct udphdr, ip);
1524			dst_port = udp->uh_dport;
1525			src_port = udp->uh_sport;
1526			}
1527			break;
1528
1529		case IPPROTO_ICMP:
1530			PULLUP_TO(hlen + 4);	/* type, code and checksum. */
1531			args->f_id.flags = L3HDR(struct icmp, ip)->icmp_type;
1532			break;
1533
1534		default:
1535			break;
1536		}
1537#undef PULLUP_TO
1538	}
1539
1540	args->f_id.src_ip = ntohl(src_ip.s_addr);
1541	args->f_id.dst_ip = ntohl(dst_ip.s_addr);
1542	args->f_id.src_port = src_port = ntohs(src_port);
1543	args->f_id.dst_port = dst_port = ntohs(dst_port);
1544
1545after_ip_checks:
1546	IPFW_LOCK(chain);		/* XXX expensive? can we run lock free? */
1547	if (args->rule) {
1548		/*
1549		 * Packet has already been tagged. Look for the next rule
1550		 * to restart processing.
1551		 *
1552		 * If fw_one_pass != 0 then just accept it.
1553		 * XXX should not happen here, but optimized out in
1554		 * the caller.
1555		 */
1556		if (fw_one_pass) {
1557			IPFW_UNLOCK(chain);	/* XXX optimize */
1558			return 0;
1559		}
1560
1561		f = args->rule->next_rule;
1562		if (f == NULL)
1563			f = lookup_next_rule(args->rule);
1564	} else {
1565		/*
1566		 * Find the starting rule. It can be either the first
1567		 * one, or the one after divert_rule if asked so.
1568		 */
1569		int skipto = args->divert_rule;
1570
1571		f = chain->rules;
1572		if (args->eh == NULL && skipto != 0) {
1573			if (skipto >= IPFW_DEFAULT_RULE) {
1574				IPFW_UNLOCK(chain);
1575				return(IP_FW_PORT_DENY_FLAG); /* invalid */
1576			}
1577			while (f && f->rulenum <= skipto)
1578				f = f->next;
1579			if (f == NULL) {	/* drop packet */
1580				IPFW_UNLOCK(chain);
1581				return(IP_FW_PORT_DENY_FLAG);
1582			}
1583		}
1584	}
1585	args->divert_rule = 0;	/* reset to avoid confusion later */
1586
1587	/*
1588	 * Now scan the rules, and parse microinstructions for each rule.
1589	 */
1590	for (; f; f = f->next) {
1591		int l, cmdlen;
1592		ipfw_insn *cmd;
1593		int skip_or; /* skip rest of OR block */
1594
1595again:
1596		if (set_disable & (1 << f->set) )
1597			continue;
1598
1599		skip_or = 0;
1600		for (l = f->cmd_len, cmd = f->cmd ; l > 0 ;
1601		    l -= cmdlen, cmd += cmdlen) {
1602			int match;
1603
1604			/*
1605			 * check_body is a jump target used when we find a
1606			 * CHECK_STATE, and need to jump to the body of
1607			 * the target rule.
1608			 */
1609
1610check_body:
1611			cmdlen = F_LEN(cmd);
1612			/*
1613			 * An OR block (insn_1 || .. || insn_n) has the
1614			 * F_OR bit set in all but the last instruction.
1615			 * The first match will set "skip_or", and cause
1616			 * the following instructions to be skipped until
1617			 * past the one with the F_OR bit clear.
1618			 */
1619			if (skip_or) {		/* skip this instruction */
1620				if ((cmd->len & F_OR) == 0)
1621					skip_or = 0;	/* next one is good */
1622				continue;
1623			}
1624			match = 0; /* set to 1 if we succeed */
1625
1626			switch (cmd->opcode) {
1627			/*
1628			 * The first set of opcodes compares the packet's
1629			 * fields with some pattern, setting 'match' if a
1630			 * match is found. At the end of the loop there is
1631			 * logic to deal with F_NOT and F_OR flags associated
1632			 * with the opcode.
1633			 */
1634			case O_NOP:
1635				match = 1;
1636				break;
1637
1638			case O_FORWARD_MAC:
1639				printf("ipfw: opcode %d unimplemented\n",
1640				    cmd->opcode);
1641				break;
1642
1643			case O_GID:
1644			case O_UID:
1645				/*
1646				 * We only check offset == 0 && proto != 0,
1647				 * as this ensures that we have an IPv4
1648				 * packet with the ports info.
1649				 */
1650				if (offset!=0)
1651					break;
1652				if (proto == IPPROTO_TCP ||
1653				    proto == IPPROTO_UDP)
1654					match = check_uidgid(
1655						    (ipfw_insn_u32 *)cmd,
1656						    proto, oif,
1657						    dst_ip, dst_port,
1658						    src_ip, src_port);
1659				break;
1660
1661			case O_RECV:
1662				match = iface_match(m->m_pkthdr.rcvif,
1663				    (ipfw_insn_if *)cmd);
1664				break;
1665
1666			case O_XMIT:
1667				match = iface_match(oif, (ipfw_insn_if *)cmd);
1668				break;
1669
1670			case O_VIA:
1671				match = iface_match(oif ? oif :
1672				    m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
1673				break;
1674
1675			case O_MACADDR2:
1676				if (args->eh != NULL) {	/* have MAC header */
1677					u_int32_t *want = (u_int32_t *)
1678						((ipfw_insn_mac *)cmd)->addr;
1679					u_int32_t *mask = (u_int32_t *)
1680						((ipfw_insn_mac *)cmd)->mask;
1681					u_int32_t *hdr = (u_int32_t *)args->eh;
1682
1683					match =
1684					    ( want[0] == (hdr[0] & mask[0]) &&
1685					      want[1] == (hdr[1] & mask[1]) &&
1686					      want[2] == (hdr[2] & mask[2]) );
1687				}
1688				break;
1689
1690			case O_MAC_TYPE:
1691				if (args->eh != NULL) {
1692					u_int16_t t =
1693					    ntohs(args->eh->ether_type);
1694					u_int16_t *p =
1695					    ((ipfw_insn_u16 *)cmd)->ports;
1696					int i;
1697
1698					for (i = cmdlen - 1; !match && i>0;
1699					    i--, p += 2)
1700						match = (t>=p[0] && t<=p[1]);
1701				}
1702				break;
1703
1704			case O_FRAG:
1705				match = (hlen > 0 && offset != 0);
1706				break;
1707
1708			case O_IN:	/* "out" is "not in" */
1709				match = (oif == NULL);
1710				break;
1711
1712			case O_LAYER2:
1713				match = (args->eh != NULL);
1714				break;
1715
1716			case O_PROTO:
1717				/*
1718				 * We do not allow an arg of 0 so the
1719				 * check of "proto" only suffices.
1720				 */
1721				match = (proto == cmd->arg1);
1722				break;
1723
1724			case O_IP_SRC:
1725				match = (hlen > 0 &&
1726				    ((ipfw_insn_ip *)cmd)->addr.s_addr ==
1727				    src_ip.s_addr);
1728				break;
1729
1730			case O_IP_SRC_MASK:
1731			case O_IP_DST_MASK:
1732				if (hlen > 0) {
1733				    uint32_t a =
1734					(cmd->opcode == O_IP_DST_MASK) ?
1735					    dst_ip.s_addr : src_ip.s_addr;
1736				    uint32_t *p = ((ipfw_insn_u32 *)cmd)->d;
1737				    int i = cmdlen-1;
1738
1739				    for (; !match && i>0; i-= 2, p+= 2)
1740					match = (p[0] == (a & p[1]));
1741				}
1742				break;
1743
1744			case O_IP_SRC_ME:
1745				if (hlen > 0) {
1746					struct ifnet *tif;
1747
1748					INADDR_TO_IFP(src_ip, tif);
1749					match = (tif != NULL);
1750				}
1751				break;
1752
1753			case O_IP_DST_SET:
1754			case O_IP_SRC_SET:
1755				if (hlen > 0) {
1756					u_int32_t *d = (u_int32_t *)(cmd+1);
1757					u_int32_t addr =
1758					    cmd->opcode == O_IP_DST_SET ?
1759						args->f_id.dst_ip :
1760						args->f_id.src_ip;
1761
1762					    if (addr < d[0])
1763						    break;
1764					    addr -= d[0]; /* subtract base */
1765					    match = (addr < cmd->arg1) &&
1766						( d[ 1 + (addr>>5)] &
1767						  (1<<(addr & 0x1f)) );
1768				}
1769				break;
1770
1771			case O_IP_DST:
1772				match = (hlen > 0 &&
1773				    ((ipfw_insn_ip *)cmd)->addr.s_addr ==
1774				    dst_ip.s_addr);
1775				break;
1776
1777			case O_IP_DST_ME:
1778				if (hlen > 0) {
1779					struct ifnet *tif;
1780
1781					INADDR_TO_IFP(dst_ip, tif);
1782					match = (tif != NULL);
1783				}
1784				break;
1785
1786			case O_IP_SRCPORT:
1787			case O_IP_DSTPORT:
1788				/*
1789				 * offset == 0 && proto != 0 is enough
1790				 * to guarantee that we have an IPv4
1791				 * packet with port info.
1792				 */
1793				if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP)
1794				    && offset == 0) {
1795					u_int16_t x =
1796					    (cmd->opcode == O_IP_SRCPORT) ?
1797						src_port : dst_port ;
1798					u_int16_t *p =
1799					    ((ipfw_insn_u16 *)cmd)->ports;
1800					int i;
1801
1802					for (i = cmdlen - 1; !match && i>0;
1803					    i--, p += 2)
1804						match = (x>=p[0] && x<=p[1]);
1805				}
1806				break;
1807
1808			case O_ICMPTYPE:
1809				match = (offset == 0 && proto==IPPROTO_ICMP &&
1810				    icmptype_match(ip, (ipfw_insn_u32 *)cmd) );
1811				break;
1812
1813			case O_IPOPT:
1814				match = (hlen > 0 && ipopts_match(ip, cmd) );
1815				break;
1816
1817			case O_IPVER:
1818				match = (hlen > 0 && cmd->arg1 == ip->ip_v);
1819				break;
1820
1821			case O_IPID:
1822			case O_IPLEN:
1823			case O_IPTTL:
1824				if (hlen > 0) {	/* only for IP packets */
1825				    uint16_t x;
1826				    uint16_t *p;
1827				    int i;
1828
1829				    if (cmd->opcode == O_IPLEN)
1830					x = ip_len;
1831				    else if (cmd->opcode == O_IPTTL)
1832					x = ip->ip_ttl;
1833				    else /* must be IPID */
1834					x = ntohs(ip->ip_id);
1835				    if (cmdlen == 1) {
1836					match = (cmd->arg1 == x);
1837					break;
1838				    }
1839				    /* otherwise we have ranges */
1840				    p = ((ipfw_insn_u16 *)cmd)->ports;
1841				    i = cmdlen - 1;
1842				    for (; !match && i>0; i--, p += 2)
1843					match = (x >= p[0] && x <= p[1]);
1844				}
1845				break;
1846
1847			case O_IPPRECEDENCE:
1848				match = (hlen > 0 &&
1849				    (cmd->arg1 == (ip->ip_tos & 0xe0)) );
1850				break;
1851
1852			case O_IPTOS:
1853				match = (hlen > 0 &&
1854				    flags_match(cmd, ip->ip_tos));
1855				break;
1856
1857			case O_TCPFLAGS:
1858				match = (proto == IPPROTO_TCP && offset == 0 &&
1859				    flags_match(cmd,
1860					L3HDR(struct tcphdr,ip)->th_flags));
1861				break;
1862
1863			case O_TCPOPTS:
1864				match = (proto == IPPROTO_TCP && offset == 0 &&
1865				    tcpopts_match(ip, cmd));
1866				break;
1867
1868			case O_TCPSEQ:
1869				match = (proto == IPPROTO_TCP && offset == 0 &&
1870				    ((ipfw_insn_u32 *)cmd)->d[0] ==
1871					L3HDR(struct tcphdr,ip)->th_seq);
1872				break;
1873
1874			case O_TCPACK:
1875				match = (proto == IPPROTO_TCP && offset == 0 &&
1876				    ((ipfw_insn_u32 *)cmd)->d[0] ==
1877					L3HDR(struct tcphdr,ip)->th_ack);
1878				break;
1879
1880			case O_TCPWIN:
1881				match = (proto == IPPROTO_TCP && offset == 0 &&
1882				    cmd->arg1 ==
1883					L3HDR(struct tcphdr,ip)->th_win);
1884				break;
1885
1886			case O_ESTAB:
1887				/* reject packets which have SYN only */
1888				/* XXX should i also check for TH_ACK ? */
1889				match = (proto == IPPROTO_TCP && offset == 0 &&
1890				    (L3HDR(struct tcphdr,ip)->th_flags &
1891				     (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
1892				break;
1893
1894			case O_LOG:
1895				if (fw_verbose)
1896					ipfw_log(f, hlen, args->eh, m, oif);
1897				match = 1;
1898				break;
1899
1900			case O_PROB:
1901				match = (random()<((ipfw_insn_u32 *)cmd)->d[0]);
1902				break;
1903
1904			case O_VERREVPATH:
1905				/* Outgoing packets automatically pass/match */
1906				match = ((oif != NULL) ||
1907				    (m->m_pkthdr.rcvif == NULL) ||
1908				    verify_rev_path(src_ip, m->m_pkthdr.rcvif));
1909				break;
1910
1911			case O_IPSEC:
1912#ifdef FAST_IPSEC
1913				match = (m_tag_find(m,
1914				    PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL);
1915#endif
1916#ifdef IPSEC
1917				match = (ipsec_getnhist(m) != NULL);
1918#endif
1919				/* otherwise no match */
1920				break;
1921
1922			/*
1923			 * The second set of opcodes represents 'actions',
1924			 * i.e. the terminal part of a rule once the packet
1925			 * matches all previous patterns.
1926			 * Typically there is only one action for each rule,
1927			 * and the opcode is stored at the end of the rule
1928			 * (but there are exceptions -- see below).
1929			 *
1930			 * In general, here we set retval and terminate the
1931			 * outer loop (would be a 'break 3' in some language,
1932			 * but we need to do a 'goto done').
1933			 *
1934			 * Exceptions:
1935			 * O_COUNT and O_SKIPTO actions:
1936			 *   instead of terminating, we jump to the next rule
1937			 *   ('goto next_rule', equivalent to a 'break 2'),
1938			 *   or to the SKIPTO target ('goto again' after
1939			 *   having set f, cmd and l), respectively.
1940			 *
1941			 * O_LIMIT and O_KEEP_STATE: these opcodes are
1942			 *   not real 'actions', and are stored right
1943			 *   before the 'action' part of the rule.
1944			 *   These opcodes try to install an entry in the
1945			 *   state tables; if successful, we continue with
1946			 *   the next opcode (match=1; break;), otherwise
1947			 *   the packet *   must be dropped
1948			 *   ('goto done' after setting retval);
1949			 *
1950			 * O_PROBE_STATE and O_CHECK_STATE: these opcodes
1951			 *   cause a lookup of the state table, and a jump
1952			 *   to the 'action' part of the parent rule
1953			 *   ('goto check_body') if an entry is found, or
1954			 *   (CHECK_STATE only) a jump to the next rule if
1955			 *   the entry is not found ('goto next_rule').
1956			 *   The result of the lookup is cached to make
1957			 *   further instances of these opcodes are
1958			 *   effectively NOPs.
1959			 */
1960			case O_LIMIT:
1961			case O_KEEP_STATE:
1962				if (install_state(f,
1963				    (ipfw_insn_limit *)cmd, args)) {
1964					retval = IP_FW_PORT_DENY_FLAG;
1965					goto done; /* error/limit violation */
1966				}
1967				match = 1;
1968				break;
1969
1970			case O_PROBE_STATE:
1971			case O_CHECK_STATE:
1972				/*
1973				 * dynamic rules are checked at the first
1974				 * keep-state or check-state occurrence,
1975				 * with the result being stored in dyn_dir.
1976				 * The compiler introduces a PROBE_STATE
1977				 * instruction for us when we have a
1978				 * KEEP_STATE (because PROBE_STATE needs
1979				 * to be run first).
1980				 */
1981				if (dyn_dir == MATCH_UNKNOWN &&
1982				    (q = lookup_dyn_rule(&args->f_id,
1983				     &dyn_dir, proto == IPPROTO_TCP ?
1984					L3HDR(struct tcphdr, ip) : NULL))
1985					!= NULL) {
1986					/*
1987					 * Found dynamic entry, update stats
1988					 * and jump to the 'action' part of
1989					 * the parent rule.
1990					 */
1991					q->pcnt++;
1992					q->bcnt += pktlen;
1993					f = q->rule;
1994					cmd = ACTION_PTR(f);
1995					l = f->cmd_len - f->act_ofs;
1996					IPFW_DYN_UNLOCK();
1997					goto check_body;
1998				}
1999				/*
2000				 * Dynamic entry not found. If CHECK_STATE,
2001				 * skip to next rule, if PROBE_STATE just
2002				 * ignore and continue with next opcode.
2003				 */
2004				if (cmd->opcode == O_CHECK_STATE)
2005					goto next_rule;
2006				match = 1;
2007				break;
2008
2009			case O_ACCEPT:
2010				retval = 0;	/* accept */
2011				goto done;
2012
2013			case O_PIPE:
2014			case O_QUEUE:
2015				args->rule = f; /* report matching rule */
2016				retval = cmd->arg1 | IP_FW_PORT_DYNT_FLAG;
2017				goto done;
2018
2019			case O_DIVERT:
2020			case O_TEE:
2021				if (args->eh) /* not on layer 2 */
2022					break;
2023				args->divert_rule = f->rulenum;
2024				retval = (cmd->opcode == O_DIVERT) ?
2025				    cmd->arg1 :
2026				    cmd->arg1 | IP_FW_PORT_TEE_FLAG;
2027				goto done;
2028
2029			case O_COUNT:
2030			case O_SKIPTO:
2031				f->pcnt++;	/* update stats */
2032				f->bcnt += pktlen;
2033				f->timestamp = time_second;
2034				if (cmd->opcode == O_COUNT)
2035					goto next_rule;
2036				/* handle skipto */
2037				if (f->next_rule == NULL)
2038					lookup_next_rule(f);
2039				f = f->next_rule;
2040				goto again;
2041
2042			case O_REJECT:
2043				/*
2044				 * Drop the packet and send a reject notice
2045				 * if the packet is not ICMP (or is an ICMP
2046				 * query), and it is not multicast/broadcast.
2047				 */
2048				if (hlen > 0 &&
2049				    (proto != IPPROTO_ICMP ||
2050				     is_icmp_query(ip)) &&
2051				    !(m->m_flags & (M_BCAST|M_MCAST)) &&
2052				    !IN_MULTICAST(dst_ip.s_addr)) {
2053					send_reject(args, cmd->arg1,
2054					    offset,ip_len);
2055					m = args->m;
2056				}
2057				/* FALLTHROUGH */
2058			case O_DENY:
2059				retval = IP_FW_PORT_DENY_FLAG;
2060				goto done;
2061
2062			case O_FORWARD_IP:
2063				if (args->eh)	/* not valid on layer2 pkts */
2064					break;
2065				if (!q || dyn_dir == MATCH_FORWARD)
2066					args->next_hop =
2067					    &((ipfw_insn_sa *)cmd)->sa;
2068				retval = 0;
2069				goto done;
2070
2071			default:
2072				panic("-- unknown opcode %d\n", cmd->opcode);
2073			} /* end of switch() on opcodes */
2074
2075			if (cmd->len & F_NOT)
2076				match = !match;
2077
2078			if (match) {
2079				if (cmd->len & F_OR)
2080					skip_or = 1;
2081			} else {
2082				if (!(cmd->len & F_OR)) /* not an OR block, */
2083					break;		/* try next rule    */
2084			}
2085
2086		}	/* end of inner for, scan opcodes */
2087
2088next_rule:;		/* try next rule		*/
2089
2090	}		/* end of outer for, scan rules */
2091	printf("ipfw: ouch!, skip past end of rules, denying packet\n");
2092	IPFW_UNLOCK(chain);
2093	return(IP_FW_PORT_DENY_FLAG);
2094
2095done:
2096	/* Update statistics */
2097	f->pcnt++;
2098	f->bcnt += pktlen;
2099	f->timestamp = time_second;
2100	IPFW_UNLOCK(chain);
2101	return retval;
2102
2103pullup_failed:
2104	if (fw_verbose)
2105		printf("ipfw: pullup failed\n");
2106	return(IP_FW_PORT_DENY_FLAG);
2107}
2108
2109/*
2110 * When a rule is added/deleted, clear the next_rule pointers in all rules.
2111 * These will be reconstructed on the fly as packets are matched.
2112 */
2113static void
2114flush_rule_ptrs(struct ip_fw_chain *chain)
2115{
2116	struct ip_fw *rule;
2117
2118	IPFW_LOCK_ASSERT(chain);
2119
2120	for (rule = chain->rules; rule; rule = rule->next)
2121		rule->next_rule = NULL;
2122}
2123
2124/*
2125 * When pipes/queues are deleted, clear the "pipe_ptr" pointer to a given
2126 * pipe/queue, or to all of them (match == NULL).
2127 */
2128void
2129flush_pipe_ptrs(struct dn_flow_set *match)
2130{
2131	struct ip_fw *rule;
2132
2133	IPFW_LOCK(&layer3_chain);
2134	for (rule = layer3_chain.rules; rule; rule = rule->next) {
2135		ipfw_insn_pipe *cmd = (ipfw_insn_pipe *)ACTION_PTR(rule);
2136
2137		if (cmd->o.opcode != O_PIPE && cmd->o.opcode != O_QUEUE)
2138			continue;
2139		/*
2140		 * XXX Use bcmp/bzero to handle pipe_ptr to overcome
2141		 * possible alignment problems on 64-bit architectures.
2142		 * This code is seldom used so we do not worry too
2143		 * much about efficiency.
2144		 */
2145		if (match == NULL ||
2146		    !bcmp(&cmd->pipe_ptr, &match, sizeof(match)) )
2147			bzero(&cmd->pipe_ptr, sizeof(cmd->pipe_ptr));
2148	}
2149	IPFW_UNLOCK(&layer3_chain);
2150}
2151
2152/*
2153 * Add a new rule to the list. Copy the rule into a malloc'ed area, then
2154 * possibly create a rule number and add the rule to the list.
2155 * Update the rule_number in the input struct so the caller knows it as well.
2156 */
2157static int
2158add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
2159{
2160	struct ip_fw *rule, *f, *prev;
2161	int l = RULESIZE(input_rule);
2162
2163	if (chain->rules == NULL && input_rule->rulenum != IPFW_DEFAULT_RULE)
2164		return (EINVAL);
2165
2166	rule = malloc(l, M_IPFW, M_NOWAIT | M_ZERO);
2167	if (rule == NULL)
2168		return (ENOSPC);
2169
2170	bcopy(input_rule, rule, l);
2171
2172	rule->next = NULL;
2173	rule->next_rule = NULL;
2174
2175	rule->pcnt = 0;
2176	rule->bcnt = 0;
2177	rule->timestamp = 0;
2178
2179	IPFW_LOCK(chain);
2180
2181	if (chain->rules == NULL) {	/* default rule */
2182		chain->rules = rule;
2183		goto done;
2184        }
2185
2186	/*
2187	 * If rulenum is 0, find highest numbered rule before the
2188	 * default rule, and add autoinc_step
2189	 */
2190	if (autoinc_step < 1)
2191		autoinc_step = 1;
2192	else if (autoinc_step > 1000)
2193		autoinc_step = 1000;
2194	if (rule->rulenum == 0) {
2195		/*
2196		 * locate the highest numbered rule before default
2197		 */
2198		for (f = chain->rules; f; f = f->next) {
2199			if (f->rulenum == IPFW_DEFAULT_RULE)
2200				break;
2201			rule->rulenum = f->rulenum;
2202		}
2203		if (rule->rulenum < IPFW_DEFAULT_RULE - autoinc_step)
2204			rule->rulenum += autoinc_step;
2205		input_rule->rulenum = rule->rulenum;
2206	}
2207
2208	/*
2209	 * Now insert the new rule in the right place in the sorted list.
2210	 */
2211	for (prev = NULL, f = chain->rules; f; prev = f, f = f->next) {
2212		if (f->rulenum > rule->rulenum) { /* found the location */
2213			if (prev) {
2214				rule->next = f;
2215				prev->next = rule;
2216			} else { /* head insert */
2217				rule->next = chain->rules;
2218				chain->rules = rule;
2219			}
2220			break;
2221		}
2222	}
2223	flush_rule_ptrs(chain);
2224done:
2225	static_count++;
2226	static_len += l;
2227	IPFW_UNLOCK(chain);
2228	DEB(printf("ipfw: installed rule %d, static count now %d\n",
2229		rule->rulenum, static_count);)
2230	return (0);
2231}
2232
2233/**
2234 * Remove a static rule (including derived * dynamic rules)
2235 * and place it on the ``reap list'' for later reclamation.
2236 * The caller is in charge of clearing rule pointers to avoid
2237 * dangling pointers.
2238 * @return a pointer to the next entry.
2239 * Arguments are not checked, so they better be correct.
2240 */
2241static struct ip_fw *
2242remove_rule(struct ip_fw_chain *chain, struct ip_fw *rule, struct ip_fw *prev)
2243{
2244	struct ip_fw *n;
2245	int l = RULESIZE(rule);
2246
2247	IPFW_LOCK_ASSERT(chain);
2248
2249	n = rule->next;
2250	IPFW_DYN_LOCK();
2251	remove_dyn_rule(rule, NULL /* force removal */);
2252	IPFW_DYN_UNLOCK();
2253	if (prev == NULL)
2254		chain->rules = n;
2255	else
2256		prev->next = n;
2257	static_count--;
2258	static_len -= l;
2259
2260	rule->next = chain->reap;
2261	chain->reap = rule;
2262
2263	return n;
2264}
2265
2266/**
2267 * Reclaim storage associated with a list of rules.  This is
2268 * typically the list created using remove_rule.
2269 */
2270static void
2271reap_rules(struct ip_fw *head)
2272{
2273	struct ip_fw *rule;
2274
2275	while ((rule = head) != NULL) {
2276		head = head->next;
2277		if (DUMMYNET_LOADED)
2278			ip_dn_ruledel_ptr(rule);
2279		free(rule, M_IPFW);
2280	}
2281}
2282
2283/*
2284 * Remove all rules from a chain (except rules in set RESVD_SET
2285 * unless kill_default = 1).  The caller is responsible for
2286 * reclaiming storage for the rules left in chain->reap.
2287 */
2288static void
2289free_chain(struct ip_fw_chain *chain, int kill_default)
2290{
2291	struct ip_fw *prev, *rule;
2292
2293	IPFW_LOCK_ASSERT(chain);
2294
2295	flush_rule_ptrs(chain); /* more efficient to do outside the loop */
2296	for (prev = NULL, rule = chain->rules; rule ; )
2297		if (kill_default || rule->set != RESVD_SET)
2298			rule = remove_rule(chain, rule, prev);
2299		else {
2300			prev = rule;
2301			rule = rule->next;
2302		}
2303}
2304
2305/**
2306 * Remove all rules with given number, and also do set manipulation.
2307 * Assumes chain != NULL && *chain != NULL.
2308 *
2309 * The argument is an u_int32_t. The low 16 bit are the rule or set number,
2310 * the next 8 bits are the new set, the top 8 bits are the command:
2311 *
2312 *	0	delete rules with given number
2313 *	1	delete rules with given set number
2314 *	2	move rules with given number to new set
2315 *	3	move rules with given set number to new set
2316 *	4	swap sets with given numbers
2317 */
2318static int
2319del_entry(struct ip_fw_chain *chain, u_int32_t arg)
2320{
2321	struct ip_fw *prev = NULL, *rule;
2322	u_int16_t rulenum;	/* rule or old_set */
2323	u_int8_t cmd, new_set;
2324
2325	rulenum = arg & 0xffff;
2326	cmd = (arg >> 24) & 0xff;
2327	new_set = (arg >> 16) & 0xff;
2328
2329	if (cmd > 4)
2330		return EINVAL;
2331	if (new_set > RESVD_SET)
2332		return EINVAL;
2333	if (cmd == 0 || cmd == 2) {
2334		if (rulenum >= IPFW_DEFAULT_RULE)
2335			return EINVAL;
2336	} else {
2337		if (rulenum > RESVD_SET)	/* old_set */
2338			return EINVAL;
2339	}
2340
2341	IPFW_LOCK(chain);
2342	rule = chain->rules;
2343	chain->reap = NULL;
2344	switch (cmd) {
2345	case 0:	/* delete rules with given number */
2346		/*
2347		 * locate first rule to delete
2348		 */
2349		for (; rule->rulenum < rulenum; prev = rule, rule = rule->next)
2350			;
2351		if (rule->rulenum != rulenum) {
2352			IPFW_UNLOCK(chain);
2353			return EINVAL;
2354		}
2355
2356		/*
2357		 * flush pointers outside the loop, then delete all matching
2358		 * rules. prev remains the same throughout the cycle.
2359		 */
2360		flush_rule_ptrs(chain);
2361		while (rule->rulenum == rulenum)
2362			rule = remove_rule(chain, rule, prev);
2363		break;
2364
2365	case 1:	/* delete all rules with given set number */
2366		flush_rule_ptrs(chain);
2367		rule = chain->rules;
2368		while (rule->rulenum < IPFW_DEFAULT_RULE)
2369			if (rule->set == rulenum)
2370				rule = remove_rule(chain, rule, prev);
2371			else {
2372				prev = rule;
2373				rule = rule->next;
2374			}
2375		break;
2376
2377	case 2:	/* move rules with given number to new set */
2378		rule = chain->rules;
2379		for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next)
2380			if (rule->rulenum == rulenum)
2381				rule->set = new_set;
2382		break;
2383
2384	case 3: /* move rules with given set number to new set */
2385		for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next)
2386			if (rule->set == rulenum)
2387				rule->set = new_set;
2388		break;
2389
2390	case 4: /* swap two sets */
2391		for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next)
2392			if (rule->set == rulenum)
2393				rule->set = new_set;
2394			else if (rule->set == new_set)
2395				rule->set = rulenum;
2396		break;
2397	}
2398	/*
2399	 * Look for rules to reclaim.  We grab the list before
2400	 * releasing the lock then reclaim them w/o the lock to
2401	 * avoid a LOR with dummynet.
2402	 */
2403	rule = chain->reap;
2404	chain->reap = NULL;
2405	IPFW_UNLOCK(chain);
2406	if (rule)
2407		reap_rules(rule);
2408	return 0;
2409}
2410
2411/*
2412 * Clear counters for a specific rule.
2413 * The enclosing "table" is assumed locked.
2414 */
2415static void
2416clear_counters(struct ip_fw *rule, int log_only)
2417{
2418	ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
2419
2420	if (log_only == 0) {
2421		rule->bcnt = rule->pcnt = 0;
2422		rule->timestamp = 0;
2423	}
2424	if (l->o.opcode == O_LOG)
2425		l->log_left = l->max_log;
2426}
2427
2428/**
2429 * Reset some or all counters on firewall rules.
2430 * @arg frwl is null to clear all entries, or contains a specific
2431 * rule number.
2432 * @arg log_only is 1 if we only want to reset logs, zero otherwise.
2433 */
2434static int
2435zero_entry(struct ip_fw_chain *chain, int rulenum, int log_only)
2436{
2437	struct ip_fw *rule;
2438	char *msg;
2439
2440	IPFW_LOCK(chain);
2441	if (rulenum == 0) {
2442		norule_counter = 0;
2443		for (rule = chain->rules; rule; rule = rule->next)
2444			clear_counters(rule, log_only);
2445		msg = log_only ? "ipfw: All logging counts reset.\n" :
2446				"ipfw: Accounting cleared.\n";
2447	} else {
2448		int cleared = 0;
2449		/*
2450		 * We can have multiple rules with the same number, so we
2451		 * need to clear them all.
2452		 */
2453		for (rule = chain->rules; rule; rule = rule->next)
2454			if (rule->rulenum == rulenum) {
2455				while (rule && rule->rulenum == rulenum) {
2456					clear_counters(rule, log_only);
2457					rule = rule->next;
2458				}
2459				cleared = 1;
2460				break;
2461			}
2462		if (!cleared) {	/* we did not find any matching rules */
2463			IPFW_UNLOCK(chain);
2464			return (EINVAL);
2465		}
2466		msg = log_only ? "ipfw: Entry %d logging count reset.\n" :
2467				"ipfw: Entry %d cleared.\n";
2468	}
2469	IPFW_UNLOCK(chain);
2470
2471	if (fw_verbose)
2472		log(LOG_SECURITY | LOG_NOTICE, msg, rulenum);
2473	return (0);
2474}
2475
2476/*
2477 * Check validity of the structure before insert.
2478 * Fortunately rules are simple, so this mostly need to check rule sizes.
2479 */
2480static int
2481check_ipfw_struct(struct ip_fw *rule, int size)
2482{
2483	int l, cmdlen = 0;
2484	int have_action=0;
2485	ipfw_insn *cmd;
2486
2487	if (size < sizeof(*rule)) {
2488		printf("ipfw: rule too short\n");
2489		return (EINVAL);
2490	}
2491	/* first, check for valid size */
2492	l = RULESIZE(rule);
2493	if (l != size) {
2494		printf("ipfw: size mismatch (have %d want %d)\n", size, l);
2495		return (EINVAL);
2496	}
2497	/*
2498	 * Now go for the individual checks. Very simple ones, basically only
2499	 * instruction sizes.
2500	 */
2501	for (l = rule->cmd_len, cmd = rule->cmd ;
2502			l > 0 ; l -= cmdlen, cmd += cmdlen) {
2503		cmdlen = F_LEN(cmd);
2504		if (cmdlen > l) {
2505			printf("ipfw: opcode %d size truncated\n",
2506			    cmd->opcode);
2507			return EINVAL;
2508		}
2509		DEB(printf("ipfw: opcode %d\n", cmd->opcode);)
2510		switch (cmd->opcode) {
2511		case O_PROBE_STATE:
2512		case O_KEEP_STATE:
2513		case O_PROTO:
2514		case O_IP_SRC_ME:
2515		case O_IP_DST_ME:
2516		case O_LAYER2:
2517		case O_IN:
2518		case O_FRAG:
2519		case O_IPOPT:
2520		case O_IPTOS:
2521		case O_IPPRECEDENCE:
2522		case O_IPVER:
2523		case O_TCPWIN:
2524		case O_TCPFLAGS:
2525		case O_TCPOPTS:
2526		case O_ESTAB:
2527		case O_VERREVPATH:
2528		case O_IPSEC:
2529			if (cmdlen != F_INSN_SIZE(ipfw_insn))
2530				goto bad_size;
2531			break;
2532
2533		case O_UID:
2534		case O_GID:
2535		case O_IP_SRC:
2536		case O_IP_DST:
2537		case O_TCPSEQ:
2538		case O_TCPACK:
2539		case O_PROB:
2540		case O_ICMPTYPE:
2541			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
2542				goto bad_size;
2543			break;
2544
2545		case O_LIMIT:
2546			if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
2547				goto bad_size;
2548			break;
2549
2550		case O_LOG:
2551			if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
2552				goto bad_size;
2553
2554			((ipfw_insn_log *)cmd)->log_left =
2555			    ((ipfw_insn_log *)cmd)->max_log;
2556
2557			break;
2558
2559		case O_IP_SRC_MASK:
2560		case O_IP_DST_MASK:
2561			/* only odd command lengths */
2562			if ( !(cmdlen & 1) || cmdlen > 31)
2563				goto bad_size;
2564			break;
2565
2566		case O_IP_SRC_SET:
2567		case O_IP_DST_SET:
2568			if (cmd->arg1 == 0 || cmd->arg1 > 256) {
2569				printf("ipfw: invalid set size %d\n",
2570					cmd->arg1);
2571				return EINVAL;
2572			}
2573			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
2574			    (cmd->arg1+31)/32 )
2575				goto bad_size;
2576			break;
2577
2578		case O_MACADDR2:
2579			if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
2580				goto bad_size;
2581			break;
2582
2583		case O_NOP:
2584		case O_IPID:
2585		case O_IPTTL:
2586		case O_IPLEN:
2587			if (cmdlen < 1 || cmdlen > 31)
2588				goto bad_size;
2589			break;
2590
2591		case O_MAC_TYPE:
2592		case O_IP_SRCPORT:
2593		case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
2594			if (cmdlen < 2 || cmdlen > 31)
2595				goto bad_size;
2596			break;
2597
2598		case O_RECV:
2599		case O_XMIT:
2600		case O_VIA:
2601			if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
2602				goto bad_size;
2603			break;
2604
2605		case O_PIPE:
2606		case O_QUEUE:
2607			if (cmdlen != F_INSN_SIZE(ipfw_insn_pipe))
2608				goto bad_size;
2609			goto check_action;
2610
2611		case O_FORWARD_IP:
2612			if (cmdlen != F_INSN_SIZE(ipfw_insn_sa))
2613				goto bad_size;
2614			goto check_action;
2615
2616		case O_FORWARD_MAC: /* XXX not implemented yet */
2617		case O_CHECK_STATE:
2618		case O_COUNT:
2619		case O_ACCEPT:
2620		case O_DENY:
2621		case O_REJECT:
2622		case O_SKIPTO:
2623		case O_DIVERT:
2624		case O_TEE:
2625			if (cmdlen != F_INSN_SIZE(ipfw_insn))
2626				goto bad_size;
2627check_action:
2628			if (have_action) {
2629				printf("ipfw: opcode %d, multiple actions"
2630					" not allowed\n",
2631					cmd->opcode);
2632				return EINVAL;
2633			}
2634			have_action = 1;
2635			if (l != cmdlen) {
2636				printf("ipfw: opcode %d, action must be"
2637					" last opcode\n",
2638					cmd->opcode);
2639				return EINVAL;
2640			}
2641			break;
2642		default:
2643			printf("ipfw: opcode %d, unknown opcode\n",
2644				cmd->opcode);
2645			return EINVAL;
2646		}
2647	}
2648	if (have_action == 0) {
2649		printf("ipfw: missing action\n");
2650		return EINVAL;
2651	}
2652	return 0;
2653
2654bad_size:
2655	printf("ipfw: opcode %d size %d wrong\n",
2656		cmd->opcode, cmdlen);
2657	return EINVAL;
2658}
2659
2660/*
2661 * Copy the static and dynamic rules to the supplied buffer
2662 * and return the amount of space actually used.
2663 */
2664static size_t
2665ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
2666{
2667	char *bp = buf;
2668	char *ep = bp + space;
2669	struct ip_fw *rule;
2670	int i;
2671
2672	/* XXX this can take a long time and locking will block packet flow */
2673	IPFW_LOCK(chain);
2674	for (rule = chain->rules; rule ; rule = rule->next) {
2675		/*
2676		 * Verify the entry fits in the buffer in case the
2677		 * rules changed between calculating buffer space and
2678		 * now.  This would be better done using a generation
2679		 * number but should suffice for now.
2680		 */
2681		i = RULESIZE(rule);
2682		if (bp + i <= ep) {
2683			bcopy(rule, bp, i);
2684			bcopy(&set_disable, &(((struct ip_fw *)bp)->next_rule),
2685			    sizeof(set_disable));
2686			bp += i;
2687		}
2688	}
2689	IPFW_UNLOCK(chain);
2690	if (ipfw_dyn_v) {
2691		ipfw_dyn_rule *p, *last = NULL;
2692
2693		IPFW_DYN_LOCK();
2694		for (i = 0 ; i < curr_dyn_buckets; i++)
2695			for (p = ipfw_dyn_v[i] ; p != NULL; p = p->next) {
2696				if (bp + sizeof *p <= ep) {
2697					ipfw_dyn_rule *dst =
2698						(ipfw_dyn_rule *)bp;
2699					bcopy(p, dst, sizeof *p);
2700					bcopy(&(p->rule->rulenum), &(dst->rule),
2701					    sizeof(p->rule->rulenum));
2702					/*
2703					 * store a non-null value in "next".
2704					 * The userland code will interpret a
2705					 * NULL here as a marker
2706					 * for the last dynamic rule.
2707					 */
2708					bcopy(&dst, &dst->next, sizeof(dst));
2709					last = dst;
2710					dst->expire =
2711					    TIME_LEQ(dst->expire, time_second) ?
2712						0 : dst->expire - time_second ;
2713					bp += sizeof(ipfw_dyn_rule);
2714				}
2715			}
2716		IPFW_DYN_UNLOCK();
2717		if (last != NULL) /* mark last dynamic rule */
2718			bzero(&last->next, sizeof(last));
2719	}
2720	return (bp - (char *)buf);
2721}
2722
2723
2724/**
2725 * {set|get}sockopt parser.
2726 */
2727static int
2728ipfw_ctl(struct sockopt *sopt)
2729{
2730#define	RULE_MAXSIZE	(256*sizeof(u_int32_t))
2731	int error, rule_num;
2732	size_t size;
2733	struct ip_fw *buf, *rule;
2734	u_int32_t rulenum[2];
2735
2736	/*
2737	 * Disallow modifications in really-really secure mode, but still allow
2738	 * the logging counters to be reset.
2739	 */
2740	if (sopt->sopt_name == IP_FW_ADD ||
2741	    (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) {
2742#if __FreeBSD_version >= 500034
2743		error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
2744		if (error)
2745			return (error);
2746#else /* FreeBSD 4.x */
2747		if (securelevel >= 3)
2748			return (EPERM);
2749#endif
2750	}
2751
2752	error = 0;
2753
2754	switch (sopt->sopt_name) {
2755	case IP_FW_GET:
2756		/*
2757		 * pass up a copy of the current rules. Static rules
2758		 * come first (the last of which has number IPFW_DEFAULT_RULE),
2759		 * followed by a possibly empty list of dynamic rule.
2760		 * The last dynamic rule has NULL in the "next" field.
2761		 *
2762		 * Note that the calculated size is used to bound the
2763		 * amount of data returned to the user.  The rule set may
2764		 * change between calculating the size and returning the
2765		 * data in which case we'll just return what fits.
2766		 */
2767		size = static_len;	/* size of static rules */
2768		if (ipfw_dyn_v)		/* add size of dyn.rules */
2769			size += (dyn_count * sizeof(ipfw_dyn_rule));
2770
2771		/*
2772		 * XXX todo: if the user passes a short length just to know
2773		 * how much room is needed, do not bother filling up the
2774		 * buffer, just jump to the sooptcopyout.
2775		 */
2776		buf = malloc(size, M_TEMP, M_WAITOK);
2777		error = sooptcopyout(sopt, buf,
2778				ipfw_getrules(&layer3_chain, buf, size));
2779		free(buf, M_TEMP);
2780		break;
2781
2782	case IP_FW_FLUSH:
2783		/*
2784		 * Normally we cannot release the lock on each iteration.
2785		 * We could do it here only because we start from the head all
2786		 * the times so there is no risk of missing some entries.
2787		 * On the other hand, the risk is that we end up with
2788		 * a very inconsistent ruleset, so better keep the lock
2789		 * around the whole cycle.
2790		 *
2791		 * XXX this code can be improved by resetting the head of
2792		 * the list to point to the default rule, and then freeing
2793		 * the old list without the need for a lock.
2794		 */
2795
2796		IPFW_LOCK(&layer3_chain);
2797		layer3_chain.reap = NULL;
2798		free_chain(&layer3_chain, 0 /* keep default rule */);
2799		rule = layer3_chain.reap, layer3_chain.reap = NULL;
2800		IPFW_UNLOCK(&layer3_chain);
2801		if (layer3_chain.reap != NULL)
2802			reap_rules(rule);
2803		break;
2804
2805	case IP_FW_ADD:
2806		rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK);
2807		error = sooptcopyin(sopt, rule, RULE_MAXSIZE,
2808			sizeof(struct ip_fw) );
2809		if (error == 0)
2810			error = check_ipfw_struct(rule, sopt->sopt_valsize);
2811		if (error == 0) {
2812			error = add_rule(&layer3_chain, rule);
2813			size = RULESIZE(rule);
2814			if (!error && sopt->sopt_dir == SOPT_GET)
2815				error = sooptcopyout(sopt, rule, size);
2816		}
2817		free(rule, M_TEMP);
2818		break;
2819
2820	case IP_FW_DEL:
2821		/*
2822		 * IP_FW_DEL is used for deleting single rules or sets,
2823		 * and (ab)used to atomically manipulate sets. Argument size
2824		 * is used to distinguish between the two:
2825		 *    sizeof(u_int32_t)
2826		 *	delete single rule or set of rules,
2827		 *	or reassign rules (or sets) to a different set.
2828		 *    2*sizeof(u_int32_t)
2829		 *	atomic disable/enable sets.
2830		 *	first u_int32_t contains sets to be disabled,
2831		 *	second u_int32_t contains sets to be enabled.
2832		 */
2833		error = sooptcopyin(sopt, rulenum,
2834			2*sizeof(u_int32_t), sizeof(u_int32_t));
2835		if (error)
2836			break;
2837		size = sopt->sopt_valsize;
2838		if (size == sizeof(u_int32_t))	/* delete or reassign */
2839			error = del_entry(&layer3_chain, rulenum[0]);
2840		else if (size == 2*sizeof(u_int32_t)) /* set enable/disable */
2841			set_disable =
2842			    (set_disable | rulenum[0]) & ~rulenum[1] &
2843			    ~(1<<RESVD_SET); /* set RESVD_SET always enabled */
2844		else
2845			error = EINVAL;
2846		break;
2847
2848	case IP_FW_ZERO:
2849	case IP_FW_RESETLOG: /* argument is an int, the rule number */
2850		rule_num = 0;
2851		if (sopt->sopt_val != 0) {
2852		    error = sooptcopyin(sopt, &rule_num,
2853			    sizeof(int), sizeof(int));
2854		    if (error)
2855			break;
2856		}
2857		error = zero_entry(&layer3_chain, rule_num,
2858			sopt->sopt_name == IP_FW_RESETLOG);
2859		break;
2860
2861	default:
2862		printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name);
2863		error = EINVAL;
2864	}
2865
2866	return (error);
2867#undef RULE_MAXSIZE
2868}
2869
/**
 * dummynet needs a reference to the default rule, because rules can be
 * deleted while packets hold a reference to them. When this happens,
 * dummynet changes the reference to the default rule (it could well be a
 * NULL pointer, but this way we do not need to check for the special
 * case, plus here we have info on the default behaviour).
 */
2877struct ip_fw *ip_fw_default_rule;
2878
2879/*
2880 * This procedure is only used to handle keepalives. It is invoked
2881 * every dyn_keepalive_period
2882 */
2883static void
2884ipfw_tick(void * __unused unused)
2885{
2886	int i;
2887	ipfw_dyn_rule *q;
2888
2889	if (dyn_keepalive == 0 || ipfw_dyn_v == NULL || dyn_count == 0)
2890		goto done;
2891
2892	IPFW_DYN_LOCK();
2893	for (i = 0 ; i < curr_dyn_buckets ; i++) {
2894		for (q = ipfw_dyn_v[i] ; q ; q = q->next ) {
2895			if (q->dyn_type == O_LIMIT_PARENT)
2896				continue;
2897			if (q->id.proto != IPPROTO_TCP)
2898				continue;
2899			if ( (q->state & BOTH_SYN) != BOTH_SYN)
2900				continue;
2901			if (TIME_LEQ( time_second+dyn_keepalive_interval,
2902			    q->expire))
2903				continue;	/* too early */
2904			if (TIME_LEQ(q->expire, time_second))
2905				continue;	/* too late, rule expired */
2906
2907			send_pkt(&(q->id), q->ack_rev - 1, q->ack_fwd, TH_SYN);
2908			send_pkt(&(q->id), q->ack_fwd - 1, q->ack_rev, 0);
2909		}
2910	}
2911	IPFW_DYN_UNLOCK();
2912done:
2913	callout_reset(&ipfw_timeout, dyn_keepalive_period*hz, ipfw_tick, NULL);
2914}
2915
2916static int
2917ipfw_init(void)
2918{
2919	struct ip_fw default_rule;
2920	int error;
2921
2922	layer3_chain.rules = NULL;
2923	IPFW_LOCK_INIT(&layer3_chain);
2924	IPFW_DYN_LOCK_INIT();
2925	callout_init(&ipfw_timeout, debug_mpsafenet ? CALLOUT_MPSAFE : 0);
2926
2927	bzero(&default_rule, sizeof default_rule);
2928
2929	default_rule.act_ofs = 0;
2930	default_rule.rulenum = IPFW_DEFAULT_RULE;
2931	default_rule.cmd_len = 1;
2932	default_rule.set = RESVD_SET;
2933
2934	default_rule.cmd[0].len = 1;
2935	default_rule.cmd[0].opcode =
2936#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
2937				1 ? O_ACCEPT :
2938#endif
2939				O_DENY;
2940
2941	error = add_rule(&layer3_chain, &default_rule);
2942	if (error != 0) {
2943		printf("ipfw2: error %u initializing default rule "
2944			"(support disabled)\n", error);
2945		IPFW_DYN_LOCK_DESTROY();
2946		IPFW_LOCK_DESTROY(&layer3_chain);
2947		return (error);
2948	}
2949
2950	ip_fw_default_rule = layer3_chain.rules;
2951	printf("ipfw2 initialized, divert %s, "
2952		"rule-based forwarding enabled, default to %s, logging ",
2953#ifdef IPDIVERT
2954		"enabled",
2955#else
2956		"disabled",
2957#endif
2958		default_rule.cmd[0].opcode == O_ACCEPT ? "accept" : "deny");
2959
2960#ifdef IPFIREWALL_VERBOSE
2961	fw_verbose = 1;
2962#endif
2963#ifdef IPFIREWALL_VERBOSE_LIMIT
2964	verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
2965#endif
2966	if (fw_verbose == 0)
2967		printf("disabled\n");
2968	else if (verbose_limit == 0)
2969		printf("unlimited\n");
2970	else
2971		printf("limited to %d packets/entry by default\n",
2972		    verbose_limit);
2973
2974	ip_fw_chk_ptr = ipfw_chk;
2975	ip_fw_ctl_ptr = ipfw_ctl;
2976	callout_reset(&ipfw_timeout, hz, ipfw_tick, NULL);
2977
2978	return (0);
2979}
2980
2981static void
2982ipfw_destroy(void)
2983{
2984	struct ip_fw *reap;
2985
2986	IPFW_LOCK(&layer3_chain);
2987	callout_stop(&ipfw_timeout);
2988	ip_fw_chk_ptr = NULL;
2989	ip_fw_ctl_ptr = NULL;
2990	layer3_chain.reap = NULL;
2991	free_chain(&layer3_chain, 1 /* kill default rule */);
2992	reap = layer3_chain.reap, layer3_chain.reap = NULL;
2993	IPFW_UNLOCK(&layer3_chain);
2994	if (reap != NULL)
2995		reap_rules(reap);
2996
2997	IPFW_DYN_LOCK_DESTROY();
2998	IPFW_LOCK_DESTROY(&layer3_chain);
2999	printf("IP firewall unloaded\n");
3000}
3001
3002static int
3003ipfw_modevent(module_t mod, int type, void *unused)
3004{
3005	int err = 0;
3006
3007	switch (type) {
3008	case MOD_LOAD:
3009		if (IPFW_LOADED) {
3010			printf("IP firewall already loaded\n");
3011			err = EEXIST;
3012		} else {
3013			err = ipfw_init();
3014		}
3015		break;
3016
3017	case MOD_UNLOAD:
3018		ipfw_destroy();
3019		err = 0;
3020		break;
3021	default:
3022		break;
3023	}
3024	return err;
3025}
3026
/*
 * Module glue: describe the "ipfw" module, whose events are delivered
 * to ipfw_modevent, and register it (version 1) with the kernel.
 */
static moduledata_t ipfwmod = {
	"ipfw",			/* module name */
	ipfw_modevent,		/* event handler */
	0			/* third field left as zero */
};
DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(ipfw, 1);
3034#endif /* IPFW2 */
3035