ip_fw2.c revision 117241
1/*
2 * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 *
25 * $FreeBSD: head/sys/netinet/ip_fw2.c 117241 2003-07-04 21:42:32Z luigi $
26 */
27
28#define        DEB(x)
29#define        DDB(x) x
30
31/*
32 * Implement IP packet firewall (new version)
33 */
34
35#if !defined(KLD_MODULE)
36#include "opt_ipfw.h"
37#include "opt_ipdn.h"
38#include "opt_ipdivert.h"
39#include "opt_inet.h"
40#ifndef INET
41#error IPFIREWALL requires INET.
42#endif /* INET */
43#endif
44
45#define IPFW2	1
46#if IPFW2
47#include <sys/param.h>
48#include <sys/systm.h>
49#include <sys/malloc.h>
50#include <sys/mbuf.h>
51#include <sys/kernel.h>
52#include <sys/proc.h>
53#include <sys/socket.h>
54#include <sys/socketvar.h>
55#include <sys/sysctl.h>
56#include <sys/syslog.h>
57#include <sys/ucred.h>
58#include <net/if.h>
59#include <net/route.h>
60#include <netinet/in.h>
61#include <netinet/in_systm.h>
62#include <netinet/in_var.h>
63#include <netinet/in_pcb.h>
64#include <netinet/ip.h>
65#include <netinet/ip_var.h>
66#include <netinet/ip_icmp.h>
67#include <netinet/ip_fw.h>
68#include <netinet/ip_dummynet.h>
69#include <netinet/tcp.h>
70#include <netinet/tcp_timer.h>
71#include <netinet/tcp_var.h>
72#include <netinet/tcpip.h>
73#include <netinet/udp.h>
74#include <netinet/udp_var.h>
75
76#ifdef IPSEC
77#include <netinet6/ipsec.h>
78#endif
79
80#include <netinet/if_ether.h> /* XXX for ETHERTYPE_IP */
81
82#include <machine/in_cksum.h>	/* XXX for in_cksum */
83
84/*
85 * XXX This one should go in sys/mbuf.h. It is used to avoid that
86 * a firewall-generated packet loops forever through the firewall.
87 */
88#ifndef	M_SKIP_FIREWALL
89#define M_SKIP_FIREWALL         0x4000
90#endif
91
92/*
93 * set_disable contains one bit per set value (0..31).
94 * If the bit is set, all rules with the corresponding set
95 * are disabled. Set 31 is reserved for the default rule
96 * and CANNOT be disabled.
97 */
98static u_int32_t set_disable;
99
100static int fw_verbose;
101static int verbose_limit;
102
103static struct callout_handle ipfw_timeout_h;
104#define	IPFW_DEFAULT_RULE	65535
105
106/*
107 * list of rules for layer 3
108 */
109static struct ip_fw *layer3_chain;
110
111MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
112
113static int fw_debug = 1;
114static int autoinc_step = 100; /* bounded to 1..1000 in add_rule() */
115
116#ifdef SYSCTL_NODE
117SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
118SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, enable,
119    CTLFLAG_RW | CTLFLAG_SECURE3,
120    &fw_enable, 0, "Enable ipfw");
121SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_RW,
122    &autoinc_step, 0, "Rule number autincrement step");
123SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
124    CTLFLAG_RW | CTLFLAG_SECURE3,
125    &fw_one_pass, 0,
126    "Only do a single pass through ipfw when using dummynet(4)");
127SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW,
128    &fw_debug, 0, "Enable printing of debug ip_fw statements");
129SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose,
130    CTLFLAG_RW | CTLFLAG_SECURE3,
131    &fw_verbose, 0, "Log matches to ipfw rules");
132SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW,
133    &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged");
134
135/*
136 * Description of dynamic rules.
137 *
138 * Dynamic rules are stored in lists accessed through a hash table
139 * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can
140 * be modified through the sysctl variable dyn_buckets which is
141 * updated when the table becomes empty.
142 *
143 * XXX currently there is only one list, ipfw_dyn.
144 *
145 * When a packet is received, its address fields are first masked
146 * with the mask defined for the rule, then hashed, then matched
147 * against the entries in the corresponding list.
148 * Dynamic rules can be used for different purposes:
149 *  + stateful rules;
150 *  + enforcing limits on the number of sessions;
151 *  + in-kernel NAT (not implemented yet)
152 *
153 * The lifetime of dynamic rules is regulated by dyn_*_lifetime,
154 * measured in seconds and depending on the flags.
155 *
156 * The total number of dynamic rules is stored in dyn_count.
157 * The max number of dynamic rules is dyn_max. When we reach
158 * the maximum number of rules we do not create anymore. This is
159 * done to avoid consuming too much memory, but also too much
160 * time when searching on each packet (ideally, we should try instead
161 * to put a limit on the length of the list on each bucket...).
162 *
163 * Each dynamic rule holds a pointer to the parent ipfw rule so
164 * we know what action to perform. Dynamic rules are removed when
165 * the parent rule is deleted. XXX we should make them survive.
166 *
167 * There are some limitations with dynamic rules -- we do not
168 * obey the 'randomized match', and we do not do multiple
169 * passes through the firewall. XXX check the latter!!!
170 */
171static ipfw_dyn_rule **ipfw_dyn_v = NULL;
172static u_int32_t dyn_buckets = 256; /* must be power of 2 */
173static u_int32_t curr_dyn_buckets = 256; /* must be power of 2 */
174
175/*
 * Timeouts for various events in handling dynamic rules.
177 */
178static u_int32_t dyn_ack_lifetime = 300;
179static u_int32_t dyn_syn_lifetime = 20;
180static u_int32_t dyn_fin_lifetime = 1;
181static u_int32_t dyn_rst_lifetime = 1;
182static u_int32_t dyn_udp_lifetime = 10;
183static u_int32_t dyn_short_lifetime = 5;
184
185/*
186 * Keepalives are sent if dyn_keepalive is set. They are sent every
187 * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
188 * seconds of lifetime of a rule.
189 * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower
190 * than dyn_keepalive_period.
191 */
192
193static u_int32_t dyn_keepalive_interval = 20;
194static u_int32_t dyn_keepalive_period = 5;
195static u_int32_t dyn_keepalive = 1;	/* do send keepalives */
196
197static u_int32_t static_count;	/* # of static rules */
198static u_int32_t static_len;	/* size in bytes of static rules */
199static u_int32_t dyn_count;		/* # of dynamic rules */
200static u_int32_t dyn_max = 4096;	/* max # of dynamic rules */
201
202SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW,
203    &dyn_buckets, 0, "Number of dyn. buckets");
204SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_RD,
205    &curr_dyn_buckets, 0, "Current Number of dyn. buckets");
206SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_RD,
207    &dyn_count, 0, "Number of dyn. rules");
208SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_RW,
209    &dyn_max, 0, "Max number of dyn. rules");
210SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD,
211    &static_count, 0, "Number of static rules");
212SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW,
213    &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
214SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW,
215    &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
216SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW,
217    &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
218SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW,
219    &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
220SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW,
221    &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
222SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW,
223    &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations");
224SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW,
225    &dyn_keepalive, 0, "Enable keepalives for dyn. rules");
226
227#endif /* SYSCTL_NODE */
228
229
230static ip_fw_chk_t	ipfw_chk;
231
232ip_dn_ruledel_t *ip_dn_ruledel_ptr = NULL;	/* hook into dummynet */
233
234/*
235 * This macro maps an ip pointer into a layer3 header pointer of type T
236 */
237#define	L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
238
239static __inline int
240icmptype_match(struct ip *ip, ipfw_insn_u32 *cmd)
241{
242	int type = L3HDR(struct icmp,ip)->icmp_type;
243
244	return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<<type)) );
245}
246
247#define TT	( (1 << ICMP_ECHO) | (1 << ICMP_ROUTERSOLICIT) | \
248    (1 << ICMP_TSTAMP) | (1 << ICMP_IREQ) | (1 << ICMP_MASKREQ) )
249
250static int
251is_icmp_query(struct ip *ip)
252{
253	int type = L3HDR(struct icmp, ip)->icmp_type;
254	return (type <= ICMP_MAXTYPE && (TT & (1<<type)) );
255}
256#undef TT
257
258/*
259 * The following checks use two arrays of 8 or 16 bits to store the
260 * bits that we want set or clear, respectively. They are in the
261 * low and high half of cmd->arg1 or cmd->d[0].
262 *
263 * We scan options and store the bits we find set. We succeed if
264 *
265 *	(want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
266 *
267 * The code is sometimes optimized not to store additional variables.
268 */
269
270static int
271flags_match(ipfw_insn *cmd, u_int8_t bits)
272{
273	u_char want_clear;
274	bits = ~bits;
275
276	if ( ((cmd->arg1 & 0xff) & bits) != 0)
277		return 0; /* some bits we want set were clear */
278	want_clear = (cmd->arg1 >> 8) & 0xff;
279	if ( (want_clear & bits) != want_clear)
280		return 0; /* some bits we want clear were set */
281	return 1;
282}
283
284static int
285ipopts_match(struct ip *ip, ipfw_insn *cmd)
286{
287	int optlen, bits = 0;
288	u_char *cp = (u_char *)(ip + 1);
289	int x = (ip->ip_hl << 2) - sizeof (struct ip);
290
291	for (; x > 0; x -= optlen, cp += optlen) {
292		int opt = cp[IPOPT_OPTVAL];
293
294		if (opt == IPOPT_EOL)
295			break;
296		if (opt == IPOPT_NOP)
297			optlen = 1;
298		else {
299			optlen = cp[IPOPT_OLEN];
300			if (optlen <= 0 || optlen > x)
301				return 0; /* invalid or truncated */
302		}
303		switch (opt) {
304
305		default:
306			break;
307
308		case IPOPT_LSRR:
309			bits |= IP_FW_IPOPT_LSRR;
310			break;
311
312		case IPOPT_SSRR:
313			bits |= IP_FW_IPOPT_SSRR;
314			break;
315
316		case IPOPT_RR:
317			bits |= IP_FW_IPOPT_RR;
318			break;
319
320		case IPOPT_TS:
321			bits |= IP_FW_IPOPT_TS;
322			break;
323		}
324	}
325	return (flags_match(cmd, bits));
326}
327
328static int
329tcpopts_match(struct ip *ip, ipfw_insn *cmd)
330{
331	int optlen, bits = 0;
332	struct tcphdr *tcp = L3HDR(struct tcphdr,ip);
333	u_char *cp = (u_char *)(tcp + 1);
334	int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
335
336	for (; x > 0; x -= optlen, cp += optlen) {
337		int opt = cp[0];
338		if (opt == TCPOPT_EOL)
339			break;
340		if (opt == TCPOPT_NOP)
341			optlen = 1;
342		else {
343			optlen = cp[1];
344			if (optlen <= 0)
345				break;
346		}
347
348		switch (opt) {
349
350		default:
351			break;
352
353		case TCPOPT_MAXSEG:
354			bits |= IP_FW_TCPOPT_MSS;
355			break;
356
357		case TCPOPT_WINDOW:
358			bits |= IP_FW_TCPOPT_WINDOW;
359			break;
360
361		case TCPOPT_SACK_PERMITTED:
362		case TCPOPT_SACK:
363			bits |= IP_FW_TCPOPT_SACK;
364			break;
365
366		case TCPOPT_TIMESTAMP:
367			bits |= IP_FW_TCPOPT_TS;
368			break;
369
370		case TCPOPT_CC:
371		case TCPOPT_CCNEW:
372		case TCPOPT_CCECHO:
373			bits |= IP_FW_TCPOPT_CC;
374			break;
375		}
376	}
377	return (flags_match(cmd, bits));
378}
379
380static int
381iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
382{
383	if (ifp == NULL)	/* no iface with this packet, match fails */
384		return 0;
385	/* Check by name or by IP address */
386	if (cmd->name[0] != '\0') { /* match by name */
387		/* Check unit number (-1 is wildcard) */
388		if (cmd->p.unit != -1 && cmd->p.unit != ifp->if_unit)
389			return(0);
390		/* Check name */
391		if (!strncmp(ifp->if_name, cmd->name, IFNAMSIZ))
392			return(1);
393	} else {
394		struct ifaddr *ia;
395
396		TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) {
397			if (ia->ifa_addr == NULL)
398				continue;
399			if (ia->ifa_addr->sa_family != AF_INET)
400				continue;
401			if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
402			    (ia->ifa_addr))->sin_addr.s_addr)
403				return(1);	/* match */
404		}
405	}
406	return(0);	/* no match, fail ... */
407}
408
409/*
410 * The 'verrevpath' option checks that the interface that an IP packet
411 * arrives on is the same interface that traffic destined for the
412 * packet's source address would be routed out of. This is a measure
413 * to block forged packets. This is also commonly known as "anti-spoofing"
414 * or Unicast Reverse Path Forwarding (Unicast RFP) in Cisco-ese. The
415 * name of the knob is purposely reminisent of the Cisco IOS command,
416 *
417 *   ip verify unicast reverse-path
418 *
419 * which implements the same functionality. But note that syntax is
420 * misleading. The check may be performed on all IP packets whether unicast,
421 * multicast, or broadcast.
422 */
423static int
424verify_rev_path(struct in_addr src, struct ifnet *ifp)
425{
426	static struct route ro;
427	struct sockaddr_in *dst;
428
429	dst = (struct sockaddr_in *)&(ro.ro_dst);
430
431	/* Check if we've cached the route from the previous call. */
432	if (src.s_addr != dst->sin_addr.s_addr) {
433		ro.ro_rt = NULL;
434
435		bzero(dst, sizeof(*dst));
436		dst->sin_family = AF_INET;
437		dst->sin_len = sizeof(*dst);
438		dst->sin_addr = src;
439
440		rtalloc_ign(&ro, RTF_CLONING|RTF_PRCLONING);
441	}
442
443	if ((ro.ro_rt == NULL) || (ifp == NULL) ||
444	    (ro.ro_rt->rt_ifp->if_index != ifp->if_index))
445		return 0;
446
447	return 1;
448}
449
450
451static u_int64_t norule_counter;	/* counter for ipfw_log(NULL...) */
452
453#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
454#define SNP(buf) buf, sizeof(buf)
455
456/*
457 * We enter here when we have a rule with O_LOG.
458 * XXX this function alone takes about 2Kbytes of code!
459 */
460static void
461ipfw_log(struct ip_fw *f, u_int hlen, struct ether_header *eh,
462	struct mbuf *m, struct ifnet *oif)
463{
464	char *action;
465	int limit_reached = 0;
466	char action2[40], proto[48], fragment[28];
467
468	fragment[0] = '\0';
469	proto[0] = '\0';
470
471	if (f == NULL) {	/* bogus pkt */
472		if (verbose_limit != 0 && norule_counter >= verbose_limit)
473			return;
474		norule_counter++;
475		if (norule_counter == verbose_limit)
476			limit_reached = verbose_limit;
477		action = "Refuse";
478	} else {	/* O_LOG is the first action, find the real one */
479		ipfw_insn *cmd = ACTION_PTR(f);
480		ipfw_insn_log *l = (ipfw_insn_log *)cmd;
481
482		if (l->max_log != 0 && l->log_left == 0)
483			return;
484		l->log_left--;
485		if (l->log_left == 0)
486			limit_reached = l->max_log;
487		cmd += F_LEN(cmd);	/* point to first action */
488		if (cmd->opcode == O_PROB)
489			cmd += F_LEN(cmd);
490
491		action = action2;
492		switch (cmd->opcode) {
493		case O_DENY:
494			action = "Deny";
495			break;
496
497		case O_REJECT:
498			if (cmd->arg1==ICMP_REJECT_RST)
499				action = "Reset";
500			else if (cmd->arg1==ICMP_UNREACH_HOST)
501				action = "Reject";
502			else
503				snprintf(SNPARGS(action2, 0), "Unreach %d",
504					cmd->arg1);
505			break;
506
507		case O_ACCEPT:
508			action = "Accept";
509			break;
510		case O_COUNT:
511			action = "Count";
512			break;
513		case O_DIVERT:
514			snprintf(SNPARGS(action2, 0), "Divert %d",
515				cmd->arg1);
516			break;
517		case O_TEE:
518			snprintf(SNPARGS(action2, 0), "Tee %d",
519				cmd->arg1);
520			break;
521		case O_SKIPTO:
522			snprintf(SNPARGS(action2, 0), "SkipTo %d",
523				cmd->arg1);
524			break;
525		case O_PIPE:
526			snprintf(SNPARGS(action2, 0), "Pipe %d",
527				cmd->arg1);
528			break;
529		case O_QUEUE:
530			snprintf(SNPARGS(action2, 0), "Queue %d",
531				cmd->arg1);
532			break;
533		case O_FORWARD_IP: {
534			ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
535			int len;
536
537			len = snprintf(SNPARGS(action2, 0), "Forward to %s",
538				inet_ntoa(sa->sa.sin_addr));
539			if (sa->sa.sin_port)
540				snprintf(SNPARGS(action2, len), ":%d",
541				    sa->sa.sin_port);
542			}
543			break;
544		default:
545			action = "UNKNOWN";
546			break;
547		}
548	}
549
550	if (hlen == 0) {	/* non-ip */
551		snprintf(SNPARGS(proto, 0), "MAC");
552	} else {
553		struct ip *ip = mtod(m, struct ip *);
554		/* these three are all aliases to the same thing */
555		struct icmp *const icmp = L3HDR(struct icmp, ip);
556		struct tcphdr *const tcp = (struct tcphdr *)icmp;
557		struct udphdr *const udp = (struct udphdr *)icmp;
558
559		int ip_off, offset, ip_len;
560
561		int len;
562
563		if (eh != NULL) { /* layer 2 packets are as on the wire */
564			ip_off = ntohs(ip->ip_off);
565			ip_len = ntohs(ip->ip_len);
566		} else {
567			ip_off = ip->ip_off;
568			ip_len = ip->ip_len;
569		}
570		offset = ip_off & IP_OFFMASK;
571		switch (ip->ip_p) {
572		case IPPROTO_TCP:
573			len = snprintf(SNPARGS(proto, 0), "TCP %s",
574			    inet_ntoa(ip->ip_src));
575			if (offset == 0)
576				snprintf(SNPARGS(proto, len), ":%d %s:%d",
577				    ntohs(tcp->th_sport),
578				    inet_ntoa(ip->ip_dst),
579				    ntohs(tcp->th_dport));
580			else
581				snprintf(SNPARGS(proto, len), " %s",
582				    inet_ntoa(ip->ip_dst));
583			break;
584
585		case IPPROTO_UDP:
586			len = snprintf(SNPARGS(proto, 0), "UDP %s",
587				inet_ntoa(ip->ip_src));
588			if (offset == 0)
589				snprintf(SNPARGS(proto, len), ":%d %s:%d",
590				    ntohs(udp->uh_sport),
591				    inet_ntoa(ip->ip_dst),
592				    ntohs(udp->uh_dport));
593			else
594				snprintf(SNPARGS(proto, len), " %s",
595				    inet_ntoa(ip->ip_dst));
596			break;
597
598		case IPPROTO_ICMP:
599			if (offset == 0)
600				len = snprintf(SNPARGS(proto, 0),
601				    "ICMP:%u.%u ",
602				    icmp->icmp_type, icmp->icmp_code);
603			else
604				len = snprintf(SNPARGS(proto, 0), "ICMP ");
605			len += snprintf(SNPARGS(proto, len), "%s",
606			    inet_ntoa(ip->ip_src));
607			snprintf(SNPARGS(proto, len), " %s",
608			    inet_ntoa(ip->ip_dst));
609			break;
610
611		default:
612			len = snprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p,
613			    inet_ntoa(ip->ip_src));
614			snprintf(SNPARGS(proto, len), " %s",
615			    inet_ntoa(ip->ip_dst));
616			break;
617		}
618
619		if (ip_off & (IP_MF | IP_OFFMASK))
620			snprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)",
621			     ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
622			     offset << 3,
623			     (ip_off & IP_MF) ? "+" : "");
624	}
625	if (oif || m->m_pkthdr.rcvif)
626		log(LOG_SECURITY | LOG_INFO,
627		    "ipfw: %d %s %s %s via %s%d%s\n",
628		    f ? f->rulenum : -1,
629		    action, proto, oif ? "out" : "in",
630		    oif ? oif->if_name : m->m_pkthdr.rcvif->if_name,
631		    oif ? oif->if_unit : m->m_pkthdr.rcvif->if_unit,
632		    fragment);
633	else
634		log(LOG_SECURITY | LOG_INFO,
635		    "ipfw: %d %s %s [no if info]%s\n",
636		    f ? f->rulenum : -1,
637		    action, proto, fragment);
638	if (limit_reached)
639		log(LOG_SECURITY | LOG_NOTICE,
640		    "ipfw: limit %d reached on entry %d\n",
641		    limit_reached, f ? f->rulenum : -1);
642}
643
644/*
645 * IMPORTANT: the hash function for dynamic rules must be commutative
646 * in source and destination (ip,port), because rules are bidirectional
647 * and we want to find both in the same bucket.
648 */
649static __inline int
650hash_packet(struct ipfw_flow_id *id)
651{
652	u_int32_t i;
653
654	i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port);
655	i &= (curr_dyn_buckets - 1);
656	return i;
657}
658
/**
 * Unlink a dynamic rule from a chain and free it. prev is a pointer to
 * the previous entry (NULL if q is the first one), head is the head of
 * the chain, q is a pointer to the entry to delete.
 *
 * Modifies q (on exit it points to the entry that followed the deleted
 * one) and potentially also head. If the entry is of type O_LIMIT the
 * count of its parent is decremented. dyn_count is decremented.
 *
 * q is evaluated several times and assigned to, so it must be a plain
 * lvalue without side effects. The body is wrapped in do { } while (0)
 * so that the macro behaves as a single statement, e.g. in an
 * unbraced if/else.
 */
#define UNLINK_DYN_RULE(prev, head, q) do {				\
	ipfw_dyn_rule *old_q = (q);					\
									\
	/* remove a refcount to the parent */				\
	if ((q)->dyn_type == O_LIMIT)					\
		(q)->parent->count--;					\
	DEB(printf("ipfw: unlink entry 0x%08x %d -> 0x%08x %d, %d left\n",\
		((q)->id.src_ip), ((q)->id.src_port),			\
		((q)->id.dst_ip), ((q)->id.dst_port), dyn_count-1 ); )	\
	if ((prev) != NULL)						\
		(prev)->next = (q) = (q)->next;				\
	else								\
		(head) = (q) = (q)->next;				\
	dyn_count--;							\
	free(old_q, M_IPFW);						\
} while (0)
680
681#define TIME_LEQ(a,b)       ((int)((a)-(b)) <= 0)
682
683/**
684 * Remove dynamic rules pointing to "rule", or all of them if rule == NULL.
685 *
686 * If keep_me == NULL, rules are deleted even if not expired,
687 * otherwise only expired rules are removed.
688 *
689 * The value of the second parameter is also used to point to identify
690 * a rule we absolutely do not want to remove (e.g. because we are
691 * holding a reference to it -- this is the case with O_LIMIT_PARENT
692 * rules). The pointer is only used for comparison, so any non-null
693 * value will do.
694 */
695static void
696remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me)
697{
698	static u_int32_t last_remove = 0;
699
700#define FORCE (keep_me == NULL)
701
702	ipfw_dyn_rule *prev, *q;
703	int i, pass = 0, max_pass = 0;
704
705	if (ipfw_dyn_v == NULL || dyn_count == 0)
706		return;
707	/* do not expire more than once per second, it is useless */
708	if (!FORCE && last_remove == time_second)
709		return;
710	last_remove = time_second;
711
712	/*
713	 * because O_LIMIT refer to parent rules, during the first pass only
714	 * remove child and mark any pending LIMIT_PARENT, and remove
715	 * them in a second pass.
716	 */
717next_pass:
718	for (i = 0 ; i < curr_dyn_buckets ; i++) {
719		for (prev=NULL, q = ipfw_dyn_v[i] ; q ; ) {
720			/*
721			 * Logic can become complex here, so we split tests.
722			 */
723			if (q == keep_me)
724				goto next;
725			if (rule != NULL && rule != q->rule)
726				goto next; /* not the one we are looking for */
727			if (q->dyn_type == O_LIMIT_PARENT) {
728				/*
729				 * handle parent in the second pass,
730				 * record we need one.
731				 */
732				max_pass = 1;
733				if (pass == 0)
734					goto next;
735				if (FORCE && q->count != 0 ) {
736					/* XXX should not happen! */
737					printf("ipfw: OUCH! cannot remove rule,"
738					     " count %d\n", q->count);
739				}
740			} else {
741				if (!FORCE &&
742				    !TIME_LEQ( q->expire, time_second ))
743					goto next;
744			}
745			UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q);
746			continue;
747next:
748			prev=q;
749			q=q->next;
750		}
751	}
752	if (pass++ < max_pass)
753		goto next_pass;
754}
755
756
757/**
758 * lookup a dynamic rule.
759 */
760static ipfw_dyn_rule *
761lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction,
762	struct tcphdr *tcp)
763{
764	/*
765	 * stateful ipfw extensions.
766	 * Lookup into dynamic session queue
767	 */
768#define MATCH_REVERSE	0
769#define MATCH_FORWARD	1
770#define MATCH_NONE	2
771#define MATCH_UNKNOWN	3
772	int i, dir = MATCH_NONE;
773	ipfw_dyn_rule *prev, *q=NULL;
774
775	if (ipfw_dyn_v == NULL)
776		goto done;	/* not found */
777	i = hash_packet( pkt );
778	for (prev=NULL, q = ipfw_dyn_v[i] ; q != NULL ; ) {
779		if (q->dyn_type == O_LIMIT_PARENT)
780			goto next;
781		if (TIME_LEQ( q->expire, time_second)) { /* expire entry */
782			UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q);
783			continue;
784		}
785		if ( pkt->proto == q->id.proto) {
786			if (pkt->src_ip == q->id.src_ip &&
787			    pkt->dst_ip == q->id.dst_ip &&
788			    pkt->src_port == q->id.src_port &&
789			    pkt->dst_port == q->id.dst_port ) {
790				dir = MATCH_FORWARD;
791				break;
792			}
793			if (pkt->src_ip == q->id.dst_ip &&
794			    pkt->dst_ip == q->id.src_ip &&
795			    pkt->src_port == q->id.dst_port &&
796			    pkt->dst_port == q->id.src_port ) {
797				dir = MATCH_REVERSE;
798				break;
799			}
800		}
801next:
802		prev = q;
803		q = q->next;
804	}
805	if (q == NULL)
806		goto done; /* q = NULL, not found */
807
808	if ( prev != NULL) { /* found and not in front */
809		prev->next = q->next;
810		q->next = ipfw_dyn_v[i];
811		ipfw_dyn_v[i] = q;
812	}
813	if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
814		u_char flags = pkt->flags & (TH_FIN|TH_SYN|TH_RST);
815
816#define BOTH_SYN	(TH_SYN | (TH_SYN << 8))
817#define BOTH_FIN	(TH_FIN | (TH_FIN << 8))
818		q->state |= (dir == MATCH_FORWARD ) ? flags : (flags << 8);
819		switch (q->state) {
820		case TH_SYN:				/* opening */
821			q->expire = time_second + dyn_syn_lifetime;
822			break;
823
824		case BOTH_SYN:			/* move to established */
825		case BOTH_SYN | TH_FIN :	/* one side tries to close */
826		case BOTH_SYN | (TH_FIN << 8) :
827 			if (tcp) {
828#define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0)
829			    u_int32_t ack = ntohl(tcp->th_ack);
830			    if (dir == MATCH_FORWARD) {
831				if (q->ack_fwd == 0 || _SEQ_GE(ack, q->ack_fwd))
832				    q->ack_fwd = ack;
833				else { /* ignore out-of-sequence */
834				    break;
835				}
836			    } else {
837				if (q->ack_rev == 0 || _SEQ_GE(ack, q->ack_rev))
838				    q->ack_rev = ack;
839				else { /* ignore out-of-sequence */
840				    break;
841				}
842			    }
843			}
844			q->expire = time_second + dyn_ack_lifetime;
845			break;
846
847		case BOTH_SYN | BOTH_FIN:	/* both sides closed */
848			if (dyn_fin_lifetime >= dyn_keepalive_period)
849				dyn_fin_lifetime = dyn_keepalive_period - 1;
850			q->expire = time_second + dyn_fin_lifetime;
851			break;
852
853		default:
854#if 0
855			/*
856			 * reset or some invalid combination, but can also
857			 * occur if we use keep-state the wrong way.
858			 */
859			if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0)
860				printf("invalid state: 0x%x\n", q->state);
861#endif
862			if (dyn_rst_lifetime >= dyn_keepalive_period)
863				dyn_rst_lifetime = dyn_keepalive_period - 1;
864			q->expire = time_second + dyn_rst_lifetime;
865			break;
866		}
867	} else if (pkt->proto == IPPROTO_UDP) {
868		q->expire = time_second + dyn_udp_lifetime;
869	} else {
870		/* other protocols */
871		q->expire = time_second + dyn_short_lifetime;
872	}
873done:
874	if (match_direction)
875		*match_direction = dir;
876	return q;
877}
878
879static void
880realloc_dynamic_table(void)
881{
882	/*
883	 * Try reallocation, make sure we have a power of 2 and do
884	 * not allow more than 64k entries. In case of overflow,
885	 * default to 1024.
886	 */
887
888	if (dyn_buckets > 65536)
889		dyn_buckets = 1024;
890	if ((dyn_buckets & (dyn_buckets-1)) != 0) { /* not a power of 2 */
891		dyn_buckets = curr_dyn_buckets; /* reset */
892		return;
893	}
894	curr_dyn_buckets = dyn_buckets;
895	if (ipfw_dyn_v != NULL)
896		free(ipfw_dyn_v, M_IPFW);
897	for (;;) {
898		ipfw_dyn_v = malloc(curr_dyn_buckets * sizeof(ipfw_dyn_rule *),
899		       M_IPFW, M_NOWAIT | M_ZERO);
900		if (ipfw_dyn_v != NULL || curr_dyn_buckets <= 2)
901			break;
902		curr_dyn_buckets /= 2;
903	}
904}
905
906/**
907 * Install state of type 'type' for a dynamic session.
908 * The hash table contains two type of rules:
909 * - regular rules (O_KEEP_STATE)
910 * - rules for sessions with limited number of sess per user
911 *   (O_LIMIT). When they are created, the parent is
912 *   increased by 1, and decreased on delete. In this case,
913 *   the third parameter is the parent rule and not the chain.
914 * - "parent" rules for the above (O_LIMIT_PARENT).
915 */
916static ipfw_dyn_rule *
917add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule)
918{
919	ipfw_dyn_rule *r;
920	int i;
921
922	if (ipfw_dyn_v == NULL ||
923	    (dyn_count == 0 && dyn_buckets != curr_dyn_buckets)) {
924		realloc_dynamic_table();
925		if (ipfw_dyn_v == NULL)
926			return NULL; /* failed ! */
927	}
928	i = hash_packet(id);
929
930	r = malloc(sizeof *r, M_IPFW, M_NOWAIT | M_ZERO);
931	if (r == NULL) {
932		printf ("ipfw: sorry cannot allocate state\n");
933		return NULL;
934	}
935
936	/* increase refcount on parent, and set pointer */
937	if (dyn_type == O_LIMIT) {
938		ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule;
939		if ( parent->dyn_type != O_LIMIT_PARENT)
940			panic("invalid parent");
941		parent->count++;
942		r->parent = parent;
943		rule = parent->rule;
944	}
945
946	r->id = *id;
947	r->expire = time_second + dyn_syn_lifetime;
948	r->rule = rule;
949	r->dyn_type = dyn_type;
950	r->pcnt = r->bcnt = 0;
951	r->count = 0;
952
953	r->bucket = i;
954	r->next = ipfw_dyn_v[i];
955	ipfw_dyn_v[i] = r;
956	dyn_count++;
957	DEB(printf("ipfw: add dyn entry ty %d 0x%08x %d -> 0x%08x %d, total %d\n",
958	   dyn_type,
959	   (r->id.src_ip), (r->id.src_port),
960	   (r->id.dst_ip), (r->id.dst_port),
961	   dyn_count ); )
962	return r;
963}
964
965/**
966 * lookup dynamic parent rule using pkt and rule as search keys.
967 * If the lookup fails, then install one.
968 */
969static ipfw_dyn_rule *
970lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule)
971{
972	ipfw_dyn_rule *q;
973	int i;
974
975	if (ipfw_dyn_v) {
976		i = hash_packet( pkt );
977		for (q = ipfw_dyn_v[i] ; q != NULL ; q=q->next)
978			if (q->dyn_type == O_LIMIT_PARENT &&
979			    rule== q->rule &&
980			    pkt->proto == q->id.proto &&
981			    pkt->src_ip == q->id.src_ip &&
982			    pkt->dst_ip == q->id.dst_ip &&
983			    pkt->src_port == q->id.src_port &&
984			    pkt->dst_port == q->id.dst_port) {
985				q->expire = time_second + dyn_short_lifetime;
986				DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",q);)
987				return q;
988			}
989	}
990	return add_dyn_rule(pkt, O_LIMIT_PARENT, rule);
991}
992
993/**
994 * Install dynamic state for rule type cmd->o.opcode
995 *
996 * Returns 1 (failure) if state is not installed because of errors or because
997 * session limitations are enforced.
998 */
999static int
1000install_state(struct ip_fw *rule, ipfw_insn_limit *cmd,
1001	struct ip_fw_args *args)
1002{
1003	static int last_log;
1004
1005	ipfw_dyn_rule *q;
1006
1007	DEB(printf("ipfw: install state type %d 0x%08x %u -> 0x%08x %u\n",
1008	    cmd->o.opcode,
1009	    (args->f_id.src_ip), (args->f_id.src_port),
1010	    (args->f_id.dst_ip), (args->f_id.dst_port) );)
1011
1012	q = lookup_dyn_rule(&args->f_id, NULL, NULL);
1013
1014	if (q != NULL) { /* should never occur */
1015		if (last_log != time_second) {
1016			last_log = time_second;
1017			printf("ipfw: install_state: entry already present, done\n");
1018		}
1019		return 0;
1020	}
1021
1022	if (dyn_count >= dyn_max)
1023		/*
1024		 * Run out of slots, try to remove any expired rule.
1025		 */
1026		remove_dyn_rule(NULL, (ipfw_dyn_rule *)1);
1027
1028	if (dyn_count >= dyn_max) {
1029		if (last_log != time_second) {
1030			last_log = time_second;
1031			printf("ipfw: install_state: Too many dynamic rules\n");
1032		}
1033		return 1; /* cannot install, notify caller */
1034	}
1035
1036	switch (cmd->o.opcode) {
1037	case O_KEEP_STATE: /* bidir rule */
1038		add_dyn_rule(&args->f_id, O_KEEP_STATE, rule);
1039		break;
1040
1041	case O_LIMIT: /* limit number of sessions */
1042	    {
1043		u_int16_t limit_mask = cmd->limit_mask;
1044		struct ipfw_flow_id id;
1045		ipfw_dyn_rule *parent;
1046
1047		DEB(printf("ipfw: installing dyn-limit rule %d\n",
1048		    cmd->conn_limit);)
1049
1050		id.dst_ip = id.src_ip = 0;
1051		id.dst_port = id.src_port = 0;
1052		id.proto = args->f_id.proto;
1053
1054		if (limit_mask & DYN_SRC_ADDR)
1055			id.src_ip = args->f_id.src_ip;
1056		if (limit_mask & DYN_DST_ADDR)
1057			id.dst_ip = args->f_id.dst_ip;
1058		if (limit_mask & DYN_SRC_PORT)
1059			id.src_port = args->f_id.src_port;
1060		if (limit_mask & DYN_DST_PORT)
1061			id.dst_port = args->f_id.dst_port;
1062		parent = lookup_dyn_parent(&id, rule);
1063		if (parent == NULL) {
1064			printf("ipfw: add parent failed\n");
1065			return 1;
1066		}
1067		if (parent->count >= cmd->conn_limit) {
1068			/*
1069			 * See if we can remove some expired rule.
1070			 */
1071			remove_dyn_rule(rule, parent);
1072			if (parent->count >= cmd->conn_limit) {
1073				if (fw_verbose && last_log != time_second) {
1074					last_log = time_second;
1075					log(LOG_SECURITY | LOG_DEBUG,
1076					    "drop session, too many entries\n");
1077				}
1078				return 1;
1079			}
1080		}
1081		add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent);
1082	    }
1083		break;
1084	default:
1085		printf("ipfw: unknown dynamic rule type %u\n", cmd->o.opcode);
1086		return 1;
1087	}
1088	lookup_dyn_rule(&args->f_id, NULL, NULL); /* XXX just set lifetime */
1089	return 0;
1090}
1091
1092/*
1093 * Transmit a TCP packet, containing either a RST or a keepalive.
1094 * When flags & TH_RST, we are sending a RST packet, because of a
1095 * "reset" action matched the packet.
1096 * Otherwise we are sending a keepalive, and flags & TH_
1097 */
1098static void
1099send_pkt(struct ipfw_flow_id *id, u_int32_t seq, u_int32_t ack, int flags)
1100{
1101	struct mbuf *m;
1102	struct ip *ip;
1103	struct tcphdr *tcp;
1104	struct route sro;	/* fake route */
1105
1106	MGETHDR(m, M_DONTWAIT, MT_HEADER);
1107	if (m == 0)
1108		return;
1109	m->m_pkthdr.rcvif = (struct ifnet *)0;
1110	m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
1111	m->m_data += max_linkhdr;
1112
1113	ip = mtod(m, struct ip *);
1114	bzero(ip, m->m_len);
1115	tcp = (struct tcphdr *)(ip + 1); /* no IP options */
1116	ip->ip_p = IPPROTO_TCP;
1117	tcp->th_off = 5;
1118	/*
1119	 * Assume we are sending a RST (or a keepalive in the reverse
1120	 * direction), swap src and destination addresses and ports.
1121	 */
1122	ip->ip_src.s_addr = htonl(id->dst_ip);
1123	ip->ip_dst.s_addr = htonl(id->src_ip);
1124	tcp->th_sport = htons(id->dst_port);
1125	tcp->th_dport = htons(id->src_port);
1126	if (flags & TH_RST) {	/* we are sending a RST */
1127		if (flags & TH_ACK) {
1128			tcp->th_seq = htonl(ack);
1129			tcp->th_ack = htonl(0);
1130			tcp->th_flags = TH_RST;
1131		} else {
1132			if (flags & TH_SYN)
1133				seq++;
1134			tcp->th_seq = htonl(0);
1135			tcp->th_ack = htonl(seq);
1136			tcp->th_flags = TH_RST | TH_ACK;
1137		}
1138	} else {
1139		/*
1140		 * We are sending a keepalive. flags & TH_SYN determines
1141		 * the direction, forward if set, reverse if clear.
1142		 * NOTE: seq and ack are always assumed to be correct
1143		 * as set by the caller. This may be confusing...
1144		 */
1145		if (flags & TH_SYN) {
1146			/*
1147			 * we have to rewrite the correct addresses!
1148			 */
1149			ip->ip_dst.s_addr = htonl(id->dst_ip);
1150			ip->ip_src.s_addr = htonl(id->src_ip);
1151			tcp->th_dport = htons(id->dst_port);
1152			tcp->th_sport = htons(id->src_port);
1153		}
1154		tcp->th_seq = htonl(seq);
1155		tcp->th_ack = htonl(ack);
1156		tcp->th_flags = TH_ACK;
1157	}
1158	/*
1159	 * set ip_len to the payload size so we can compute
1160	 * the tcp checksum on the pseudoheader
1161	 * XXX check this, could save a couple of words ?
1162	 */
1163	ip->ip_len = htons(sizeof(struct tcphdr));
1164	tcp->th_sum = in_cksum(m, m->m_pkthdr.len);
1165	/*
1166	 * now fill fields left out earlier
1167	 */
1168	ip->ip_ttl = ip_defttl;
1169	ip->ip_len = m->m_pkthdr.len;
1170	bzero (&sro, sizeof (sro));
1171	ip_rtaddr(ip->ip_dst, &sro);
1172	m->m_flags |= M_SKIP_FIREWALL;
1173	ip_output(m, NULL, &sro, 0, NULL, NULL);
1174	if (sro.ro_rt)
1175		RTFREE(sro.ro_rt);
1176}
1177
1178/*
1179 * sends a reject message, consuming the mbuf passed as an argument.
1180 */
1181static void
1182send_reject(struct ip_fw_args *args, int code, int offset, int ip_len)
1183{
1184
1185	if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
1186		/* We need the IP header in host order for icmp_error(). */
1187		if (args->eh != NULL) {
1188			struct ip *ip = mtod(args->m, struct ip *);
1189			ip->ip_len = ntohs(ip->ip_len);
1190			ip->ip_off = ntohs(ip->ip_off);
1191		}
1192		icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
1193	} else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) {
1194		struct tcphdr *const tcp =
1195		    L3HDR(struct tcphdr, mtod(args->m, struct ip *));
1196		if ( (tcp->th_flags & TH_RST) == 0)
1197			send_pkt(&(args->f_id), ntohl(tcp->th_seq),
1198				ntohl(tcp->th_ack),
1199				tcp->th_flags | TH_RST);
1200		m_freem(args->m);
1201	} else
1202		m_freem(args->m);
1203	args->m = NULL;
1204}
1205
1206/**
1207 *
1208 * Given an ip_fw *, lookup_next_rule will return a pointer
1209 * to the next rule, which can be either the jump
1210 * target (for skipto instructions) or the next one in the list (in
1211 * all other cases including a missing jump target).
1212 * The result is also written in the "next_rule" field of the rule.
1213 * Backward jumps are not allowed, so start looking from the next
1214 * rule...
1215 *
1216 * This never returns NULL -- in case we do not have an exact match,
1217 * the next rule is returned. When the ruleset is changed,
1218 * pointers are flushed so we are always correct.
1219 */
1220
1221static struct ip_fw *
1222lookup_next_rule(struct ip_fw *me)
1223{
1224	struct ip_fw *rule = NULL;
1225	ipfw_insn *cmd;
1226
1227	/* look for action, in case it is a skipto */
1228	cmd = ACTION_PTR(me);
1229	if (cmd->opcode == O_LOG)
1230		cmd += F_LEN(cmd);
1231	if ( cmd->opcode == O_SKIPTO )
1232		for (rule = me->next; rule ; rule = rule->next)
1233			if (rule->rulenum >= cmd->arg1)
1234				break;
1235	if (rule == NULL)			/* failure or not a skipto */
1236		rule = me->next;
1237	me->next_rule = rule;
1238	return rule;
1239}
1240
1241/*
1242 * The main check routine for the firewall.
1243 *
1244 * All arguments are in args so we can modify them and return them
1245 * back to the caller.
1246 *
1247 * Parameters:
1248 *
1249 *	args->m	(in/out) The packet; we set to NULL when/if we nuke it.
1250 *		Starts with the IP header.
1251 *	args->eh (in)	Mac header if present, or NULL for layer3 packet.
1252 *	args->oif	Outgoing interface, or NULL if packet is incoming.
1253 *		The incoming interface is in the mbuf. (in)
1254 *	args->divert_rule (in/out)
1255 *		Skip up to the first rule past this rule number;
1256 *		upon return, non-zero port number for divert or tee.
1257 *
1258 *	args->rule	Pointer to the last matching rule (in/out)
1259 *	args->next_hop	Socket we are forwarding to (out).
1260 *	args->f_id	Addresses grabbed from the packet (out)
1261 *
1262 * Return value:
1263 *
1264 *	IP_FW_PORT_DENY_FLAG	the packet must be dropped.
1265 *	0	The packet is to be accepted and routed normally OR
1266 *      	the packet was denied/rejected and has been dropped;
1267 *		in the latter case, *m is equal to NULL upon return.
1268 *	port	Divert the packet to port, with these caveats:
1269 *
1270 *		- If IP_FW_PORT_TEE_FLAG is set, tee the packet instead
1271 *		  of diverting it (ie, 'ipfw tee').
1272 *
1273 *		- If IP_FW_PORT_DYNT_FLAG is set, interpret the lower
1274 *		  16 bits as a dummynet pipe number instead of diverting
 *
 * NOTE(review): in the code below both O_DENY and O_REJECT return
 * IP_FW_PORT_DENY_FLAG (O_REJECT possibly with args->m set to NULL by
 * send_reject()), and a failed m_pullup() also returns
 * IP_FW_PORT_DENY_FLAG with args->m NULL; the "0 ... denied/rejected"
 * wording above looks stale.
1275 */
1276
1277static int
1278ipfw_chk(struct ip_fw_args *args)
1279{
1280	/*
1281	 * Local variables hold state during the processing of a packet.
1282	 *
1283	 * IMPORTANT NOTE: to speed up the processing of rules, there
1284	 * are some assumption on the values of the variables, which
1285	 * are documented here. Should you change them, please check
1286	 * the implementation of the various instructions to make sure
1287	 * that they still work.
1288	 *
1289	 * args->eh	The MAC header. It is non-null for a layer2
1290	 *	packet, it is NULL for a layer-3 packet.
1291	 *
1292	 * m | args->m	Pointer to the mbuf, as received from the caller.
1293	 *	It may change if ipfw_chk() does an m_pullup, or if it
1294	 *	consumes the packet because it calls send_reject().
1295	 *	XXX This has to change, so that ipfw_chk() never modifies
1296	 *	or consumes the buffer.
1297	 * ip	is simply an alias of the value of m, and it is kept
1298	 *	in sync with it (the packet is	supposed to start with
1299	 *	the ip header).
1300	 */
1301	struct mbuf *m = args->m;
1302	struct ip *ip = mtod(m, struct ip *);
1303
1304	/*
1305	 * oif | args->oif	If NULL, ipfw_chk has been called on the
1306	 *	inbound path (ether_input, bdg_forward, ip_input).
1307	 *	If non-NULL, ipfw_chk has been called on the outbound path
1308	 *	(ether_output, ip_output).
1309	 */
1310	struct ifnet *oif = args->oif;
1311
1312	struct ip_fw *f = NULL;		/* matching rule */
1313	int retval = 0;
1314
1315	/*
1316	 * hlen	The length of the IPv4 header.
1317	 *	hlen >0 means we have an IPv4 packet.
1318	 */
1319	u_int hlen = 0;		/* hlen >0 means we have an IP pkt */
1320
1321	/*
1322	 * offset	The offset of a fragment. offset != 0 means that
1323	 *	we have a fragment at this offset of an IPv4 packet.
1324	 *	offset == 0 means that (if this is an IPv4 packet)
1325	 *	this is the first or only fragment.
1326	 */
1327	u_short offset = 0;
1328
1329	/*
1330	 * Local copies of addresses. They are only valid if we have
1331	 * an IP packet.
1332	 *
1333	 * proto	The protocol. Set to 0 for non-ip packets,
1334	 *	or to the protocol read from the packet otherwise.
1335	 *	proto != 0 means that we have an IPv4 packet.
1336	 *
1337	 * src_port, dst_port	port numbers, in HOST format. Only
1338	 *	valid for TCP and UDP packets.
1339	 *
1340	 * src_ip, dst_ip	ip addresses, in NETWORK format.
1341	 *	Only valid for IPv4 packets.
1342	 */
1343	u_int8_t proto;
1344	u_int16_t src_port = 0, dst_port = 0;	/* NOTE: host format	*/
1345	struct in_addr src_ip, dst_ip;		/* NOTE: network format	*/
1346	u_int16_t ip_len=0;
1347	int pktlen;
1348	int dyn_dir = MATCH_UNKNOWN;
1349	ipfw_dyn_rule *q = NULL;
1350
	/* send_pkt() sets this flag on the packets it generates */
1351	if (m->m_flags & M_SKIP_FIREWALL)
1352		return 0;	/* accept */
1353	/*
1354	 * dyn_dir = MATCH_UNKNOWN when rules unchecked,
1355	 * 	MATCH_NONE when checked and not matched (q = NULL),
1356	 *	MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL)
1357	 */
1358
1359	pktlen = m->m_pkthdr.len;
	/*
	 * Treat the packet as IPv4 (hlen > 0) if it comes from layer 3,
	 * or if it is a layer 2 frame of type ETHERTYPE_IP long enough
	 * to hold an IP header.
	 */
1360	if (args->eh == NULL ||		/* layer 3 packet */
1361		( m->m_pkthdr.len >= sizeof(struct ip) &&
1362		    ntohs(args->eh->ether_type) == ETHERTYPE_IP))
1363			hlen = ip->ip_hl << 2;
1364
1365	/*
1366	 * Collect parameters into local variables for faster matching.
1367	 */
1368	if (hlen == 0) {	/* do not grab addresses for non-ip pkts */
1369		proto = args->f_id.proto = 0;	/* mark f_id invalid */
1370		goto after_ip_checks;
1371	}
1372
1373	proto = args->f_id.proto = ip->ip_p;
1374	src_ip = ip->ip_src;
1375	dst_ip = ip->ip_dst;
1376	if (args->eh != NULL) { /* layer 2 packets are as on the wire */
1377		offset = ntohs(ip->ip_off) & IP_OFFMASK;
1378		ip_len = ntohs(ip->ip_len);
1379	} else {
1380		offset = ip->ip_off & IP_OFFMASK;
1381		ip_len = ip->ip_len;
1382	}
	/* for the counters use the smaller of ip_len and the mbuf length */
1383	pktlen = ip_len < pktlen ? ip_len : pktlen;
1384
/*
 * PULLUP_TO(len) makes sure the first mbuf holds at least len bytes.
 * m_pullup() may return a different mbuf, so m, args->m and ip are all
 * refreshed; on failure args->m is left NULL and we bail out.
 */
1385#define PULLUP_TO(len)						\
1386		do {						\
1387			if ((m)->m_len < (len)) {		\
1388			    args->m = m = m_pullup(m, (len));	\
1389			    if (m == 0)				\
1390				goto pullup_failed;		\
1391			    ip = mtod(m, struct ip *);		\
1392			}					\
1393		} while (0)
1394
	/* only the first (or only) fragment carries the transport header */
1395	if (offset == 0) {
1396		switch (proto) {
1397		case IPPROTO_TCP:
1398		    {
1399			struct tcphdr *tcp;
1400
1401			PULLUP_TO(hlen + sizeof(struct tcphdr));
1402			tcp = L3HDR(struct tcphdr, ip);
1403			dst_port = tcp->th_dport;
1404			src_port = tcp->th_sport;
1405			args->f_id.flags = tcp->th_flags;
1406			}
1407			break;
1408
1409		case IPPROTO_UDP:
1410		    {
1411			struct udphdr *udp;
1412
1413			PULLUP_TO(hlen + sizeof(struct udphdr));
1414			udp = L3HDR(struct udphdr, ip);
1415			dst_port = udp->uh_dport;
1416			src_port = udp->uh_sport;
1417			}
1418			break;
1419
1420		case IPPROTO_ICMP:
1421			PULLUP_TO(hlen + 4);	/* type, code and checksum. */
1422			args->f_id.flags = L3HDR(struct icmp, ip)->icmp_type;
1423			break;
1424
1425		default:
1426			break;
1427		}
1428#undef PULLUP_TO
1429	}
1430
	/* f_id and the local port copies are kept in host byte order */
1431	args->f_id.src_ip = ntohl(src_ip.s_addr);
1432	args->f_id.dst_ip = ntohl(dst_ip.s_addr);
1433	args->f_id.src_port = src_port = ntohs(src_port);
1434	args->f_id.dst_port = dst_port = ntohs(dst_port);
1435
1436after_ip_checks:
1437	if (args->rule) {
1438		/*
1439		 * Packet has already been tagged. Look for the next rule
1440		 * to restart processing.
1441		 *
1442		 * If fw_one_pass != 0 then just accept it.
1443		 * XXX should not happen here, but optimized out in
1444		 * the caller.
1445		 */
1446		if (fw_one_pass)
1447			return 0;
1448
1449		f = args->rule->next_rule;
1450		if (f == NULL)
1451			f = lookup_next_rule(args->rule);
1452	} else {
1453		/*
1454		 * Find the starting rule. It can be either the first
1455		 * one, or the one after divert_rule if asked so.
1456		 */
1457		int skipto = args->divert_rule;
1458
1459		f = layer3_chain;
1460		if (args->eh == NULL && skipto != 0) {
1461			if (skipto >= IPFW_DEFAULT_RULE)
1462				return(IP_FW_PORT_DENY_FLAG); /* invalid */
1463			while (f && f->rulenum <= skipto)
1464				f = f->next;
1465			if (f == NULL)	/* drop packet */
1466				return(IP_FW_PORT_DENY_FLAG);
1467		}
1468	}
1469	args->divert_rule = 0;	/* reset to avoid confusion later */
1470
1471	/*
1472	 * Now scan the rules, and parse microinstructions for each rule.
1473	 */
1474	for (; f; f = f->next) {
1475		int l, cmdlen;
1476		ipfw_insn *cmd;
1477		int skip_or; /* skip rest of OR block */
1478
		/*
		 * 'again' is also the landing point of O_SKIPTO, which
		 * sets f to the jump target and comes back here.
		 */
1479again:
1480		if (set_disable & (1 << f->set) )
1481			continue;
1482
1483		skip_or = 0;
1484		for (l = f->cmd_len, cmd = f->cmd ; l > 0 ;
1485		    l -= cmdlen, cmd += cmdlen) {
1486			int match;
1487
1488			/*
1489			 * check_body is a jump target used when we find a
1490			 * CHECK_STATE, and need to jump to the body of
1491			 * the target rule.
1492			 */
1493
1494check_body:
1495			cmdlen = F_LEN(cmd);
1496			/*
1497			 * An OR block (insn_1 || .. || insn_n) has the
1498			 * F_OR bit set in all but the last instruction.
1499			 * The first match will set "skip_or", and cause
1500			 * the following instructions to be skipped until
1501			 * past the one with the F_OR bit clear.
1502			 */
1503			if (skip_or) {		/* skip this instruction */
1504				if ((cmd->len & F_OR) == 0)
1505					skip_or = 0;	/* next one is good */
1506				continue;
1507			}
1508			match = 0; /* set to 1 if we succeed */
1509
1510			switch (cmd->opcode) {
1511			/*
1512			 * The first set of opcodes compares the packet's
1513			 * fields with some pattern, setting 'match' if a
1514			 * match is found. At the end of the loop there is
1515			 * logic to deal with F_NOT and F_OR flags associated
1516			 * with the opcode.
1517			 */
1518			case O_NOP:
1519				match = 1;
1520				break;
1521
1522			case O_FORWARD_MAC:
1523				printf("ipfw: opcode %d unimplemented\n",
1524				    cmd->opcode);
1525				break;
1526
1527			case O_GID:
1528			case O_UID:
1529				/*
1530				 * We only check offset == 0 && proto != 0,
1531				 * as this ensures that we have an IPv4
1532				 * packet with the ports info.
1533				 */
1534				if (offset!=0)
1535					break;
1536			    {
1537				struct inpcbinfo *pi;
1538				int wildcard;
1539				struct inpcb *pcb;
1540
1541				if (proto == IPPROTO_TCP) {
1542					wildcard = 0;
1543					pi = &tcbinfo;
1544				} else if (proto == IPPROTO_UDP) {
1545					wildcard = 1;
1546					pi = &udbinfo;
1547				} else
1548					break;
1549
				/*
				 * The address/port pairs are passed in
				 * opposite order for outgoing (oif set)
				 * and incoming packets.
				 */
1550				pcb =  (oif) ?
1551					in_pcblookup_hash(pi,
1552					    dst_ip, htons(dst_port),
1553					    src_ip, htons(src_port),
1554					    wildcard, oif) :
1555					in_pcblookup_hash(pi,
1556					    src_ip, htons(src_port),
1557					    dst_ip, htons(dst_port),
1558					    wildcard, NULL);
1559
1560				if (pcb == NULL || pcb->inp_socket == NULL)
1561					break;
1562#if __FreeBSD_version < 500034
1563#define socheckuid(a,b)	((a)->so_cred->cr_uid != (b))
1564#endif
1565				if (cmd->opcode == O_UID) {
1566					match =
1567					  !socheckuid(pcb->inp_socket,
1568					   (uid_t)((ipfw_insn_u32 *)cmd)->d[0]);
1569				} else  {
1570					match = groupmember(
1571					    (uid_t)((ipfw_insn_u32 *)cmd)->d[0],
1572					    pcb->inp_socket->so_cred);
1573				}
1574			    }
1575				break;
1576
1577			case O_RECV:
1578				match = iface_match(m->m_pkthdr.rcvif,
1579				    (ipfw_insn_if *)cmd);
1580				break;
1581
1582			case O_XMIT:
1583				match = iface_match(oif, (ipfw_insn_if *)cmd);
1584				break;
1585
1586			case O_VIA:
1587				match = iface_match(oif ? oif :
1588				    m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
1589				break;
1590
1591			case O_MACADDR2:
1592				if (args->eh != NULL) {	/* have MAC header */
					/*
					 * Compare the first 12 bytes of
					 * the MAC header, three 32-bit
					 * words at a time, under the mask.
					 */
1593					u_int32_t *want = (u_int32_t *)
1594						((ipfw_insn_mac *)cmd)->addr;
1595					u_int32_t *mask = (u_int32_t *)
1596						((ipfw_insn_mac *)cmd)->mask;
1597					u_int32_t *hdr = (u_int32_t *)args->eh;
1598
1599					match =
1600					    ( want[0] == (hdr[0] & mask[0]) &&
1601					      want[1] == (hdr[1] & mask[1]) &&
1602					      want[2] == (hdr[2] & mask[2]) );
1603				}
1604				break;
1605
1606			case O_MAC_TYPE:
1607				if (args->eh != NULL) {
1608					u_int16_t t =
1609					    ntohs(args->eh->ether_type);
1610					u_int16_t *p =
1611					    ((ipfw_insn_u16 *)cmd)->ports;
1612					int i;
1613
					/* each step checks one [lo, hi] pair */
1614					for (i = cmdlen - 1; !match && i>0;
1615					    i--, p += 2)
1616						match = (t>=p[0] && t<=p[1]);
1617				}
1618				break;
1619
1620			case O_FRAG:
1621				match = (hlen > 0 && offset != 0);
1622				break;
1623
1624			case O_IN:	/* "out" is "not in" */
1625				match = (oif == NULL);
1626				break;
1627
1628			case O_LAYER2:
1629				match = (args->eh != NULL);
1630				break;
1631
1632			case O_PROTO:
1633				/*
1634				 * We do not allow an arg of 0 so the
1635				 * check of "proto" only suffices.
1636				 */
1637				match = (proto == cmd->arg1);
1638				break;
1639
1640			case O_IP_SRC:
1641				match = (hlen > 0 &&
1642				    ((ipfw_insn_ip *)cmd)->addr.s_addr ==
1643				    src_ip.s_addr);
1644				break;
1645
1646			case O_IP_SRC_MASK:
1647				match = (hlen > 0 &&
1648				    ((ipfw_insn_ip *)cmd)->addr.s_addr ==
1649				     (src_ip.s_addr &
1650				     ((ipfw_insn_ip *)cmd)->mask.s_addr));
1651				break;
1652
1653			case O_IP_SRC_ME:
1654				if (hlen > 0) {
1655					struct ifnet *tif;
1656
1657					INADDR_TO_IFP(src_ip, tif);
1658					match = (tif != NULL);
1659				}
1660				break;
1661
1662			case O_IP_DST_SET:
1663			case O_IP_SRC_SET:
1664				if (hlen > 0) {
					/*
					 * d[0] is the base address (host
					 * order, like f_id), arg1 the number
					 * of addresses in the set, and the
					 * words from d[1] on are a bitmap
					 * indexed by (addr - base).
					 */
1665					u_int32_t *d = (u_int32_t *)(cmd+1);
1666					u_int32_t addr =
1667					    cmd->opcode == O_IP_DST_SET ?
1668						args->f_id.dst_ip :
1669						args->f_id.src_ip;
1670
1671					    if (addr < d[0])
1672						    break;
1673					    addr -= d[0]; /* subtract base */
1674					    match = (addr < cmd->arg1) &&
1675						( d[ 1 + (addr>>5)] &
1676						  (1<<(addr & 0x1f)) );
1677				}
1678				break;
1679
1680			case O_IP_DST:
1681				match = (hlen > 0 &&
1682				    ((ipfw_insn_ip *)cmd)->addr.s_addr ==
1683				    dst_ip.s_addr);
1684				break;
1685
1686			case O_IP_DST_MASK:
1687				match = (hlen > 0) &&
1688				    (((ipfw_insn_ip *)cmd)->addr.s_addr ==
1689				     (dst_ip.s_addr &
1690				     ((ipfw_insn_ip *)cmd)->mask.s_addr));
1691				break;
1692
1693			case O_IP_DST_ME:
1694				if (hlen > 0) {
1695					struct ifnet *tif;
1696
1697					INADDR_TO_IFP(dst_ip, tif);
1698					match = (tif != NULL);
1699				}
1700				break;
1701
1702			case O_IP_SRCPORT:
1703			case O_IP_DSTPORT:
1704				/*
1705				 * offset == 0 && proto != 0 is enough
1706				 * to guarantee that we have an IPv4
1707				 * packet with port info.
1708				 */
1709				if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP)
1710				    && offset == 0) {
1711					u_int16_t x =
1712					    (cmd->opcode == O_IP_SRCPORT) ?
1713						src_port : dst_port ;
1714					u_int16_t *p =
1715					    ((ipfw_insn_u16 *)cmd)->ports;
1716					int i;
1717
					/* each step checks one [lo, hi] pair */
1718					for (i = cmdlen - 1; !match && i>0;
1719					    i--, p += 2)
1720						match = (x>=p[0] && x<=p[1]);
1721				}
1722				break;
1723
1724			case O_ICMPTYPE:
1725				match = (offset == 0 && proto==IPPROTO_ICMP &&
1726				    icmptype_match(ip, (ipfw_insn_u32 *)cmd) );
1727				break;
1728
1729			case O_IPOPT:
1730				match = (hlen > 0 && ipopts_match(ip, cmd) );
1731				break;
1732
1733			case O_IPVER:
1734				match = (hlen > 0 && cmd->arg1 == ip->ip_v);
1735				break;
1736
1737			case O_IPID:
1738			case O_IPLEN:
1739			case O_IPTTL:
1740				if (hlen > 0) {	/* only for IP packets */
1741				    uint16_t x;
1742				    uint16_t *p;
1743				    int i;
1744
1745				    if (cmd->opcode == O_IPLEN)
1746					x = ip_len;
1747				    else if (cmd->opcode == O_IPTTL)
1748					x = ip->ip_ttl;
1749				    else /* must be IPID */
1750					x = ntohs(ip->ip_id);
				    /* a one-word instruction holds a single value in arg1 */
1751				    if (cmdlen == 1) {
1752					match = (cmd->arg1 == x);
1753					break;
1754				    }
1755				    /* otherwise we have ranges */
1756				    p = ((ipfw_insn_u16 *)cmd)->ports;
1757				    i = cmdlen - 1;
1758				    for (; !match && i>0; i--, p += 2)
1759					match = (x >= p[0] && x <= p[1]);
1760				}
1761				break;
1762
1763			case O_IPPRECEDENCE:
1764				match = (hlen > 0 &&
1765				    (cmd->arg1 == (ip->ip_tos & 0xe0)) );
1766				break;
1767
1768			case O_IPTOS:
1769				match = (hlen > 0 &&
1770				    flags_match(cmd, ip->ip_tos));
1771				break;
1772
1773			case O_TCPFLAGS:
1774				match = (proto == IPPROTO_TCP && offset == 0 &&
1775				    flags_match(cmd,
1776					L3HDR(struct tcphdr,ip)->th_flags));
1777				break;
1778
1779			case O_TCPOPTS:
1780				match = (proto == IPPROTO_TCP && offset == 0 &&
1781				    tcpopts_match(ip, cmd));
1782				break;
1783
			/*
			 * O_TCPSEQ, O_TCPACK and O_TCPWIN compare the
			 * instruction argument with the raw header field,
			 * without any byte-order conversion here.
			 */
1784			case O_TCPSEQ:
1785				match = (proto == IPPROTO_TCP && offset == 0 &&
1786				    ((ipfw_insn_u32 *)cmd)->d[0] ==
1787					L3HDR(struct tcphdr,ip)->th_seq);
1788				break;
1789
1790			case O_TCPACK:
1791				match = (proto == IPPROTO_TCP && offset == 0 &&
1792				    ((ipfw_insn_u32 *)cmd)->d[0] ==
1793					L3HDR(struct tcphdr,ip)->th_ack);
1794				break;
1795
1796			case O_TCPWIN:
1797				match = (proto == IPPROTO_TCP && offset == 0 &&
1798				    cmd->arg1 ==
1799					L3HDR(struct tcphdr,ip)->th_win);
1800				break;
1801
1802			case O_ESTAB:
1803				/* reject packets which have SYN only */
1804				/* XXX should i also check for TH_ACK ? */
1805				match = (proto == IPPROTO_TCP && offset == 0 &&
1806				    (L3HDR(struct tcphdr,ip)->th_flags &
1807				     (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
1808				break;
1809
1810			case O_LOG:
1811				if (fw_verbose)
1812					ipfw_log(f, hlen, args->eh, m, oif);
1813				match = 1;
1814				break;
1815
1816			case O_PROB:
1817				match = (random()<((ipfw_insn_u32 *)cmd)->d[0]);
1818				break;
1819
1820			case O_VERREVPATH:
1821				/* Outgoing packets automatically pass/match */
1822				match = ((oif != NULL) ||
1823				    (m->m_pkthdr.rcvif == NULL) ||
1824				    verify_rev_path(src_ip, m->m_pkthdr.rcvif));
1825				break;
1826
1827			case O_IPSEC:
1828#ifdef FAST_IPSEC
1829				match = (m_tag_find(m,
1830				    PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL);
1831#endif
1832#ifdef IPSEC
1833				match = (ipsec_gethist(m, NULL) != NULL);
1834#endif
1835				/* otherwise no match */
1836				break;
1837
1838			/*
1839			 * The second set of opcodes represents 'actions',
1840			 * i.e. the terminal part of a rule once the packet
1841			 * matches all previous patterns.
1842			 * Typically there is only one action for each rule,
1843			 * and the opcode is stored at the end of the rule
1844			 * (but there are exceptions -- see below).
1845			 *
1846			 * In general, here we set retval and terminate the
1847			 * outer loop (would be a 'break 3' in some language,
1848			 * but we need to do a 'goto done').
1849			 *
1850			 * Exceptions:
1851			 * O_COUNT and O_SKIPTO actions:
1852			 *   instead of terminating, we jump to the next rule
1853			 *   ('goto next_rule', equivalent to a 'break 2'),
1854			 *   or to the SKIPTO target ('goto again' after
1855			 *   having set f, cmd and l), respectively.
1856			 *
1857			 * O_LIMIT and O_KEEP_STATE: these opcodes are
1858			 *   not real 'actions', and are stored right
1859			 *   before the 'action' part of the rule.
1860			 *   These opcodes try to install an entry in the
1861			 *   state tables; if successful, we continue with
1862			 *   the next opcode (match=1; break;), otherwise
1863			 *   the packet must be dropped
1864			 *   ('goto done' after setting retval);
1865			 *
1866			 * O_PROBE_STATE and O_CHECK_STATE: these opcodes
1867			 *   cause a lookup of the state table, and a jump
1868			 *   to the 'action' part of the parent rule
1869			 *   ('goto check_body') if an entry is found, or
1870			 *   (CHECK_STATE only) a jump to the next rule if
1871			 *   the entry is not found ('goto next_rule').
1872			 *   The result of the lookup is cached so that
1873			 *   further instances of these opcodes are
1874			 *   effectively NOPs.
1875			 */
1876			case O_LIMIT:
1877			case O_KEEP_STATE:
1878				if (install_state(f,
1879				    (ipfw_insn_limit *)cmd, args)) {
1880					retval = IP_FW_PORT_DENY_FLAG;
1881					goto done; /* error/limit violation */
1882				}
1883				match = 1;
1884				break;
1885
1886			case O_PROBE_STATE:
1887			case O_CHECK_STATE:
1888				/*
1889				 * dynamic rules are checked at the first
1890				 * keep-state or check-state occurrence,
1891				 * with the result being stored in dyn_dir.
1892				 * The compiler introduces a PROBE_STATE
1893				 * instruction for us when we have a
1894				 * KEEP_STATE (because PROBE_STATE needs
1895				 * to be run first).
1896				 */
1897				if (dyn_dir == MATCH_UNKNOWN &&
1898				    (q = lookup_dyn_rule(&args->f_id,
1899				     &dyn_dir, proto == IPPROTO_TCP ?
1900					L3HDR(struct tcphdr, ip) : NULL))
1901					!= NULL) {
1902					/*
1903					 * Found dynamic entry, update stats
1904					 * and jump to the 'action' part of
1905					 * the parent rule.
1906					 */
1907					q->pcnt++;
1908					q->bcnt += pktlen;
1909					f = q->rule;
1910					cmd = ACTION_PTR(f);
1911					l = f->cmd_len - f->act_ofs;
1912					goto check_body;
1913				}
1914				/*
1915				 * Dynamic entry not found. If CHECK_STATE,
1916				 * skip to next rule, if PROBE_STATE just
1917				 * ignore and continue with next opcode.
1918				 */
1919				if (cmd->opcode == O_CHECK_STATE)
1920					goto next_rule;
1921				match = 1;
1922				break;
1923
1924			case O_ACCEPT:
1925				retval = 0;	/* accept */
1926				goto done;
1927
1928			case O_PIPE:
1929			case O_QUEUE:
1930				args->rule = f; /* report matching rule */
1931				retval = cmd->arg1 | IP_FW_PORT_DYNT_FLAG;
1932				goto done;
1933
1934			case O_DIVERT:
1935			case O_TEE:
1936				if (args->eh) /* not on layer 2 */
1937					break;
1938				args->divert_rule = f->rulenum;
1939				retval = (cmd->opcode == O_DIVERT) ?
1940				    cmd->arg1 :
1941				    cmd->arg1 | IP_FW_PORT_TEE_FLAG;
1942				goto done;
1943
1944			case O_COUNT:
1945			case O_SKIPTO:
				/*
				 * These two do not go through 'done', so
				 * the rule counters are updated here.
				 */
1946				f->pcnt++;	/* update stats */
1947				f->bcnt += pktlen;
1948				f->timestamp = time_second;
1949				if (cmd->opcode == O_COUNT)
1950					goto next_rule;
1951				/* handle skipto */
1952				if (f->next_rule == NULL)
1953					lookup_next_rule(f);
1954				f = f->next_rule;
1955				goto again;
1956
1957			case O_REJECT:
1958				/*
1959				 * Drop the packet and send a reject notice
1960				 * if the packet is not ICMP (or is an ICMP
1961				 * query), and it is not multicast/broadcast.
1962				 */
1963				if (hlen > 0 &&
1964				    (proto != IPPROTO_ICMP ||
1965				     is_icmp_query(ip)) &&
1966				    !(m->m_flags & (M_BCAST|M_MCAST)) &&
1967				    !IN_MULTICAST(dst_ip.s_addr)) {
1968					send_reject(args, cmd->arg1,
1969					    offset,ip_len);
					/* send_reject() sets args->m to NULL */
1970					m = args->m;
1971				}
1972				/* FALLTHROUGH */
1973			case O_DENY:
1974				retval = IP_FW_PORT_DENY_FLAG;
1975				goto done;
1976
1977			case O_FORWARD_IP:
1978				if (args->eh)	/* not valid on layer2 pkts */
1979					break;
				/*
				 * Set the next hop unless we got here
				 * through a dynamic entry matched in a
				 * direction other than MATCH_FORWARD.
				 */
1980				if (!q || dyn_dir == MATCH_FORWARD)
1981					args->next_hop =
1982					    &((ipfw_insn_sa *)cmd)->sa;
1983				retval = 0;
1984				goto done;
1985
1986			default:
1987				panic("-- unknown opcode %d\n", cmd->opcode);
1988			} /* end of switch() on opcodes */
1989
			/* F_NOT inverts the outcome of this instruction */
1990			if (cmd->len & F_NOT)
1991				match = !match;
1992
1993			if (match) {
1994				if (cmd->len & F_OR)
1995					skip_or = 1;
1996			} else {
1997				if (!(cmd->len & F_OR)) /* not an OR block, */
1998					break;		/* try next rule    */
1999			}
2000
2001		}	/* end of inner for, scan opcodes */
2002
2003next_rule:;		/* try next rule		*/
2004
2005	}		/* end of outer for, scan rules */
2006	printf("ipfw: ouch!, skip past end of rules, denying packet\n");
2007	return(IP_FW_PORT_DENY_FLAG);
2008
	/* common exit for terminal actions: account the packet to rule f */
2009done:
2010	/* Update statistics */
2011	f->pcnt++;
2012	f->bcnt += pktlen;
2013	f->timestamp = time_second;
2014	return retval;
2015
	/* reached from PULLUP_TO; args->m has already been set to NULL */
2016pullup_failed:
2017	if (fw_verbose)
2018		printf("ipfw: pullup failed\n");
2019	return(IP_FW_PORT_DENY_FLAG);
2020}
2021
2022/*
2023 * When a rule is added/deleted, clear the next_rule pointers in all rules.
2024 * These will be reconstructed on the fly as packets are matched.
2025 * Must be called at splimp().
2026 */
2027static void
2028flush_rule_ptrs(void)
2029{
2030	struct ip_fw *rule;
2031
2032	for (rule = layer3_chain; rule; rule = rule->next)
2033		rule->next_rule = NULL;
2034}
2035
2036/*
2037 * When pipes/queues are deleted, clear the "pipe_ptr" pointer to a given
2038 * pipe/queue, or to all of them (match == NULL).
2039 * Must be called at splimp().
2040 */
2041void
2042flush_pipe_ptrs(struct dn_flow_set *match)
2043{
2044	struct ip_fw *rule;
2045
2046	for (rule = layer3_chain; rule; rule = rule->next) {
2047		ipfw_insn_pipe *cmd = (ipfw_insn_pipe *)ACTION_PTR(rule);
2048
2049		if (cmd->o.opcode != O_PIPE && cmd->o.opcode != O_QUEUE)
2050			continue;
2051		/*
2052		 * XXX Use bcmp/bzero to handle pipe_ptr to overcome
2053		 * possible alignment problems on 64-bit architectures.
2054		 * This code is seldom used so we do not worry too
2055		 * much about efficiency.
2056		 */
2057		if (match == NULL ||
2058		    !bcmp(&cmd->pipe_ptr, &match, sizeof(match)) )
2059			bzero(&cmd->pipe_ptr, sizeof(cmd->pipe_ptr));
2060	}
2061}
2062
2063/*
2064 * Add a new rule to the list. Copy the rule into a malloc'ed area, then
2065 * possibly create a rule number and add the rule to the list.
2066 * Update the rule_number in the input struct so the caller knows it as well.
2067 */
2068static int
2069add_rule(struct ip_fw **head, struct ip_fw *input_rule)
2070{
2071	struct ip_fw *rule, *f, *prev;
2072	int s;
2073	int l = RULESIZE(input_rule);
2074
2075	if (*head == NULL && input_rule->rulenum != IPFW_DEFAULT_RULE)
2076		return (EINVAL);
2077
2078	rule = malloc(l, M_IPFW, M_NOWAIT | M_ZERO);
2079	if (rule == NULL)
2080		return (ENOSPC);
2081
2082	bcopy(input_rule, rule, l);
2083
2084	rule->next = NULL;
2085	rule->next_rule = NULL;
2086
2087	rule->pcnt = 0;
2088	rule->bcnt = 0;
2089	rule->timestamp = 0;
2090
2091	s = splimp();
2092
2093	if (*head == NULL) {	/* default rule */
2094		*head = rule;
2095		goto done;
2096        }
2097
2098	/*
2099	 * If rulenum is 0, find highest numbered rule before the
2100	 * default rule, and add autoinc_step
2101	 */
2102	if (autoinc_step < 1)
2103		autoinc_step = 1;
2104	else if (autoinc_step > 1000)
2105		autoinc_step = 1000;
2106	if (rule->rulenum == 0) {
2107		/*
2108		 * locate the highest numbered rule before default
2109		 */
2110		for (f = *head; f; f = f->next) {
2111			if (f->rulenum == IPFW_DEFAULT_RULE)
2112				break;
2113			rule->rulenum = f->rulenum;
2114		}
2115		if (rule->rulenum < IPFW_DEFAULT_RULE - autoinc_step)
2116			rule->rulenum += autoinc_step;
2117		input_rule->rulenum = rule->rulenum;
2118	}
2119
2120	/*
2121	 * Now insert the new rule in the right place in the sorted list.
2122	 */
2123	for (prev = NULL, f = *head; f; prev = f, f = f->next) {
2124		if (f->rulenum > rule->rulenum) { /* found the location */
2125			if (prev) {
2126				rule->next = f;
2127				prev->next = rule;
2128			} else { /* head insert */
2129				rule->next = *head;
2130				*head = rule;
2131			}
2132			break;
2133		}
2134	}
2135	flush_rule_ptrs();
2136done:
2137	static_count++;
2138	static_len += l;
2139	splx(s);
2140	DEB(printf("ipfw: installed rule %d, static count now %d\n",
2141		rule->rulenum, static_count);)
2142	return (0);
2143}
2144
2145/**
2146 * Free storage associated with a static rule (including derived
2147 * dynamic rules).
2148 * The caller is in charge of clearing rule pointers to avoid
2149 * dangling pointers.
2150 * @return a pointer to the next entry.
2151 * Arguments are not checked, so they better be correct.
2152 * Must be called at splimp().
2153 */
2154static struct ip_fw *
2155delete_rule(struct ip_fw **head, struct ip_fw *prev, struct ip_fw *rule)
2156{
2157	struct ip_fw *n;
2158	int l = RULESIZE(rule);
2159
2160	n = rule->next;
2161	remove_dyn_rule(rule, NULL /* force removal */);
2162	if (prev == NULL)
2163		*head = n;
2164	else
2165		prev->next = n;
2166	static_count--;
2167	static_len -= l;
2168
2169	if (DUMMYNET_LOADED)
2170		ip_dn_ruledel_ptr(rule);
2171	free(rule, M_IPFW);
2172	return n;
2173}
2174
2175/*
2176 * Deletes all rules from a chain (including the default rule
2177 * if the second argument is set).
2178 * Must be called at splimp().
2179 */
2180static void
2181free_chain(struct ip_fw **chain, int kill_default)
2182{
2183	struct ip_fw *rule;
2184
2185	flush_rule_ptrs(); /* more efficient to do outside the loop */
2186
2187	while ( (rule = *chain) != NULL &&
2188	    (kill_default || rule->rulenum != IPFW_DEFAULT_RULE) )
2189		delete_rule(chain, NULL, rule);
2190}
2191
2192/**
2193 * Remove all rules with given number, and also do set manipulation.
2194 *
2195 * The argument is an u_int32_t. The low 16 bit are the rule or set number,
2196 * the next 8 bits are the new set, the top 8 bits are the command:
2197 *
2198 *	0	delete rules with given number
2199 *	1	delete rules with given set number
2200 *	2	move rules with given number to new set
2201 *	3	move rules with given set number to new set
2202 *	4	swap sets with given numbers
2203 */
2204static int
2205del_entry(struct ip_fw **chain, u_int32_t arg)
2206{
2207	struct ip_fw *prev, *rule;
2208	int s;
2209	u_int16_t rulenum;
2210	u_int8_t cmd, new_set;
2211
2212	rulenum = arg & 0xffff;
2213	cmd = (arg >> 24) & 0xff;
2214	new_set = (arg >> 16) & 0xff;
2215
2216	if (cmd > 4)
2217		return EINVAL;
2218	if (new_set > 30)
2219		return EINVAL;
2220	if (cmd == 0 || cmd == 2) {
2221		if (rulenum == IPFW_DEFAULT_RULE)
2222			return EINVAL;
2223	} else {
2224		if (rulenum > 30)
2225			return EINVAL;
2226	}
2227
2228	switch (cmd) {
2229	case 0:	/* delete rules with given number */
2230		/*
2231		 * locate first rule to delete
2232		 */
2233		for (prev = NULL, rule = *chain;
2234		    rule && rule->rulenum < rulenum;
2235		     prev = rule, rule = rule->next)
2236			;
2237		if (rule->rulenum != rulenum)
2238			return EINVAL;
2239
2240		s = splimp(); /* no access to rules while removing */
2241		/*
2242		 * flush pointers outside the loop, then delete all matching
2243		 * rules. prev remains the same throughout the cycle.
2244		 */
2245		flush_rule_ptrs();
2246		while (rule && rule->rulenum == rulenum)
2247			rule = delete_rule(chain, prev, rule);
2248		splx(s);
2249		break;
2250
2251	case 1:	/* delete all rules with given set number */
2252		s = splimp();
2253		flush_rule_ptrs();
2254		for (prev = NULL, rule = *chain; rule ; )
2255			if (rule->set == rulenum)
2256				rule = delete_rule(chain, prev, rule);
2257			else {
2258				prev = rule;
2259				rule = rule->next;
2260			}
2261		splx(s);
2262		break;
2263
2264	case 2:	/* move rules with given number to new set */
2265		s = splimp();
2266		for (rule = *chain; rule ; rule = rule->next)
2267			if (rule->rulenum == rulenum)
2268				rule->set = new_set;
2269		splx(s);
2270		break;
2271
2272	case 3: /* move rules with given set number to new set */
2273		s = splimp();
2274		for (rule = *chain; rule ; rule = rule->next)
2275			if (rule->set == rulenum)
2276				rule->set = new_set;
2277		splx(s);
2278		break;
2279
2280	case 4: /* swap two sets */
2281		s = splimp();
2282		for (rule = *chain; rule ; rule = rule->next)
2283			if (rule->set == rulenum)
2284				rule->set = new_set;
2285			else if (rule->set == new_set)
2286				rule->set = rulenum;
2287		splx(s);
2288		break;
2289	}
2290	return 0;
2291}
2292
2293/*
2294 * Clear counters for a specific rule.
2295 */
2296static void
2297clear_counters(struct ip_fw *rule, int log_only)
2298{
2299	ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
2300
2301	if (log_only == 0) {
2302		rule->bcnt = rule->pcnt = 0;
2303		rule->timestamp = 0;
2304	}
2305	if (l->o.opcode == O_LOG)
2306		l->log_left = l->max_log;
2307}
2308
2309/**
2310 * Reset some or all counters on firewall rules.
2311 * @arg frwl is null to clear all entries, or contains a specific
2312 * rule number.
2313 * @arg log_only is 1 if we only want to reset logs, zero otherwise.
2314 */
2315static int
2316zero_entry(int rulenum, int log_only)
2317{
2318	struct ip_fw *rule;
2319	int s;
2320	char *msg;
2321
2322	if (rulenum == 0) {
2323		s = splimp();
2324		norule_counter = 0;
2325		for (rule = layer3_chain; rule; rule = rule->next)
2326			clear_counters(rule, log_only);
2327		splx(s);
2328		msg = log_only ? "ipfw: All logging counts reset.\n" :
2329				"ipfw: Accounting cleared.\n";
2330	} else {
2331		int cleared = 0;
2332		/*
2333		 * We can have multiple rules with the same number, so we
2334		 * need to clear them all.
2335		 */
2336		for (rule = layer3_chain; rule; rule = rule->next)
2337			if (rule->rulenum == rulenum) {
2338				s = splimp();
2339				while (rule && rule->rulenum == rulenum) {
2340					clear_counters(rule, log_only);
2341					rule = rule->next;
2342				}
2343				splx(s);
2344				cleared = 1;
2345				break;
2346			}
2347		if (!cleared)	/* we did not find any matching rules */
2348			return (EINVAL);
2349		msg = log_only ? "ipfw: Entry %d logging count reset.\n" :
2350				"ipfw: Entry %d cleared.\n";
2351	}
2352	if (fw_verbose)
2353		log(LOG_SECURITY | LOG_NOTICE, msg, rulenum);
2354	return (0);
2355}
2356
2357/*
2358 * Check validity of the structure before insert.
2359 * Fortunately rules are simple, so this mostly need to check rule sizes.
2360 */
2361static int
2362check_ipfw_struct(struct ip_fw *rule, int size)
2363{
2364	int l, cmdlen = 0;
2365	int have_action=0;
2366	ipfw_insn *cmd;
2367
2368	if (size < sizeof(*rule)) {
2369		printf("ipfw: rule too short\n");
2370		return (EINVAL);
2371	}
2372	/* first, check for valid size */
2373	l = RULESIZE(rule);
2374	if (l != size) {
2375		printf("ipfw: size mismatch (have %d want %d)\n", size, l);
2376		return (EINVAL);
2377	}
2378	/*
2379	 * Now go for the individual checks. Very simple ones, basically only
2380	 * instruction sizes.
2381	 */
2382	for (l = rule->cmd_len, cmd = rule->cmd ;
2383			l > 0 ; l -= cmdlen, cmd += cmdlen) {
2384		cmdlen = F_LEN(cmd);
2385		if (cmdlen > l) {
2386			printf("ipfw: opcode %d size truncated\n",
2387			    cmd->opcode);
2388			return EINVAL;
2389		}
2390		DEB(printf("ipfw: opcode %d\n", cmd->opcode);)
2391		switch (cmd->opcode) {
2392		case O_NOP:
2393		case O_PROBE_STATE:
2394		case O_KEEP_STATE:
2395		case O_PROTO:
2396		case O_IP_SRC_ME:
2397		case O_IP_DST_ME:
2398		case O_LAYER2:
2399		case O_IN:
2400		case O_FRAG:
2401		case O_IPOPT:
2402		case O_IPTOS:
2403		case O_IPPRECEDENCE:
2404		case O_IPVER:
2405		case O_TCPWIN:
2406		case O_TCPFLAGS:
2407		case O_TCPOPTS:
2408		case O_ESTAB:
2409		case O_VERREVPATH:
2410		case O_IPSEC:
2411			if (cmdlen != F_INSN_SIZE(ipfw_insn))
2412				goto bad_size;
2413			break;
2414
2415		case O_UID:
2416		case O_GID:
2417		case O_IP_SRC:
2418		case O_IP_DST:
2419		case O_TCPSEQ:
2420		case O_TCPACK:
2421		case O_PROB:
2422		case O_ICMPTYPE:
2423			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
2424				goto bad_size;
2425			break;
2426
2427		case O_LIMIT:
2428			if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
2429				goto bad_size;
2430			break;
2431
2432		case O_LOG:
2433			if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
2434				goto bad_size;
2435
2436			((ipfw_insn_log *)cmd)->log_left =
2437			    ((ipfw_insn_log *)cmd)->max_log;
2438
2439			break;
2440
2441		case O_IP_SRC_MASK:
2442		case O_IP_DST_MASK:
2443			if (cmdlen != F_INSN_SIZE(ipfw_insn_ip))
2444				goto bad_size;
2445			if (((ipfw_insn_ip *)cmd)->mask.s_addr == 0) {
2446				printf("ipfw: opcode %d, useless rule\n",
2447					cmd->opcode);
2448				return EINVAL;
2449			}
2450			break;
2451
2452		case O_IP_SRC_SET:
2453		case O_IP_DST_SET:
2454			if (cmd->arg1 == 0 || cmd->arg1 > 256) {
2455				printf("ipfw: invalid set size %d\n",
2456					cmd->arg1);
2457				return EINVAL;
2458			}
2459			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
2460			    (cmd->arg1+31)/32 )
2461				goto bad_size;
2462			break;
2463
2464		case O_MACADDR2:
2465			if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
2466				goto bad_size;
2467			break;
2468
2469		case O_IPID:
2470		case O_IPTTL:
2471		case O_IPLEN:
2472			if (cmdlen < 1 || cmdlen > 31)
2473				goto bad_size;
2474			break;
2475
2476		case O_MAC_TYPE:
2477		case O_IP_SRCPORT:
2478		case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
2479			if (cmdlen < 2 || cmdlen > 31)
2480				goto bad_size;
2481			break;
2482
2483		case O_RECV:
2484		case O_XMIT:
2485		case O_VIA:
2486			if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
2487				goto bad_size;
2488			break;
2489
2490		case O_PIPE:
2491		case O_QUEUE:
2492			if (cmdlen != F_INSN_SIZE(ipfw_insn_pipe))
2493				goto bad_size;
2494			goto check_action;
2495
2496		case O_FORWARD_IP:
2497			if (cmdlen != F_INSN_SIZE(ipfw_insn_sa))
2498				goto bad_size;
2499			goto check_action;
2500
2501		case O_FORWARD_MAC: /* XXX not implemented yet */
2502		case O_CHECK_STATE:
2503		case O_COUNT:
2504		case O_ACCEPT:
2505		case O_DENY:
2506		case O_REJECT:
2507		case O_SKIPTO:
2508		case O_DIVERT:
2509		case O_TEE:
2510			if (cmdlen != F_INSN_SIZE(ipfw_insn))
2511				goto bad_size;
2512check_action:
2513			if (have_action) {
2514				printf("ipfw: opcode %d, multiple actions"
2515					" not allowed\n",
2516					cmd->opcode);
2517				return EINVAL;
2518			}
2519			have_action = 1;
2520			if (l != cmdlen) {
2521				printf("ipfw: opcode %d, action must be"
2522					" last opcode\n",
2523					cmd->opcode);
2524				return EINVAL;
2525			}
2526			break;
2527		default:
2528			printf("ipfw: opcode %d, unknown opcode\n",
2529				cmd->opcode);
2530			return EINVAL;
2531		}
2532	}
2533	if (have_action == 0) {
2534		printf("ipfw: missing action\n");
2535		return EINVAL;
2536	}
2537	return 0;
2538
2539bad_size:
2540	printf("ipfw: opcode %d size %d wrong\n",
2541		cmd->opcode, cmdlen);
2542	return EINVAL;
2543}
2544
2545
2546/**
2547 * {set|get}sockopt parser.
2548 */
2549static int
2550ipfw_ctl(struct sockopt *sopt)
2551{
2552	int error, s, rulenum;
2553	size_t size;
2554	struct ip_fw *bp , *buf, *rule;
2555
2556	static u_int32_t rule_buf[255];	/* we copy the data here */
2557
2558	/*
2559	 * Disallow modifications in really-really secure mode, but still allow
2560	 * the logging counters to be reset.
2561	 */
2562	if (sopt->sopt_name == IP_FW_ADD ||
2563	    (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) {
2564#if __FreeBSD_version >= 500034
2565		error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
2566		if (error)
2567			return (error);
2568#else /* FreeBSD 4.x */
2569		if (securelevel >= 3)
2570			return (EPERM);
2571#endif
2572	}
2573
2574	error = 0;
2575
2576	switch (sopt->sopt_name) {
2577	case IP_FW_GET:
2578		/*
2579		 * pass up a copy of the current rules. Static rules
2580		 * come first (the last of which has number IPFW_DEFAULT_RULE),
2581		 * followed by a possibly empty list of dynamic rule.
2582		 * The last dynamic rule has NULL in the "next" field.
2583		 */
2584		s = splimp();
2585		size = static_len;	/* size of static rules */
2586		if (ipfw_dyn_v)		/* add size of dyn.rules */
2587			size += (dyn_count * sizeof(ipfw_dyn_rule));
2588
2589		/*
2590		 * XXX todo: if the user passes a short length just to know
2591		 * how much room is needed, do not bother filling up the
2592		 * buffer, just jump to the sooptcopyout.
2593		 */
2594		buf = malloc(size, M_TEMP, M_WAITOK);
2595		if (buf == 0) {
2596			splx(s);
2597			error = ENOBUFS;
2598			break;
2599		}
2600
2601		bp = buf;
2602		for (rule = layer3_chain; rule ; rule = rule->next) {
2603			int i = RULESIZE(rule);
2604			bcopy(rule, bp, i);
2605			bcopy(&set_disable, &(bp->next_rule),
2606			    sizeof(set_disable));
2607			bp = (struct ip_fw *)((char *)bp + i);
2608		}
2609		if (ipfw_dyn_v) {
2610			int i;
2611			ipfw_dyn_rule *p, *dst, *last = NULL;
2612
2613			dst = (ipfw_dyn_rule *)bp;
2614			for (i = 0 ; i < curr_dyn_buckets ; i++ )
2615				for ( p = ipfw_dyn_v[i] ; p != NULL ;
2616				    p = p->next, dst++ ) {
2617					bcopy(p, dst, sizeof *p);
2618					bcopy(&(p->rule->rulenum), &(dst->rule),
2619					    sizeof(p->rule->rulenum));
2620					/*
2621					 * store a non-null value in "next".
2622					 * The userland code will interpret a
2623					 * NULL here as a marker
2624					 * for the last dynamic rule.
2625					 */
2626					bcopy(&dst, &dst->next, sizeof(dst));
2627					last = dst ;
2628					dst->expire =
2629					    TIME_LEQ(dst->expire, time_second) ?
2630						0 : dst->expire - time_second ;
2631				}
2632			if (last != NULL) /* mark last dynamic rule */
2633				bzero(&last->next, sizeof(last));
2634		}
2635		splx(s);
2636
2637		error = sooptcopyout(sopt, buf, size);
2638		free(buf, M_TEMP);
2639		break;
2640
2641	case IP_FW_FLUSH:
2642		/*
2643		 * Normally we cannot release the lock on each iteration.
2644		 * We could do it here only because we start from the head all
2645		 * the times so there is no risk of missing some entries.
2646		 * On the other hand, the risk is that we end up with
2647		 * a very inconsistent ruleset, so better keep the lock
2648		 * around the whole cycle.
2649		 *
2650		 * XXX this code can be improved by resetting the head of
2651		 * the list to point to the default rule, and then freeing
2652		 * the old list without the need for a lock.
2653		 */
2654
2655		s = splimp();
2656		free_chain(&layer3_chain, 0 /* keep default rule */);
2657		splx(s);
2658		break;
2659
2660	case IP_FW_ADD:
2661		rule = (struct ip_fw *)rule_buf; /* XXX do a malloc */
2662		error = sooptcopyin(sopt, rule, sizeof(rule_buf),
2663			sizeof(struct ip_fw) );
2664		size = sopt->sopt_valsize;
2665		if (error || (error = check_ipfw_struct(rule, size)))
2666			break;
2667
2668		error = add_rule(&layer3_chain, rule);
2669		size = RULESIZE(rule);
2670		if (!error && sopt->sopt_dir == SOPT_GET)
2671			error = sooptcopyout(sopt, rule, size);
2672		break;
2673
2674	case IP_FW_DEL:
2675		/*
2676		 * IP_FW_DEL is used for deleting single rules or sets,
2677		 * and (ab)used to atomically manipulate sets. Argument size
2678		 * is used to distinguish between the two:
2679		 *    sizeof(u_int32_t)
2680		 *	delete single rule or set of rules,
2681		 *	or reassign rules (or sets) to a different set.
2682		 *    2*sizeof(u_int32_t)
2683		 *	atomic disable/enable sets.
2684		 *	first u_int32_t contains sets to be disabled,
2685		 *	second u_int32_t contains sets to be enabled.
2686		 */
2687		error = sooptcopyin(sopt, rule_buf,
2688			2*sizeof(u_int32_t), sizeof(u_int32_t));
2689		if (error)
2690			break;
2691		size = sopt->sopt_valsize;
2692		if (size == sizeof(u_int32_t))	/* delete or reassign */
2693			error = del_entry(&layer3_chain, rule_buf[0]);
2694		else if (size == 2*sizeof(u_int32_t)) /* set enable/disable */
2695			set_disable =
2696			    (set_disable | rule_buf[0]) & ~rule_buf[1] &
2697			    ~(1<<31); /* set 31 always enabled */
2698		else
2699			error = EINVAL;
2700		break;
2701
2702	case IP_FW_ZERO:
2703	case IP_FW_RESETLOG: /* argument is an int, the rule number */
2704		rulenum=0;
2705
2706		if (sopt->sopt_val != 0) {
2707		    error = sooptcopyin(sopt, &rulenum,
2708			    sizeof(int), sizeof(int));
2709		    if (error)
2710			break;
2711		}
2712		error = zero_entry(rulenum, sopt->sopt_name == IP_FW_RESETLOG);
2713		break;
2714
2715	default:
2716		printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name);
2717		error = EINVAL;
2718	}
2719
2720	return (error);
2721}
2722
/**
 * dummynet needs a reference to the default rule, because rules can be
 * deleted while packets hold a reference to them. When this happens,
 * dummynet changes the reference to the default rule (it could well be a
 * NULL pointer, but this way we do not need to check for the special
 * case, plus here we have info on the default behaviour).
 */
struct ip_fw *ip_fw_default_rule;
2731
2732/*
2733 * This procedure is only used to handle keepalives. It is invoked
2734 * every dyn_keepalive_period
2735 */
2736static void
2737ipfw_tick(void * __unused unused)
2738{
2739	int i;
2740	int s;
2741	ipfw_dyn_rule *q;
2742
2743	if (dyn_keepalive == 0 || ipfw_dyn_v == NULL || dyn_count == 0)
2744		goto done;
2745
2746	s = splimp();
2747	for (i = 0 ; i < curr_dyn_buckets ; i++) {
2748		for (q = ipfw_dyn_v[i] ; q ; q = q->next ) {
2749			if (q->dyn_type == O_LIMIT_PARENT)
2750				continue;
2751			if (q->id.proto != IPPROTO_TCP)
2752				continue;
2753			if ( (q->state & BOTH_SYN) != BOTH_SYN)
2754				continue;
2755			if (TIME_LEQ( time_second+dyn_keepalive_interval,
2756			    q->expire))
2757				continue;	/* too early */
2758			if (TIME_LEQ(q->expire, time_second))
2759				continue;	/* too late, rule expired */
2760
2761			send_pkt(&(q->id), q->ack_rev - 1, q->ack_fwd, TH_SYN);
2762			send_pkt(&(q->id), q->ack_fwd - 1, q->ack_rev, 0);
2763		}
2764	}
2765	splx(s);
2766done:
2767	ipfw_timeout_h = timeout(ipfw_tick, NULL, dyn_keepalive_period*hz);
2768}
2769
2770static void
2771ipfw_init(void)
2772{
2773	struct ip_fw default_rule;
2774
2775	ip_fw_chk_ptr = ipfw_chk;
2776	ip_fw_ctl_ptr = ipfw_ctl;
2777	layer3_chain = NULL;
2778
2779	bzero(&default_rule, sizeof default_rule);
2780
2781	default_rule.act_ofs = 0;
2782	default_rule.rulenum = IPFW_DEFAULT_RULE;
2783	default_rule.cmd_len = 1;
2784	default_rule.set = 31;
2785
2786	default_rule.cmd[0].len = 1;
2787	default_rule.cmd[0].opcode =
2788#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
2789				1 ? O_ACCEPT :
2790#endif
2791				O_DENY;
2792
2793	add_rule(&layer3_chain, &default_rule);
2794
2795	ip_fw_default_rule = layer3_chain;
2796	printf("ipfw2 initialized, divert %s, "
2797		"rule-based forwarding enabled, default to %s, logging ",
2798#ifdef IPDIVERT
2799		"enabled",
2800#else
2801		"disabled",
2802#endif
2803		default_rule.cmd[0].opcode == O_ACCEPT ? "accept" : "deny");
2804
2805#ifdef IPFIREWALL_VERBOSE
2806	fw_verbose = 1;
2807#endif
2808#ifdef IPFIREWALL_VERBOSE_LIMIT
2809	verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
2810#endif
2811	if (fw_verbose == 0)
2812		printf("disabled\n");
2813	else if (verbose_limit == 0)
2814		printf("unlimited\n");
2815	else
2816		printf("limited to %d packets/entry by default\n",
2817		    verbose_limit);
2818	bzero(&ipfw_timeout_h, sizeof(struct callout_handle));
2819	ipfw_timeout_h = timeout(ipfw_tick, NULL, hz);
2820}
2821
2822static int
2823ipfw_modevent(module_t mod, int type, void *unused)
2824{
2825	int s;
2826	int err = 0;
2827
2828	switch (type) {
2829	case MOD_LOAD:
2830		s = splimp();
2831		if (IPFW_LOADED) {
2832			splx(s);
2833			printf("IP firewall already loaded\n");
2834			err = EEXIST;
2835		} else {
2836			ipfw_init();
2837			splx(s);
2838		}
2839		break;
2840
2841	case MOD_UNLOAD:
2842#if !defined(KLD_MODULE)
2843		printf("ipfw statically compiled, cannot unload\n");
2844		err = EBUSY;
2845#else
2846                s = splimp();
2847		untimeout(ipfw_tick, NULL, ipfw_timeout_h);
2848		ip_fw_chk_ptr = NULL;
2849		ip_fw_ctl_ptr = NULL;
2850		free_chain(&layer3_chain, 1 /* kill default rule */);
2851		splx(s);
2852		printf("IP firewall unloaded\n");
2853#endif
2854		break;
2855	default:
2856		break;
2857	}
2858	return err;
2859}
2860
/*
 * Module glue: describe the "ipfw" module, whose events are handled by
 * ipfw_modevent(), and declare it together with its version number.
 */
static moduledata_t ipfwmod = {
	"ipfw",
	ipfw_modevent,
	0
};
DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(ipfw, 1);
2868#endif /* IPFW2 */
2869